@microsoft/m365-copilot-eval 1.0.1-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/LICENSE +21 -0
  2. package/README.md +415 -0
  3. package/TERMS.txt +65 -0
  4. package/package.json +82 -0
  5. package/src/clients/cli/auth/__init__.py +1 -0
  6. package/src/clients/cli/auth/auth_handler.py +262 -0
  7. package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +136 -0
  8. package/src/clients/cli/custom_evaluators/ConcisenessNonLLMEvaluator.py +18 -0
  9. package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +25 -0
  10. package/src/clients/cli/custom_evaluators/PII/PII.py +45 -0
  11. package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +39 -0
  12. package/src/clients/cli/custom_evaluators/__init__.py +1 -0
  13. package/src/clients/cli/demo_usage.py +83 -0
  14. package/src/clients/cli/generate_report.py +251 -0
  15. package/src/clients/cli/main.py +766 -0
  16. package/src/clients/cli/readme.md +301 -0
  17. package/src/clients/cli/requirements.txt +10 -0
  18. package/src/clients/cli/response_extractor.py +589 -0
  19. package/src/clients/cli/samples/PartnerSuccess.json +122 -0
  20. package/src/clients/cli/samples/example_prompts.json +14 -0
  21. package/src/clients/cli/samples/example_prompts_alt.json +12 -0
  22. package/src/clients/cli/samples/prompts_ambiguity.json +22 -0
  23. package/src/clients/cli/samples/prompts_rag_grounding.json +22 -0
  24. package/src/clients/cli/samples/prompts_security_injection.json +22 -0
  25. package/src/clients/cli/samples/prompts_tool_use_negatives.json +22 -0
  26. package/src/clients/cli/samples/psaSample.json +18 -0
  27. package/src/clients/cli/samples/starter.json +10 -0
  28. package/src/clients/node-js/bin/runevals.js +505 -0
  29. package/src/clients/node-js/config/default.js +25 -0
  30. package/src/clients/node-js/lib/cache-utils.js +119 -0
  31. package/src/clients/node-js/lib/expiry-check.js +164 -0
  32. package/src/clients/node-js/lib/index.js +25 -0
  33. package/src/clients/node-js/lib/python-runtime.js +253 -0
  34. package/src/clients/node-js/lib/venv-manager.js +242 -0
package/src/clients/cli/generate_report.py
@@ -0,0 +1,251 @@
+ import json
+ import markdown
+ import pandas as pd
+ from pathlib import Path
+
+ def calculate_aggregate_statistics(results):
+     """Calculate aggregate statistics across all evaluation results."""
+     if not results:
+         return {}
+
+     # Extract all metrics from the first result to know what metrics we have
+     first_result = results[0]
+     metrics = first_result.get('results', {})
+
+     aggregates = {}
+
+     for metric_key in metrics.keys():
+         if not metric_key.endswith('_score'):
+             continue
+
+         metric_name = metric_key[:-6]  # Remove '_score' suffix
+         metric_display_name = metric_name.replace('_', ' ').title()
+
+         scores = []
+         pass_count = 0
+         fail_count = 0
+         threshold_value = None
+
+         for result in results:
+             metric_data = result.get('results', {}).get(metric_key)
+             if metric_data:
+                 try:
+                     # Parse the JSON string to get the actual data
+                     parsed_data = json.loads(metric_data) if isinstance(metric_data, str) else metric_data
+
+                     # Extract score, result, and threshold
+                     score = parsed_data.get('score')
+                     if score is None:
+                         score = parsed_data.get(metric_name)
+                     if score is None:
+                         score = parsed_data.get(f'{metric_name}_score')
+
+                     result_status = parsed_data.get('result')
+                     if result_status is None:
+                         result_status = parsed_data.get(f'{metric_name}_result')
+
+                     threshold = parsed_data.get('threshold')
+                     if threshold is None:
+                         threshold = parsed_data.get(f'{metric_name}_threshold')
+
+                     if score is not None:
+                         scores.append(float(score))
+
+                     if result_status:
+                         if str(result_status).lower() == 'pass':
+                             pass_count += 1
+                         elif str(result_status).lower() == 'fail':
+                             fail_count += 1
+
+                     # Capture threshold (should be consistent across all results)
+                     if threshold is not None and threshold_value is None:
+                         threshold_value = threshold
+
+                 except (json.JSONDecodeError, ValueError, TypeError):
+                     continue
+
+         if scores:
+             avg_score = sum(scores) / len(scores)
+             total_evaluated = pass_count + fail_count
+             pass_rate = (pass_count / total_evaluated * 100) if total_evaluated > 0 else 0
+
+             aggregates[metric_display_name] = {
+                 'total_prompts': len(results),
+                 'total_evaluated': total_evaluated,
+                 'pass_count': pass_count,
+                 'fail_count': fail_count,
+                 'pass_rate': pass_rate,
+                 'avg_score': avg_score,
+                 'threshold': threshold_value,
+                 'scores': scores
+             }
+
+     return aggregates
+
+ def parse_score(score_str):
+     """Parse a JSON-encoded metric payload, returning {} if it cannot be decoded."""
+     try:
+         return json.loads(score_str)
+     except Exception:
+         return {}
+
+ def format_score(score):
+     """Format a numeric score without trailing zeros; return non-numeric values unchanged."""
+     try:
+         val = float(score)
+     except (TypeError, ValueError):
+         return score
+     if val.is_integer():
+         return str(int(val))
+     s = f"{val:.3f}".rstrip('0').rstrip('.')
+     return s or "0"
+
+ def extract_metric_rows(entry):
+     """
+     Build generic metric rows from any `*_score` keys on an entry.
+     Each row has: Metric, Result, Score, Threshold, Reason.
+     Supports metrics under entry['results'] and falls back to top-level for backward compatibility.
+     """
+     rows = []
+
+     def pick(d, candidates):
+         for k in candidates:
+             if k in d and d[k] not in (None, ''):
+                 return d[k]
+         return ''
+
+     def iter_score_fields(e):
+         container = e.get('results') if isinstance(e, dict) else None
+         if isinstance(container, dict):
+             for k, v in container.items():
+                 if isinstance(k, str) and k.endswith('_score'):
+                     yield k, v
+             return
+         # fallback to top-level flat structure
+         for k, v in e.items():
+             if isinstance(k, str) and k.endswith('_score'):
+                 yield k, v
+
+     for key, raw in iter_score_fields(entry):
+         metric_id = key[:-6]  # strip "_score"
+         metric_obj = parse_score(raw) if isinstance(raw, (str, bytes)) else (raw or {})
+
+         display_name = metric_id.replace('_', ' ').title()
+
+         # Candidate key patterns inside the parsed metric object
+         score_val = pick(metric_obj, [metric_id, f'{metric_id}_score', 'score', 'value'])
+         result_val = pick(metric_obj, [f'{metric_id}_result', 'result', 'status'])
+         threshold_val = pick(metric_obj, [f'{metric_id}_threshold', 'threshold', 'min_threshold', 'expected'])
+         reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason', 'rationale', 'explanation'])
+
+         rows.append({
+             'Metric': display_name,
+             'Result': str(result_val).lower() if isinstance(result_val, str) else result_val,
+             'Score': format_score(score_val),
+             'Threshold': format_score(threshold_val),
+             'Reason': reason_val
+         })
+     return rows
+
+ def generate_html_report(results):
+     html = [
+         '<!DOCTYPE html>',
+         '<html lang="en">',
+         '<head>',
+         ' <meta charset="UTF-8">',
+         ' <title>M365 Copilot Agents Evaluation Scores Report</title>',
+         ' <style>',
+         ' body { font-family: Arial, sans-serif; margin: 2em; padding: 1.5em; background: #fafafa; }',
+         ' h1 { margin-top: 0; color: #2c3e50; }',
+         ' h2 { color: #34495e; margin-top: 2em; }',
+         ' table { border-collapse: collapse; width: 100%; margin: 1.5em 0; }',
+         ' th, td { border: 1px solid #ccc; padding: 10px 12px; text-align: left; vertical-align: top; }',
+         ' th { background: #f4f4f4; font-weight: 600; }',
+         ' details { padding: 0.75em 1em; border: 1px solid #ddd; border-radius: 6px; background: #fff; }',
+         ' details summary { cursor: pointer; font-weight: 600; margin: -0.75em -1em 0.75em; padding: 0.75em 1em; background: #eef2f5; border-bottom: 1px solid #ddd; }',
+         ' .pass { background: #d4edda; color: #155724; }',
+         ' .fail { background: #f8d7da; color: #721c24; }',
+         ' .score-details { font-size: 0.95em; color: #333; background: #f9f9f9; }',
+         ' .score-details table { margin-top: 0.5em !important; }',
+         ' .score-details th { background: #e9ecef; }',
+         ' .aggregate-section { margin: 2em 0; }',
+         ' .aggregate-section h2 { color: #34495e; margin-top: 0; }',
+         ' .pass-rate-excellent { background: #d1f2eb; color: #186a3b; font-weight: bold; }',
+         ' .pass-rate-good { background: #fef9e7; color: #7d6608; font-weight: bold; }',
+         ' .pass-rate-poor { background: #fadbd8; color: #b03a2e; font-weight: bold; }',
+         ' .individual-results { margin-top: 3em; }',
+         ' </style>',
+         '</head>',
+         '<body>',
+         ' <h1>M365 Copilot Agents Evaluation Report</h1>',
+     ]
+
+     # Add aggregate statistics if multiple results
+     if len(results) > 1:
+         aggregates = calculate_aggregate_statistics(results)
+         if aggregates:
+             html.append('<div class="aggregate-section">')
+             html.append(f'<h2>Aggregate Statistics ({len(results)} prompts evaluated)</h2>')
+
+             # Create aggregate table with same style as individual results
+             html.append('<table>')
+             html.append('<tr><th>Metric</th><th>Pass Rate</th><th>Passed</th><th>Failed</th><th>Avg Score</th><th>Threshold</th></tr>')
+
+             for metric_name, stats in aggregates.items():
+                 pass_rate_class = 'pass-rate-excellent' if stats['pass_rate'] >= 80 else 'pass-rate-good' if stats['pass_rate'] >= 60 else 'pass-rate-poor'
+                 # 'threshold' is always present in stats but may be None; show 'N/A' in that case
+                 threshold_display = stats['threshold'] if stats['threshold'] is not None else 'N/A'
+                 html.append(f'''
+                 <tr>
+                     <td><strong>{metric_name}</strong></td>
+                     <td class="{pass_rate_class}">{stats['pass_rate']:.1f}%</td>
+                     <td class="pass">{stats['pass_count']}</td>
+                     <td class="fail">{stats['fail_count']}</td>
+                     <td>{stats['avg_score']:.2f}</td>
+                     <td>{threshold_display}</td>
+                 </tr>
+                 ''')
+
+             html.append('</table>')
+             html.append('</div>')
+
+     # Individual results section
+     html.append('<div class="individual-results">')
+     html.append('<h2>Individual Results</h2>')
+
+     for idx, entry in enumerate(results, 1):
+         html.append(f'<h3>Prompt {idx}: {entry.get("prompt", "")}</h3>')
+         html.append('<table>')
+         html.append('<tr><th>Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("response", ""))))
+         html.append('<tr><th>Expected Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("expected_response", ""))))
+         html.append('</table>')
+
+         score_rows = extract_metric_rows(entry)
+         score_df = pd.DataFrame(score_rows)
+
+         def highlight_result(val):
+             lv = str(val).lower()
+             if lv == 'pass':
+                 return 'background-color: #d4edda; color: #155724;'
+             elif lv == 'fail':
+                 return 'background-color: #f8d7da; color: #721c24;'
+             return ''
+
+         score_html = (
+             score_df.style
+             .map(highlight_result, subset=['Result'])
+             .set_table_attributes('style="margin-top:1em;"')
+             .hide(axis="index")
+             .to_html()
+         )
+
+         html.append('<details open class="score-details"><summary>Show/Hide Evaluation Metric Scores</summary>')
+         html.append(score_html)
+         html.append('</details>')
+
+         html.append(f'<p><em>Entry {idx} of {len(results)}</em></p>')
+         if idx < len(results):  # Don't add HR after last item
+             html.append('<hr>')
+
+     html.append('</div>')  # Close individual-results
+     # Footer goes before the closing tags so it stays inside the document body
+     html.append('<p><small>Generated by M365 Copilot Agents Evaluation CLI.</small></p>')
+     html.append('</body></html>')
+
+     return '\n'.join(html)
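
The diff does not show how generate_report.py is invoked (main.py presumably wires it up), but the parsing code implies the input shape: a list of entries, each with "prompt", "response", "expected_response", and a "results" dict whose "*_score" values are JSON-encoded payloads carrying score/result/threshold/reason. A minimal driver sketch under that assumption; the make_entry helper, the sample prompts, and the report.html file name are illustrative, not part of the package:

    # Hypothetical driver -- only the field names inferred from the parsing code
    # above are assumed; everything else here is made up for illustration.
    import json

    from generate_report import generate_html_report

    def make_entry(prompt, response, score, status):
        return {
            "prompt": prompt,
            "response": response,
            "expected_response": "Paris",
            "results": {
                # A JSON-encoded metric payload, as unpacked by extract_metric_rows()
                "similarity_score": json.dumps({
                    "score": score,
                    "result": status,
                    "threshold": 3.0,
                    "reason": "Graded against the expected answer.",
                }),
            },
        }

    # Two entries, so the aggregate table renders (it is gated on len(results) > 1)
    results = [
        make_entry("What is the capital of France?", "Paris.", 5.0, "pass"),
        make_entry("Name France's capital city.", "Lyon.", 1.0, "fail"),
    ]

    with open("report.html", "w", encoding="utf-8") as f:
        f.write(generate_html_report(results))

Note that the styled metric tables rely on Styler.map and Styler.hide, so this appears to need a reasonably recent pandas (Styler.map arrived in 2.1), along with the markdown package pinned in requirements.txt.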