@microsoft/m365-copilot-eval 1.2.0-preview.1 → 1.3.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
+ """Evaluator resolution module for per-prompt evaluator configuration.
+
+ Resolves which evaluators to run on each prompt by merging prompt-level config
+ with file-level defaults and system defaults, following extend/replace modes.
+ """
+
+ import difflib
+ import logging
+ from typing import Any, Dict, Optional, Tuple
+
+ from common import (
+     RELEVANCE,
+     COHERENCE,
+     GROUNDEDNESS,
+     TOOL_CALL_ACCURACY,
+     CITATIONS,
+     EXACT_MATCH,
+     PARTIAL_MATCH,
+     REQUIRES_AZURE_OPENAI,
+     REQUIRES_TOOL_DEFINITIONS,
+     SYSTEM_DEFAULT_EVALUATORS,
+     RegistryEntry,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ # Static registry of available evaluators per data-model.md
+ EVALUATOR_REGISTRY: Dict[str, RegistryEntry] = {
+     RELEVANCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
+     COHERENCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
+     GROUNDEDNESS: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
+     TOOL_CALL_ACCURACY: RegistryEntry(type="tool", requires=[REQUIRES_AZURE_OPENAI, REQUIRES_TOOL_DEFINITIONS], default_threshold=3),
+     CITATIONS: RegistryEntry(type="non-llm", requires=[], default_threshold=1),
+     EXACT_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=None),
+     PARTIAL_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=0.5),
+ }
+
+
+ def validate_evaluator_names(evaluator_map: Dict[str, Any]) -> None:
+     """Validate that all evaluator names in the map exist in the registry.
+
+     Raises ValueError with categorized valid names and
+     'Did you mean?' suggestions for close matches.
+     """
+     invalid_names = [name for name in evaluator_map if name not in EVALUATOR_REGISTRY]
+     if not invalid_names:
+         return
+
+     # Categorize valid evaluators for the error message
+     llm_evals = [n for n, r in EVALUATOR_REGISTRY.items() if r.type == "llm"]
+     tool_evals = [n for n, r in EVALUATOR_REGISTRY.items() if r.type == "tool"]
+     non_llm_evals = [n for n, r in EVALUATOR_REGISTRY.items() if r.type == "non-llm"]
+
+     lines = []
+     for name in invalid_names:
+         lines.append(f'Unknown evaluator "{name}".')
+         close = difflib.get_close_matches(name, EVALUATOR_REGISTRY.keys(), n=1, cutoff=0.5)
+         if close:
+             lines.append(f'Did you mean "{close[0]}"?')
+
+     lines.append("")
+     lines.append("Valid evaluators are:")
+     lines.append(f" - {', '.join(llm_evals)} (LLM-based)")
+     lines.append(f" - {', '.join(tool_evals)} (tool evaluation)")
+     lines.append(f" - {', '.join(non_llm_evals)} (non-LLM)")
+
+     raise ValueError("\n".join(lines))
+
+
+ def check_prerequisites(
+     evaluator_name: str,
+     available_context: Dict[str, bool],
+ ) -> Tuple[bool, Optional[str]]:
+     """Check if prerequisites for an evaluator are available.
+
+     Returns (True, None) if all prerequisites are met, or
+     (False, warning_message) if a prerequisite is missing.
+     """
+     registry_entry = EVALUATOR_REGISTRY.get(evaluator_name)
+     if not registry_entry:
+         return False, f"Unknown evaluator: {evaluator_name}"
+
+     for req in registry_entry.requires:
+         if not available_context.get(req, False):
+             msg = (
+                 f"Skipping evaluator '{evaluator_name}': "
+                 f"missing prerequisite '{req}'"
+             )
+             return False, msg
+
+     return True, None
+
+
+ def resolve_default_evaluators(file_defaults: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+     """Resolve effective default evaluators, falling back to system defaults.
+
+     Precedence: file-level defaults > system defaults.
+     An explicit empty dict means "no default evaluators".
+     """
+     # File-level defaults (including explicit empty dict)
+     if file_defaults is not None:
+         return file_defaults
+
+     # System defaults
+     return {name: {} for name in SYSTEM_DEFAULT_EVALUATORS}
+
+
+ def resolve_evaluators_for_prompt(
+     prompt_evaluators: Optional[Dict[str, Any]],
+     evaluators_mode: str,
+     prompt: str,
+     default_evaluators: Dict[str, Any],
+ ) -> Dict[str, Any]:
+     """Resolve which evaluators to run for a single prompt.
+
+     Args:
+         prompt_evaluators: Per-prompt evaluator config (None if not specified).
+         evaluators_mode: How to combine with defaults ("extend" or "replace").
+         prompt: The prompt text (used in warning messages).
+         default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
+
+     Returns:
+         Resolved EvaluatorMap (dict of evaluator_name -> options).
+     """
+     # No prompt-level config → use defaults
+     if prompt_evaluators is None:
+         return dict(default_evaluators)
+
+     if evaluators_mode == "replace":
+         if not prompt_evaluators:
+             logger.warning(
+                 "Empty evaluators with 'replace' mode for prompt: '%s'. "
+                 "No evaluators will run.",
+                 prompt[:80],
+             )
+         return dict(prompt_evaluators)
+
+     # mode == "extend": merge defaults with prompt overrides (prompt wins on conflict)
+     merged = dict(default_evaluators)
+     merged.update(prompt_evaluators)
+     return merged
+
+
+ def get_evaluator_threshold(evaluator_name: str, options: Dict[str, Any]) -> Optional[float]:
+     """Get the threshold for an evaluator, with option override support."""
+     if "threshold" in options:
+         return options["threshold"]
+     entry = EVALUATOR_REGISTRY.get(evaluator_name)
+     return entry.default_threshold if entry else None
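
For orientation, here is a minimal usage sketch of the resolution helpers added above. It assumes the new file is importable as `evaluator_resolution` (file names are not shown in this diff) and that `GROUNDEDNESS` and `RELEVANCE` from `common` are the evaluator-name strings used as registry keys; the threshold override is illustrative only.

from common import GROUNDEDNESS, RELEVANCE
from evaluator_resolution import (  # module name assumed; not shown in the diff
    resolve_default_evaluators,
    resolve_evaluators_for_prompt,
    validate_evaluator_names,
)

# File-level defaults take precedence over system defaults; an explicit {} would disable defaults.
file_defaults = {GROUNDEDNESS: {"threshold": 4}}
defaults = resolve_default_evaluators(file_defaults)
validate_evaluator_names(defaults)  # raises ValueError with suggestions on unknown names

# Per-prompt config in "extend" mode merges onto the defaults (prompt wins on conflict).
resolved = resolve_evaluators_for_prompt(
    prompt_evaluators={RELEVANCE: {}},
    evaluators_mode="extend",
    prompt="What is our refund policy?",
    default_evaluators=defaults,
)
# resolved == {GROUNDEDNESS: {"threshold": 4}, RELEVANCE: {}}
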
@@ -1,76 +1,75 @@
+ import html as html_module
  import json
  import markdown
+ from common import METRIC_IDS, pascal_case_to_title
  import pandas as pd
  from pathlib import Path
 
  def calculate_aggregate_statistics(results):
-     """Calculate aggregate statistics across all evaluation results."""
+     """Calculate aggregate statistics across all evaluation results.
+
+     Scans ALL results (not just the first) to discover which metrics were used,
+     correctly handling per-prompt evaluator variation. Each metric reports
+     prompts_evaluated (how many prompts it actually ran on) and total_prompts.
+     """
      if not results:
          return {}
-
-     # Extract all metrics from the first result to know what metrics we have
-     first_result = results[0]
-     metrics = first_result.get('results', {})
-
+
+     # Discover all metric keys across all results
+     all_metric_keys = set()
+     for result in results:
+         all_metric_keys.update(result.get('results', {}).keys())
+
      aggregates = {}
-
-     for metric_key in metrics.keys():
-         if not metric_key.endswith('_score'):
-             continue
-
-         metric_name = metric_key[:-6] # Remove '_score' suffix
-         metric_display_name = metric_name.replace('_', ' ').title()
-
+
+     for eval_name in sorted(all_metric_keys):
+         display_name = pascal_case_to_title(eval_name)
+         metric_id = METRIC_IDS.get(eval_name, eval_name.lower())
+
          scores = []
          pass_count = 0
          fail_count = 0
          threshold_value = None
-
+         prompts_evaluated = 0
+
          for result in results:
-             metric_data = result.get('results', {}).get(metric_key)
-             if metric_data:
-                 try:
-                     # Parse the JSON string to get the actual data
-                     parsed_data = json.loads(metric_data) if isinstance(metric_data, str) else metric_data
-
-                     # Extract score, result, and threshold
-                     score = parsed_data.get('score')
-                     if score is None:
-                         score = parsed_data.get(metric_name)
-                     if score is None:
-                         score = parsed_data.get(f'{metric_name}_score')
-
-                     result_status = parsed_data.get('result')
-                     if result_status is None:
-                         result_status = parsed_data.get(f'{metric_name}_result')
-
-                     threshold = parsed_data.get('threshold')
-                     if threshold is None:
-                         threshold = parsed_data.get(f'{metric_name}_threshold')
-
-                     if score is not None:
-                         scores.append(float(score))
-
-                     if result_status:
-                         if str(result_status).lower() == 'pass':
-                             pass_count += 1
-                         elif str(result_status).lower() == 'fail':
-                             fail_count += 1
-
-                     # Capture threshold (should be consistent across all results)
-                     if threshold is not None and threshold_value is None:
-                         threshold_value = threshold
-
-                 except (json.JSONDecodeError, ValueError, TypeError):
-                     continue
-
-         if scores:
-             avg_score = sum(scores) / len(scores)
+             metric_data = result.get('results', {}).get(eval_name)
+             if metric_data is None:
+                 continue # This metric did not run for this prompt
+
+             prompts_evaluated += 1
+             try:
+                 parsed_data = json.loads(metric_data) if isinstance(metric_data, str) else metric_data
+
+                 score = parsed_data.get(metric_id)
+
+                 result_status = parsed_data.get('result')
+
+                 threshold = parsed_data.get('threshold')
+
+                 if score is not None:
+                     scores.append(float(score))
+
+                 if result_status:
+                     if str(result_status).lower() == 'pass':
+                         pass_count += 1
+                     elif str(result_status).lower() == 'fail':
+                         fail_count += 1
+
+                 if threshold is not None and threshold_value is None:
+                     threshold_value = threshold
+
+             except (json.JSONDecodeError, ValueError, TypeError):
+                 continue
+
+         if scores or pass_count > 0 or fail_count > 0:
+             avg_score = sum(scores) / len(scores) if scores else 0
              total_evaluated = pass_count + fail_count
             pass_rate = (pass_count / total_evaluated * 100) if total_evaluated > 0 else 0
-
-             aggregates[metric_display_name] = {
+
+             aggregates[display_name] = {
                  'total_prompts': len(results),
+                 'prompts_evaluated': prompts_evaluated,
                  'total_evaluated': total_evaluated,
                  'pass_count': pass_count,
                  'fail_count': fail_count,
@@ -79,7 +78,7 @@ def calculate_aggregate_statistics(results):
                  'threshold': threshold_value,
                  'scores': scores
              }
-
+
      return aggregates
 
  def parse_score(score_str):
@@ -100,9 +99,9 @@ def format_score(score):
 
  def extract_metric_rows(entry):
      """
-     Build generic metric rows from any `*_score` keys on an entry.
+     Build generic metric rows from evaluation results.
      Each row has: Metric, Result, Score, Threshold, Reason.
-     Supports metrics under entry['results'] and falls back to top-level for backward compatibility.
+     Omits metrics that did not run (None values) for this prompt.
      """
      rows = []
 
@@ -112,29 +111,22 @@ def extract_metric_rows(entry):
                  return d[k]
          return ''
 
-     def iter_score_fields(e):
-         container = e.get('results') if isinstance(e, dict) else None
-         if isinstance(container, dict):
-             for k, v in container.items():
-                 if isinstance(k, str) and k.endswith('_score'):
-                     yield k, v
-             return
-         # fallback to top-level flat structure
-         for k, v in e.items():
-             if isinstance(k, str) and k.endswith('_score'):
-                 yield k, v
-
-     for key, raw in iter_score_fields(entry):
-         metric_id = key[:-6] # strip "_score"
+     results_container = entry.get('results', {}) if isinstance(entry, dict) else {}
+
+     for eval_name, raw in results_container.items():
+         if raw is None:
+             continue # Skip metrics that did not run for this prompt
+
          metric_obj = parse_score(raw) if isinstance(raw, (str, bytes)) else (raw or {})
 
-         display_name = metric_id.replace('_', ' ').title()
+         display_name = pascal_case_to_title(eval_name)
+         metric_id = METRIC_IDS.get(eval_name, eval_name.lower())
 
          # Candidate key patterns inside the parsed metric object
-         score_val = pick(metric_obj, [metric_id, f'{metric_id}_score', 'score', 'value'])
-         result_val = pick(metric_obj, [f'{metric_id}_result', 'result', 'status'])
-         threshold_val = pick(metric_obj, [f'{metric_id}_threshold', 'threshold', 'min_threshold', 'expected'])
-         reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason', 'rationale', 'explanation'])
+         score_val = pick(metric_obj, [metric_id])
+         result_val = pick(metric_obj, ['result'])
+         threshold_val = pick(metric_obj, ['threshold'])
+         reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason'])
 
          rows.append({
              'Metric': display_name,
@@ -145,7 +137,13 @@ def extract_metric_rows(entry):
          })
      return rows
 
- def generate_html_report(results):
+ def _escape(text):
+     """HTML-escape user-controlled content to prevent XSS."""
+     if text is None:
+         return ""
+     return html_module.escape(str(text))
+
+ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=None):
      html = [
          '<!DOCTYPE html>',
          '<html lang="en">',
@@ -172,29 +170,43 @@ def generate_html_report(results):
          ' .pass-rate-good { background: #fef9e7; color: #7d6608; font-weight: bold; }',
          ' .pass-rate-poor { background: #fadbd8; color: #b03a2e; font-weight: bold; }',
          ' .individual-results { margin-top: 3em; }',
+         ' .evaluator-badge { display: inline-block; padding: 2px 8px; margin: 2px; border-radius: 4px; font-size: 0.85em; background: #e8eaf6; color: #283593; }',
          ' </style>',
          '</head>',
          '<body>',
          ' <h1> M365 Copilot Agents Evaluation Report</h1>',
      ]
 
+     # Add metadata section
+     metadata_items = []
+     if agent_name:
+         metadata_items.append(f'<strong>Agent Name:</strong> {_escape(agent_name)}')
+     if agent_id:
+         metadata_items.append(f'<strong>Agent ID:</strong> {_escape(agent_id)}')
+     if cli_version:
+         metadata_items.append(f'<strong>CLI Version:</strong> {_escape(cli_version)}')
+     if metadata_items:
+         html.append(f' <p style="color: #666; font-size: 0.95em;">{" &nbsp;|&nbsp; ".join(metadata_items)}</p>')
+
      # Add aggregate statistics if multiple results
      if len(results) > 1:
          aggregates = calculate_aggregate_statistics(results)
          if aggregates:
              html.append('<div class="aggregate-section">')
              html.append(f'<h2> Aggregate Statistics ({len(results)} prompts evaluated)</h2>')
-
-             # Create aggregate table with same style as individual results
+
              html.append('<table>')
-             html.append('<tr><th>Metric</th><th>Pass Rate</th><th>Passed</th><th>Failed</th><th>Avg Score</th><th>Threshold</th></tr>')
-
+             html.append('<tr><th>Metric</th><th>Prompts</th><th>Pass Rate</th><th>Passed</th><th>Failed</th><th>Avg Score</th><th>Threshold</th></tr>')
+
              for metric_name, stats in aggregates.items():
                  pass_rate_class = 'pass-rate-excellent' if stats['pass_rate'] >= 80 else 'pass-rate-good' if stats['pass_rate'] >= 60 else 'pass-rate-poor'
-                 threshold_display = stats.get('threshold', 'N/A')
+                 threshold_display = _escape(str(stats.get('threshold', 'N/A')))
+                 prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                 total_prompts = stats.get('total_prompts', len(results))
                  html.append(f'''
                      <tr>
-                         <td><strong>{metric_name}</strong></td>
+                         <td><strong>{_escape(metric_name)}</strong></td>
+                         <td>{prompts_evaluated}/{total_prompts}</td>
                          <td class="{pass_rate_class}">{stats['pass_rate']:.1f}%</td>
                          <td class="pass">{stats['pass_count']}</td>
                          <td class="fail">{stats['fail_count']}</td>
202
214
  <td>{threshold_display}</td>
203
215
  </tr>
204
216
  ''')
205
-
217
+
206
218
  html.append('</table>')
207
219
  html.append('</div>')
208
220
 
@@ -211,34 +223,42 @@ def generate_html_report(results):
211
223
  html.append('<h2> Individual Results</h2>')
212
224
 
213
225
  for idx, entry in enumerate(results, 1):
214
- html.append(f'<h3>Prompt {idx}: {entry.get("prompt", "")}</h3>')
226
+ html.append(f'<h3>Prompt {idx}: {_escape(entry.get("prompt", ""))}</h3>')
227
+
228
+ # Show evaluator badges for this prompt
229
+ evaluators_ran = entry.get('evaluators_ran', [])
230
+ if evaluators_ran:
231
+ badges = ''.join(f'<span class="evaluator-badge">{_escape(e)}</span>' for e in evaluators_ran)
232
+ html.append(f'<p>Evaluators: {badges}</p>')
233
+
215
234
  html.append('<table>')
216
- html.append('<tr><th>Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("response", ""))))
217
- html.append('<tr><th>Expected Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("expected_response", ""))))
235
+ html.append('<tr><th>Response</th><td>{}</td></tr>'.format(markdown.markdown(_escape(entry.get("response", "")))))
236
+ html.append('<tr><th>Expected Response</th><td>{}</td></tr>'.format(markdown.markdown(_escape(entry.get("expected_response", "")))))
218
237
  html.append('</table>')
219
238
 
220
239
  score_rows = extract_metric_rows(entry)
221
- score_df = pd.DataFrame(score_rows)
222
-
223
- def highlight_result(val):
224
- lv = str(val).lower()
225
- if lv == 'pass':
226
- return 'background-color: #d4edda; color: #155724;'
227
- elif lv == 'fail':
228
- return 'background-color: #f8d7da; color: #721c24;'
229
- return ''
230
-
231
- score_html = (
232
- score_df.style
233
- .map(highlight_result, subset=['Result'])
234
- .set_table_attributes('style="margin-top:1em;"')
235
- .hide(axis="index")
236
- .to_html()
237
- )
238
-
239
- html.append('<details open class="score-details"><summary>Show/Hide Evaluation Metric Scores</summary>')
240
- html.append(score_html)
241
- html.append('</details>')
240
+ if score_rows:
241
+ score_df = pd.DataFrame(score_rows)
242
+
243
+ def highlight_result(val):
244
+ lv = str(val).lower()
245
+ if lv == 'pass':
246
+ return 'background-color: #d4edda; color: #155724;'
247
+ elif lv == 'fail':
248
+ return 'background-color: #f8d7da; color: #721c24;'
249
+ return ''
250
+
251
+ score_html = (
252
+ score_df.style
253
+ .map(highlight_result, subset=['Result'])
254
+ .set_table_attributes('style="margin-top:1em;"')
255
+ .hide(axis="index")
256
+ .to_html()
257
+ )
258
+
259
+ html.append('<details open class="score-details"><summary>Show/Hide Evaluation Metric Scores</summary>')
260
+ html.append(score_html)
261
+ html.append('</details>')
242
262
 
243
263
  html.append(f'<p><em>Entry {idx} of {len(results)}</em></p>')
244
264
  if idx < len(results): # Don't add HR after last item
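
The report changes above read a specific per-prompt result shape. A hedged reconstruction of one entry, with field names taken from the code in this diff (the evaluator-name keys and payload values are illustrative; None marks an evaluator that did not run and is now skipped rather than rendered):

# Illustrative only: one evaluation entry as consumed by
# calculate_aggregate_statistics / extract_metric_rows / generate_html_report.
entry = {
    "prompt": "What is our refund policy?",
    "response": "Refunds are processed within 5 business days.",
    "expected_response": "Refunds take up to 5 business days.",
    "evaluators_ran": ["Groundedness"],  # rendered as badges per prompt
    "results": {
        # evaluator name -> JSON string (or dict) carrying the metric id, result, threshold, reason
        "Groundedness": '{"groundedness": 4, "result": "pass", "threshold": 3, "reason": "Grounded in context."}',
        "ExactMatch": None,  # evaluator did not run for this prompt
    },
}

In this sketch, calculate_aggregate_statistics([entry]) and extract_metric_rows(entry) skip the None value, and the new _escape helper is applied to the prompt, responses, and report metadata before they are embedded in the HTML.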