@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.4.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +140 -101
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +8 -0
  4. package/schema/v1/eval-document.schema.json +256 -8
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/comprehensive.json +27 -2
  11. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  12. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  13. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  14. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  15. package/schema/version.json +2 -2
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
  18. package/src/clients/cli/api_clients/REST/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
  20. package/src/clients/cli/api_clients/__init__.py +3 -0
  21. package/src/clients/cli/api_clients/base_agent_client.py +78 -0
  22. package/src/clients/cli/cli_logging/__init__.py +0 -0
  23. package/src/clients/cli/cli_logging/console_diagnostics.py +107 -0
  24. package/src/clients/cli/cli_logging/logging_utils.py +144 -0
  25. package/src/clients/cli/common.py +62 -0
  26. package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
  27. package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
  28. package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
  29. package/src/clients/cli/evaluator_resolver.py +150 -0
  30. package/src/clients/cli/generate_report.py +347 -184
  31. package/src/clients/cli/main.py +1288 -481
  32. package/src/clients/cli/parallel_executor.py +57 -0
  33. package/src/clients/cli/readme.md +14 -7
  34. package/src/clients/cli/requirements.txt +1 -1
  35. package/src/clients/cli/response_extractor.py +30 -14
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +134 -41
  40. package/src/clients/node-js/config/default.js +5 -1
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +11 -16
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
@@ -1,76 +1,82 @@
1
- import json
1
+ import html as html_module
2
2
  import markdown
3
- import pandas as pd
4
- from pathlib import Path
3
+ from common import METRIC_IDS, STATUS_PASS, STATUS_FAIL, STATUS_ERROR, STATUS_PARTIAL, STATUS_UNKNOWN, pascal_case_to_title
4
+ from datetime import datetime, timezone
5
5
 
6
6
  def calculate_aggregate_statistics(results):
7
- """Calculate aggregate statistics across all evaluation results."""
7
+ """Calculate aggregate statistics across all evaluation results.
8
+
9
+ Scans ALL results (not just the first) to discover which metrics were used,
10
+ correctly handling per-prompt evaluator variation. Each metric reports
11
+ prompts_evaluated (how many prompts it actually ran on) and total_prompts.
12
+ """
8
13
  if not results:
9
14
  return {}
10
-
11
- # Extract all metrics from the first result to know what metrics we have
12
- first_result = results[0]
13
- metrics = first_result.get('results', {})
14
-
15
+
16
+ # Flatten: multi-turn threads contribute each turn as a separate item
17
+ flat_results = []
18
+ for result in results:
19
+ if result.get("type") == "multi_turn":
20
+ for turn in result.get("turns", []):
21
+ flat_results.append(turn)
22
+ else:
23
+ flat_results.append(result)
24
+
25
+ # Discover all metric keys across all results
26
+ all_metric_keys = set()
27
+ for result in flat_results:
28
+ all_metric_keys.update(result.get('results', {}).keys())
29
+
15
30
  aggregates = {}
16
-
17
- for metric_key in metrics.keys():
18
- if not metric_key.endswith('_score'):
19
- continue
20
-
21
- metric_name = metric_key[:-6] # Remove '_score' suffix
22
- metric_display_name = metric_name.replace('_', ' ').title()
23
-
31
+
32
+ for eval_name in sorted(all_metric_keys):
33
+ display_name = pascal_case_to_title(eval_name)
34
+ metric_id = METRIC_IDS.get(eval_name, eval_name.lower())
35
+
24
36
  scores = []
25
37
  pass_count = 0
26
38
  fail_count = 0
27
39
  threshold_value = None
28
-
29
- for result in results:
30
- metric_data = result.get('results', {}).get(metric_key)
31
- if metric_data:
32
- try:
33
- # Parse the JSON string to get the actual data
34
- parsed_data = json.loads(metric_data) if isinstance(metric_data, str) else metric_data
35
-
36
- # Extract score, result, and threshold
37
- score = parsed_data.get('score')
38
- if score is None:
39
- score = parsed_data.get(metric_name)
40
- if score is None:
41
- score = parsed_data.get(f'{metric_name}_score')
42
-
43
- result_status = parsed_data.get('result')
44
- if result_status is None:
45
- result_status = parsed_data.get(f'{metric_name}_result')
46
-
47
- threshold = parsed_data.get('threshold')
48
- if threshold is None:
49
- threshold = parsed_data.get(f'{metric_name}_threshold')
50
-
51
- if score is not None:
52
- scores.append(float(score))
53
-
54
- if result_status:
55
- if str(result_status).lower() == 'pass':
56
- pass_count += 1
57
- elif str(result_status).lower() == 'fail':
58
- fail_count += 1
59
-
60
- # Capture threshold (should be consistent across all results)
61
- if threshold is not None and threshold_value is None:
62
- threshold_value = threshold
63
-
64
- except (json.JSONDecodeError, ValueError, TypeError):
65
- continue
66
-
67
- if scores:
68
- avg_score = sum(scores) / len(scores)
40
+ prompts_evaluated = 0
41
+
42
+ for result in flat_results:
43
+ parsed_data = result.get('results', {}).get(eval_name)
44
+ if parsed_data is None:
45
+ continue # This metric did not run for this prompt
46
+ if not isinstance(parsed_data, dict):
47
+ continue
48
+
49
+ prompts_evaluated += 1
50
+ try:
51
+ score = parsed_data.get(metric_id)
52
+
53
+ result_status = parsed_data.get('result')
54
+
55
+ threshold = parsed_data.get('threshold')
56
+
57
+ if score is not None:
58
+ scores.append(float(score))
59
+
60
+ if result_status:
61
+ if str(result_status).lower() == STATUS_PASS:
62
+ pass_count += 1
63
+ elif str(result_status).lower() == STATUS_FAIL:
64
+ fail_count += 1
65
+
66
+ if threshold is not None and threshold_value is None:
67
+ threshold_value = threshold
68
+
69
+ except (ValueError, TypeError):
70
+ continue
71
+
72
+ if scores or pass_count > 0 or fail_count > 0:
73
+ avg_score = sum(scores) / len(scores) if scores else 0
69
74
  total_evaluated = pass_count + fail_count
70
75
  pass_rate = (pass_count / total_evaluated * 100) if total_evaluated > 0 else 0
71
-
72
- aggregates[metric_display_name] = {
73
- 'total_prompts': len(results),
76
+
77
+ aggregates[display_name] = {
78
+ 'total_prompts': len(flat_results),
79
+ 'prompts_evaluated': prompts_evaluated,
74
80
  'total_evaluated': total_evaluated,
75
81
  'pass_count': pass_count,
76
82
  'fail_count': fail_count,
@@ -79,14 +85,8 @@ def calculate_aggregate_statistics(results):
79
85
  'threshold': threshold_value,
80
86
  'scores': scores
81
87
  }
82
-
83
- return aggregates
84
88
 
85
- def parse_score(score_str):
86
- try:
87
- return json.loads(score_str)
88
- except Exception:
89
- return {}
89
+ return aggregates
90
90
 
91
91
  def format_score(score):
92
92
  try:
@@ -100,9 +100,9 @@ def format_score(score):
100
100
 
101
101
  def extract_metric_rows(entry):
102
102
  """
103
- Build generic metric rows from any `*_score` keys on an entry.
103
+ Build generic metric rows from evaluation results.
104
104
  Each row has: Metric, Result, Score, Threshold, Reason.
105
- Supports metrics under entry['results'] and falls back to top-level for backward compatibility.
105
+ Omits metrics that did not run (None values) for this prompt.
106
106
  """
107
107
  rows = []
108
108
 
@@ -112,29 +112,22 @@ def extract_metric_rows(entry):
112
112
  return d[k]
113
113
  return ''
114
114
 
115
- def iter_score_fields(e):
116
- container = e.get('results') if isinstance(e, dict) else None
117
- if isinstance(container, dict):
118
- for k, v in container.items():
119
- if isinstance(k, str) and k.endswith('_score'):
120
- yield k, v
121
- return
122
- # fallback to top-level flat structure
123
- for k, v in e.items():
124
- if isinstance(k, str) and k.endswith('_score'):
125
- yield k, v
115
+ results_container = entry.get('results', {}) if isinstance(entry, dict) else {}
126
116
 
127
- for key, raw in iter_score_fields(entry):
128
- metric_id = key[:-6] # strip "_score"
129
- metric_obj = parse_score(raw) if isinstance(raw, (str, bytes)) else (raw or {})
117
+ for eval_name, metric_obj in results_container.items():
118
+ if metric_obj is None:
119
+ continue # Skip metrics that did not run for this prompt
120
+ if not isinstance(metric_obj, dict):
121
+ continue
130
122
 
131
- display_name = metric_id.replace('_', ' ').title()
123
+ display_name = pascal_case_to_title(eval_name)
124
+ metric_id = METRIC_IDS.get(eval_name, eval_name.lower())
132
125
 
133
126
  # Candidate key patterns inside the parsed metric object
134
- score_val = pick(metric_obj, [metric_id, f'{metric_id}_score', 'score', 'value'])
135
- result_val = pick(metric_obj, [f'{metric_id}_result', 'result', 'status'])
136
- threshold_val = pick(metric_obj, [f'{metric_id}_threshold', 'threshold', 'min_threshold', 'expected'])
137
- reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason', 'rationale', 'explanation'])
127
+ score_val = pick(metric_obj, [metric_id])
128
+ result_val = pick(metric_obj, ['result'])
129
+ threshold_val = pick(metric_obj, ['threshold'])
130
+ reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason'])
138
131
 
139
132
  rows.append({
140
133
  'Metric': display_name,
@@ -145,107 +138,277 @@ def extract_metric_rows(entry):
145
138
  })
146
139
  return rows
147
140
 
148
- def generate_html_report(results):
141
+ def prompt_passed(entry):
142
+ """Determine whether a prompt passed all evaluations.
143
+
144
+ Centralized predicate used by both the summary banner and per-prompt
145
+ cards so that pass/fail counts stay consistent.
146
+
147
+ Called in two contexts:
148
+ - On un-flattened results: multi-turn threads have type="multi_turn"
149
+ and are evaluated via their summary.overall_status.
150
+ - On flattened results (banner counts): individual turns have
151
+ status="pass"/"fail"/"error" and are evaluated like single-turn items.
152
+
153
+ A prompt/turn fails when:
154
+ - it is a multi-turn thread with overall_status != 'pass', OR
155
+ - its status is explicitly 'fail' or 'error', OR
156
+ - any metric result is explicitly 'fail'.
157
+ Otherwise it is considered passed (including prompts with no metric rows).
158
+ """
159
+ if entry.get("type") == "multi_turn":
160
+ summary = entry.get("summary", {})
161
+ return summary.get("overall_status") == STATUS_PASS
162
+
163
+ status = str(entry.get('status', '')).lower()
164
+ if status in (STATUS_FAIL, STATUS_ERROR):
165
+ return False
166
+ metric_rows = extract_metric_rows(entry)
167
+ if any(str(row.get('Result', '')).lower() == STATUS_FAIL for row in metric_rows):
168
+ return False
169
+ return True
170
+
171
+ def _escape(text):
172
+ """HTML-escape user-controlled content to prevent XSS."""
173
+ if text is None:
174
+ return ""
175
+ return html_module.escape(str(text))
176
+
177
+ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=None):
178
+ aggregates = calculate_aggregate_statistics(results)
179
+
180
+ # Flatten multi-turn threads for banner counts (consistent with aggregate stats)
181
+ flat_items = []
182
+ for entry in results:
183
+ if entry.get("type") == "multi_turn":
184
+ flat_items.extend(entry.get("turns", []))
185
+ else:
186
+ flat_items.append(entry)
187
+ total_prompts = len(flat_items)
188
+
189
+ passed_prompts = sum(1 for item in flat_items if prompt_passed(item))
190
+ failed_prompt_count = total_prompts - passed_prompts
191
+ overall_pass_rate = (passed_prompts / total_prompts * 100) if total_prompts else 0
192
+ generated_utc = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
193
+
149
194
  html = [
150
- '<!DOCTYPE html>',
151
- '<html lang="en">',
152
- '<head>',
153
- ' <meta charset="UTF-8">',
154
- ' <title>M365 Copilot Agents Evaluation Scores Report</title>',
155
- ' <style>',
156
- ' body { font-family: Arial, sans-serif; margin: 2em; padding: 1.5em; background: #fafafa; }',
157
- ' h1 { margin-top: 0; color: #2c3e50; }',
158
- ' h2 { color: #34495e; margin-top: 2em; }',
159
- ' table { border-collapse: collapse; width: 100%; margin: 1.5em 0; }',
160
- ' th, td { border: 1px solid #ccc; padding: 10px 12px; text-align: left; vertical-align: top; }',
161
- ' th { background: #f4f4f4; font-weight: 600; }',
162
- ' details { padding: 0.75em 1em; border: 1px solid #ddd; border-radius: 6px; background: #fff; }',
163
- ' details summary { cursor: pointer; font-weight: 600; margin: -0.75em -1em 0.75em; padding: 0.75em 1em; background: #eef2f5; border-bottom: 1px solid #ddd; }',
164
- ' .pass { background: #d4edda; color: #155724; }',
165
- ' .fail { background: #f8d7da; color: #721c24; }',
166
- ' .score-details { font-size: 0.95em; color: #333; background: #f9f9f9; }',
167
- ' .score-details table { margin-top: 0.5em !important; }',
168
- ' .score-details th { background: #e9ecef; }',
169
- ' .aggregate-section { margin: 2em 0; }',
170
- ' .aggregate-section h2 { color: #34495e; margin-top: 0; }',
171
- ' .pass-rate-excellent { background: #d1f2eb; color: #186a3b; font-weight: bold; }',
172
- ' .pass-rate-good { background: #fef9e7; color: #7d6608; font-weight: bold; }',
173
- ' .pass-rate-poor { background: #fadbd8; color: #b03a2e; font-weight: bold; }',
174
- ' .individual-results { margin-top: 3em; }',
175
- ' </style>',
176
- '</head>',
177
- '<body>',
178
- ' <h1> M365 Copilot Agents Evaluation Report</h1>',
195
+ '<!DOCTYPE html>',
196
+ '<html lang="en">',
197
+ '<head>',
198
+ ' <meta charset="UTF-8">',
199
+ ' <meta name="viewport" content="width=device-width, initial-scale=1.0">',
200
+ ' <title>M365 Copilot Agents Evaluation Scores Report</title>',
201
+ ' <style>',
202
+ ' :root {',
203
+ ' --bg: #f6f7f9;',
204
+ ' --panel: #ffffff;',
205
+ ' --ink: #1f2937;',
206
+ ' --muted: #5b6473;',
207
+ ' --ok-bg: #e7f7ed;',
208
+ ' --ok-ink: #15603a;',
209
+ ' --bad-bg: #fdecec;',
210
+ ' --bad-ink: #8b1e2f;',
211
+ ' --border: #dde2ea;',
212
+ ' --bar-track: #e8edf5;',
213
+ ' --bar-fill: #2b6cb0;',
214
+ ' }',
215
+ ' * { box-sizing: border-box; }',
216
+ ' body { margin: 0; background: radial-gradient(circle at top right, #eef3ff 0%, var(--bg) 45%); color: var(--ink); font-family: "Segoe UI", Tahoma, sans-serif; }',
217
+ ' .container { max-width: 1100px; margin: 0 auto; padding: 24px 18px 40px; }',
218
+ ' h1 { margin: 0 0 8px; }',
219
+ ' .meta { color: var(--muted); margin-bottom: 20px; }',
220
+ ' .summary-banner { display: grid; grid-template-columns: repeat(4, minmax(140px, 1fr)); gap: 12px; margin: 16px 0 24px; }',
221
+ ' .summary-tile { background: var(--panel); border: 1px solid var(--border); border-radius: 12px; padding: 14px; }',
222
+ ' .summary-label { display: block; font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: .06em; }',
223
+ ' .summary-value { display: block; margin-top: 6px; font-size: 24px; font-weight: 700; }',
224
+ ' .section { background: var(--panel); border: 1px solid var(--border); border-radius: 12px; padding: 16px; margin-bottom: 18px; }',
225
+ ' .evaluator-row { margin: 12px 0; }',
226
+ ' .evaluator-head { display: flex; justify-content: space-between; gap: 8px; font-size: 14px; margin-bottom: 6px; }',
227
+ ' .progress-track { width: 100%; background: var(--bar-track); border-radius: 999px; overflow: hidden; height: 12px; }',
228
+ ' .progress-fill { height: 100%; background: var(--bar-fill); }',
229
+ ' .prompt-result-cards { display: grid; gap: 14px; }',
230
+ ' .prompt-card { border: 1px solid var(--border); border-radius: 12px; background: var(--panel); padding: 14px; overflow: hidden; overflow-wrap: break-word; }',
231
+ ' .status-chip { display: inline-block; padding: 3px 8px; border-radius: 999px; font-size: 12px; font-weight: 600; margin-bottom: 10px; }',
232
+ ' .status-pass { background: var(--ok-bg); color: var(--ok-ink); }',
233
+ ' .status-fail { background: var(--bad-bg); color: var(--bad-ink); }',
234
+ ' .prompt-card h3 { margin: 0 0 8px; font-size: 16px; }',
235
+ ' .kv { margin: 8px 0; }',
236
+ ' .kv > strong { display: block; min-width: 130px; color: var(--muted); margin-bottom: 4px; }',
237
+ ' .kv .md-content { padding-left: 4px; font-size: 14px; line-height: 1.5; }',
238
+ ' .kv .md-content p { margin: 4px 0; }',
239
+ ' .kv .md-content h1, .kv .md-content h2, .kv .md-content h3, .kv .md-content h4 { font-size: 14px; margin: 8px 0 4px; }',
240
+ ' .kv .md-content ul, .kv .md-content ol { margin: 4px 0; padding-left: 20px; }',
241
+ ' .kv .md-content li { margin: 2px 0; }',
242
+ ' .kv .md-content pre { background: #f4f6fa; padding: 8px; border-radius: 4px; overflow-x: auto; font-size: 13px; margin: 4px 0; }',
243
+ ' .kv .md-content code { font-size: 13px; background: #f4f6fa; padding: 1px 4px; border-radius: 3px; }',
244
+ ' .kv .md-content pre code { padding: 0; background: none; }',
245
+ ' .kv .md-content hr { border: none; border-top: 1px solid var(--border); margin: 6px 0; }',
246
+ ' .metric-table { width: 100%; border-collapse: collapse; margin-top: 10px; table-layout: fixed; }',
247
+ ' .metric-table th, .metric-table td { border: 1px solid var(--border); padding: 8px; text-align: left; vertical-align: top; }',
248
+ ' .metric-table th { background: #f4f6fa; }',
249
+ ' .metric-table .cell-pass { background: var(--ok-bg); color: var(--ok-ink); font-weight: 600; }',
250
+ ' .metric-table .cell-fail { background: var(--bad-bg); color: var(--bad-ink); font-weight: 600; }',
251
+ ' .evaluator-badge { display: inline-block; padding: 2px 8px; margin: 2px; border-radius: 4px; font-size: 0.85em; background: #e8eaf6; color: #283593; }',
252
+ ' .footer { margin-top: 20px; color: var(--muted); font-size: 13px; }',
253
+ ' @media (max-width: 760px) { .summary-banner { grid-template-columns: repeat(2, minmax(140px, 1fr)); } .kv strong { min-width: 90px; } }',
254
+ ' </style>',
255
+ '</head>',
256
+ '<body>',
257
+ ' <div class="container">',
258
+ ' <h1>M365 Copilot Agents Evaluation Report</h1>',
179
259
  ]
180
260
 
181
- # Add aggregate statistics if multiple results
182
- if len(results) > 1:
183
- aggregates = calculate_aggregate_statistics(results)
184
- if aggregates:
185
- html.append('<div class="aggregate-section">')
186
- html.append(f'<h2> Aggregate Statistics ({len(results)} prompts evaluated)</h2>')
187
-
188
- # Create aggregate table with same style as individual results
189
- html.append('<table>')
190
- html.append('<tr><th>Metric</th><th>Pass Rate</th><th>Passed</th><th>Failed</th><th>Avg Score</th><th>Threshold</th></tr>')
191
-
192
- for metric_name, stats in aggregates.items():
193
- pass_rate_class = 'pass-rate-excellent' if stats['pass_rate'] >= 80 else 'pass-rate-good' if stats['pass_rate'] >= 60 else 'pass-rate-poor'
194
- threshold_display = stats.get('threshold', 'N/A')
195
- html.append(f'''
196
- <tr>
197
- <td><strong>{metric_name}</strong></td>
198
- <td class="{pass_rate_class}">{stats['pass_rate']:.1f}%</td>
199
- <td class="pass">{stats['pass_count']}</td>
200
- <td class="fail">{stats['fail_count']}</td>
201
- <td>{stats['avg_score']:.2f}</td>
202
- <td>{threshold_display}</td>
203
- </tr>
204
- ''')
205
-
206
- html.append('</table>')
261
+ metadata_items = []
262
+ if agent_name:
263
+ metadata_items.append(f'<strong>Agent Name:</strong> {_escape(agent_name)}')
264
+ if agent_id:
265
+ metadata_items.append(f'<strong>Agent ID:</strong> {_escape(agent_id)}')
266
+ if cli_version:
267
+ metadata_items.append(f'<strong>CLI Version:</strong> {_escape(cli_version)}')
268
+ if metadata_items:
269
+ html.append(f' <p class="meta">{" | ".join(metadata_items)}</p>')
270
+
271
+ html.append(' <section class="summary-banner" aria-label="summary banner">')
272
+ html.append(f' <div class="summary-tile"><span class="summary-label">Total Prompts</span><span class="summary-value">{total_prompts}</span></div>')
273
+ html.append(f' <div class="summary-tile"><span class="summary-label">Passed</span><span class="summary-value">{passed_prompts}</span></div>')
274
+ html.append(f' <div class="summary-tile"><span class="summary-label">Failed</span><span class="summary-value">{failed_prompt_count}</span></div>')
275
+ html.append(f' <div class="summary-tile"><span class="summary-label">Pass Rate</span><span class="summary-value">{overall_pass_rate:.1f}%</span></div>')
276
+ html.append(' </section>')
277
+
278
+ html.append(' <section class="section">')
279
+ html.append(' <h2>Aggregate Evaluator Statistics</h2>')
280
+ if aggregates:
281
+ for metric_name, stats in aggregates.items():
282
+ pass_rate = stats.get('pass_rate', 0)
283
+ prompts_evaluated = stats.get('prompts_evaluated', stats.get('total_evaluated', 0))
284
+ html.append('<div class="evaluator-row">')
285
+ avg_score = stats.get('avg_score', 0)
286
+ threshold = stats.get('threshold', 'N/A')
287
+ html.append(
288
+ f'<div class="evaluator-head"><strong>{_escape(metric_name)}</strong>'
289
+ f'<span>{pass_rate:.1f}% ({stats.get("pass_count", 0)} pass / {stats.get("fail_count", 0)} fail, {prompts_evaluated}/{total_prompts} prompts)'
290
+ f' &middot; Avg Score: {avg_score:.2f} &middot; Threshold: {_escape(str(threshold))}</span></div>'
291
+ )
292
+ html.append('<div class="progress-track" role="progressbar" aria-valuemin="0" aria-valuemax="100" aria-valuenow="{:.1f}" aria-label="{} pass rate">'.format(pass_rate, _escape(metric_name)))
293
+ html.append(f'<div class="progress-fill" style="width:{pass_rate:.1f}%"></div></div>')
207
294
  html.append('</div>')
295
+ else:
296
+ html.append(' <p>No evaluator aggregates available.</p>')
297
+ html.append(' </section>')
208
298
 
209
- # Individual results section
210
- html.append('<div class="individual-results">')
211
- html.append('<h2> Individual Results</h2>')
299
+ html.append(' <section class="section">')
300
+ html.append(' <h2>Prompt Results</h2>')
301
+ html.append(' <div class="prompt-result-cards">')
212
302
 
213
303
  for idx, entry in enumerate(results, 1):
214
- html.append(f'<h3>Prompt {idx}: {entry.get("prompt", "")}</h3>')
215
- html.append('<table>')
216
- html.append('<tr><th>Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("response", ""))))
217
- html.append('<tr><th>Expected Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("expected_response", ""))))
218
- html.append('</table>')
219
-
220
- score_rows = extract_metric_rows(entry)
221
- score_df = pd.DataFrame(score_rows)
222
-
223
- def highlight_result(val):
224
- lv = str(val).lower()
225
- if lv == 'pass':
226
- return 'background-color: #d4edda; color: #155724;'
227
- elif lv == 'fail':
228
- return 'background-color: #f8d7da; color: #721c24;'
229
- return ''
230
-
231
- score_html = (
232
- score_df.style
233
- .map(highlight_result, subset=['Result'])
234
- .set_table_attributes('style="margin-top:1em;"')
235
- .hide(axis="index")
236
- .to_html()
237
- )
238
-
239
- html.append('<details open class="score-details"><summary>Show/Hide Evaluation Metric Scores</summary>')
240
- html.append(score_html)
241
- html.append('</details>')
242
-
243
- html.append(f'<p><em>Entry {idx} of {len(results)}</em></p>')
244
- if idx < len(results): # Don't add HR after last item
245
- html.append('<hr>')
246
-
247
- html.append('</div>') # Close individual-results
248
- html.append('</body></html>')
249
- html.append('<p><small>Generated by M365 Copilot Agents Evaluation CLI.</small></p>')
304
+ if entry.get("type") == "multi_turn":
305
+ # Multi-turn thread card
306
+ thread_name = _escape(entry.get("name", "Unnamed Thread"))
307
+ summary = entry.get("summary", {})
308
+ status = summary.get("overall_status", STATUS_UNKNOWN)
309
+ is_passed = status == STATUS_PASS
310
+ chip_class = 'status-pass' if is_passed else 'status-fail'
311
+ chip_text = 'PASSED' if is_passed else ('PARTIAL' if status == STATUS_PARTIAL else 'FAILED')
312
+
313
+ html.append(' <article class="prompt-card">')
314
+ html.append(f' <span class="status-chip {chip_class}">{chip_text}</span>')
315
+ html.append(f' <h3>Thread {idx}: {thread_name}</h3>')
316
+ html.append(f' <p>{summary.get("turns_passed", 0)}/{summary.get("turns_total", 0)} turns passed</p>')
317
+
318
+ for t_idx, turn in enumerate(entry.get("turns", []), 1):
319
+ turn_status = turn.get("status", STATUS_UNKNOWN)
320
+ turn_chip_class = 'status-pass' if turn_status == STATUS_PASS else 'status-fail'
321
+ turn_chip_text = {
322
+ STATUS_PASS: 'PASSED',
323
+ STATUS_FAIL: 'FAILED',
324
+ STATUS_ERROR: 'ERROR',
325
+ }.get(turn_status, turn_status.upper())
326
+
327
+ html.append(f' <div style="margin-left:16px;padding:8px 0;border-top:1px solid var(--border);">')
328
+ html.append(f' <span class="status-chip {turn_chip_class}">{turn_chip_text}</span>')
329
+ html.append(f' <strong>Turn {t_idx}:</strong> {_escape(turn.get("prompt", ""))}')
330
+
331
+ turn_evaluators = turn.get('evaluators_ran', [])
332
+ if turn_evaluators:
333
+ badges = ''.join(f'<span class="evaluator-badge">{_escape(e)}</span>' for e in turn_evaluators)
334
+ html.append(f' <p>Evaluators: {badges}</p>')
335
+
336
+ if turn.get("response"):
337
+ html.append(f' <div class="kv"><strong>Response:</strong><div class="md-content">{markdown.markdown(_escape(turn.get("response", "")))}</div></div>')
338
+ if turn.get("error"):
339
+ html.append(f' <p class="kv"><strong>Error:</strong> {_escape(turn["error"])}</p>')
340
+
341
+ turn_rows = extract_metric_rows(turn)
342
+ if turn_rows:
343
+ html.append(' <table class="metric-table">')
344
+ html.append(' <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
345
+ for row in turn_rows:
346
+ result_val = str(row.get("Result", "")).lower()
347
+ result_class = ' class="cell-pass"' if result_val == STATUS_PASS else ' class="cell-fail"' if result_val == STATUS_FAIL else ""
348
+ html.append(
349
+ '<tr>'
350
+ f'<td>{_escape(row.get("Metric", ""))}</td>'
351
+ f'<td{result_class}>{_escape(str(row.get("Result", "")))}</td>'
352
+ f'<td>{_escape(str(row.get("Score", "")))}</td>'
353
+ f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
354
+ f'<td>{_escape(str(row.get("Reason", "")))}</td>'
355
+ '</tr>'
356
+ )
357
+ html.append(' </table>')
358
+
359
+ html.append(' </div>')
360
+
361
+ html.append(' </article>')
362
+ else:
363
+ score_rows = extract_metric_rows(entry)
364
+ is_passed = prompt_passed(entry)
365
+ chip_class = 'status-pass' if is_passed else 'status-fail'
366
+ chip_text = 'PASSED' if is_passed else 'FAILED'
367
+
368
+ html.append(' <article class="prompt-card">')
369
+ html.append(f' <span class="status-chip {chip_class}">{chip_text}</span>')
370
+ html.append(f' <h3>Prompt {idx}: {_escape(entry.get("prompt", ""))}</h3>')
371
+
372
+ evaluators_ran = entry.get('evaluators_ran', [])
373
+ if evaluators_ran:
374
+ badges = ''.join(f'<span class="evaluator-badge">{_escape(e)}</span>' for e in evaluators_ran)
375
+ html.append(f' <p>Evaluators: {badges}</p>')
376
+
377
+ html.append(f' <div class="kv"><strong>Response:</strong><div class="md-content">{markdown.markdown(_escape(entry.get("response", "")))}</div></div>')
378
+ html.append(f' <div class="kv"><strong>Expected:</strong><div class="md-content">{markdown.markdown(_escape(entry.get("expected_response", "")))}</div></div>')
379
+
380
+ error_details = entry.get('error_details') or entry.get('errorDetails')
381
+ if error_details:
382
+ html.append(f' <p class="kv"><strong>Error Details:</strong> {_escape(error_details)}</p>')
383
+
384
+ if score_rows:
385
+ html.append(' <table class="metric-table">')
386
+ html.append(' <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
387
+ for row in score_rows:
388
+ result_val = str(row.get("Result", "")).lower()
389
+ result_class = ' class="cell-pass"' if result_val == STATUS_PASS else ' class="cell-fail"' if result_val == STATUS_FAIL else ""
390
+ html.append(
391
+ '<tr>'
392
+ f'<td>{_escape(row.get("Metric", ""))}</td>'
393
+ f'<td{result_class}>{_escape(str(row.get("Result", "")))}</td>'
394
+ f'<td>{_escape(str(row.get("Score", "")))}</td>'
395
+ f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
396
+ f'<td>{_escape(str(row.get("Reason", "")))}</td>'
397
+ '</tr>'
398
+ )
399
+ html.append(' </table>')
400
+
401
+ html.append(' </article>')
402
+
403
+ if not results:
404
+ html.append(' <p>No prompt results found.</p>')
405
+
406
+ html.append(' </div>')
407
+ html.append(' </section>')
408
+
409
+ html.append(f' <p class="footer">Generated by M365 Copilot Agents Evaluation CLI &mdash; <time datetime="{generated_utc}">{generated_utc} UTC</time></p>')
410
+ html.append(' </div>')
411
+ html.append('</body>')
412
+ html.append('</html>')
250
413
 
251
414
  return '\n'.join(html)