@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/README.md +135 -100
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +7 -0
  4. package/schema/v1/eval-document.schema.json +143 -11
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  11. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  12. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  13. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  14. package/schema/version.json +2 -2
  15. package/src/clients/cli/agent_selector.py +74 -0
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
  18. package/src/clients/cli/api_clients/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/base_agent_client.py +77 -0
  20. package/src/clients/cli/cli_args.py +136 -0
  21. package/src/clients/cli/cli_logging/cli_logger.py +33 -0
  22. package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
  23. package/src/clients/cli/cli_logging/logging_utils.py +0 -1
  24. package/src/clients/cli/common.py +64 -0
  25. package/src/clients/cli/env_validator.py +73 -0
  26. package/src/clients/cli/evaluation_runner.py +653 -0
  27. package/src/clients/cli/evaluator_resolver.py +9 -6
  28. package/src/clients/cli/generate_report.py +272 -129
  29. package/src/clients/cli/main.py +157 -1174
  30. package/src/clients/cli/parallel_executor.py +57 -0
  31. package/src/clients/cli/prompt_loader.py +148 -0
  32. package/src/clients/cli/readme.md +9 -53
  33. package/src/clients/cli/requirements.txt +1 -1
  34. package/src/clients/cli/response_extractor.py +4 -603
  35. package/src/clients/cli/result_writer.py +488 -0
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +82 -20
  40. package/src/clients/node-js/config/default.js +12 -11
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +14 -20
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
@@ -1,9 +1,7 @@
1
1
  import html as html_module
2
- import json
3
2
  import markdown
4
- from common import METRIC_IDS, pascal_case_to_title
5
- import pandas as pd
6
- from pathlib import Path
3
+ from common import METRIC_IDS, STATUS_PASS, STATUS_FAIL, STATUS_ERROR, STATUS_PARTIAL, STATUS_UNKNOWN, pascal_case_to_title
4
+ from datetime import datetime, timezone
7
5
 
8
6
  def calculate_aggregate_statistics(results):
9
7
  """Calculate aggregate statistics across all evaluation results.
@@ -15,9 +13,18 @@ def calculate_aggregate_statistics(results):
15
13
  if not results:
16
14
  return {}
17
15
 
16
+ # Flatten: multi-turn threads contribute each turn as a separate item
17
+ flat_results = []
18
+ for result in results:
19
+ if result.get("type") == "multi_turn":
20
+ for turn in result.get("turns", []):
21
+ flat_results.append(turn)
22
+ else:
23
+ flat_results.append(result)
24
+
18
25
  # Discover all metric keys across all results
19
26
  all_metric_keys = set()
20
- for result in results:
27
+ for result in flat_results:
21
28
  all_metric_keys.update(result.get('results', {}).keys())
22
29
 
23
30
  aggregates = {}
@@ -32,15 +39,15 @@ def calculate_aggregate_statistics(results):
32
39
  threshold_value = None
33
40
  prompts_evaluated = 0
34
41
 
35
- for result in results:
36
- metric_data = result.get('results', {}).get(eval_name)
37
- if metric_data is None:
42
+ for result in flat_results:
43
+ parsed_data = result.get('results', {}).get(eval_name)
44
+ if parsed_data is None:
38
45
  continue # This metric did not run for this prompt
46
+ if not isinstance(parsed_data, dict):
47
+ continue
39
48
 
40
49
  prompts_evaluated += 1
41
50
  try:
42
- parsed_data = json.loads(metric_data) if isinstance(metric_data, str) else metric_data
43
-
44
51
  score = parsed_data.get(metric_id)
45
52
 
46
53
  result_status = parsed_data.get('result')
@@ -51,15 +58,15 @@ def calculate_aggregate_statistics(results):
51
58
  scores.append(float(score))
52
59
 
53
60
  if result_status:
54
- if str(result_status).lower() == 'pass':
61
+ if str(result_status).lower() == STATUS_PASS:
55
62
  pass_count += 1
56
- elif str(result_status).lower() == 'fail':
63
+ elif str(result_status).lower() == STATUS_FAIL:
57
64
  fail_count += 1
58
65
 
59
66
  if threshold is not None and threshold_value is None:
60
67
  threshold_value = threshold
61
68
 
62
- except (json.JSONDecodeError, ValueError, TypeError):
69
+ except (ValueError, TypeError):
63
70
  continue
64
71
 
65
72
  if scores or pass_count > 0 or fail_count > 0:
@@ -68,7 +75,7 @@ def calculate_aggregate_statistics(results):
68
75
  pass_rate = (pass_count / total_evaluated * 100) if total_evaluated > 0 else 0
69
76
 
70
77
  aggregates[display_name] = {
71
- 'total_prompts': len(results),
78
+ 'total_prompts': len(flat_results),
72
79
  'prompts_evaluated': prompts_evaluated,
73
80
  'total_evaluated': total_evaluated,
74
81
  'pass_count': pass_count,
@@ -81,12 +88,6 @@ def calculate_aggregate_statistics(results):
81
88
 
82
89
  return aggregates
83
90
 
84
- def parse_score(score_str):
85
- try:
86
- return json.loads(score_str)
87
- except Exception:
88
- return {}
89
-
90
91
  def format_score(score):
91
92
  try:
92
93
  val = float(score)
@@ -113,11 +114,11 @@ def extract_metric_rows(entry):
113
114
 
114
115
  results_container = entry.get('results', {}) if isinstance(entry, dict) else {}
115
116
 
116
- for eval_name, raw in results_container.items():
117
- if raw is None:
117
+ for eval_name, metric_obj in results_container.items():
118
+ if metric_obj is None:
118
119
  continue # Skip metrics that did not run for this prompt
119
-
120
- metric_obj = parse_score(raw) if isinstance(raw, (str, bytes)) else (raw or {})
120
+ if not isinstance(metric_obj, dict):
121
+ continue
121
122
 
122
123
  display_name = pascal_case_to_title(eval_name)
123
124
  metric_id = METRIC_IDS.get(eval_name, eval_name.lower())
@@ -137,6 +138,36 @@ def extract_metric_rows(entry):
137
138
  })
138
139
  return rows
139
140
 
141
+ def prompt_passed(entry):
142
+ """Determine whether a prompt passed all evaluations.
143
+
144
+ Centralized predicate used by both the summary banner and per-prompt
145
+ cards so that pass/fail counts stay consistent.
146
+
147
+ Called in two contexts:
148
+ - On un-flattened results: multi-turn threads have type="multi_turn"
149
+ and are evaluated via their summary.overall_status.
150
+ - On flattened results (banner counts): individual turns have
151
+ status="pass"/"fail"/"error" and are evaluated like single-turn items.
152
+
153
+ A prompt/turn fails when:
154
+ - it is a multi-turn thread with overall_status != 'pass', OR
155
+ - its status is explicitly 'fail' or 'error', OR
156
+ - any metric result is explicitly 'fail'.
157
+ Otherwise it is considered passed (including prompts with no metric rows).
158
+ """
159
+ if entry.get("type") == "multi_turn":
160
+ summary = entry.get("summary", {})
161
+ return summary.get("overall_status") == STATUS_PASS
162
+
163
+ status = str(entry.get('status', '')).lower()
164
+ if status in (STATUS_FAIL, STATUS_ERROR):
165
+ return False
166
+ metric_rows = extract_metric_rows(entry)
167
+ if any(str(row.get('Result', '')).lower() == STATUS_FAIL for row in metric_rows):
168
+ return False
169
+ return True
170
+
140
171
  def _escape(text):
141
172
  """HTML-escape user-controlled content to prevent XSS."""
142
173
  if text is None:
@@ -144,40 +175,89 @@ def _escape(text):
144
175
  return html_module.escape(str(text))
145
176
 
146
177
  def generate_html_report(results, agent_name=None, agent_id=None, cli_version=None):
178
+ aggregates = calculate_aggregate_statistics(results)
179
+
180
+ # Flatten multi-turn threads for banner counts (consistent with aggregate stats)
181
+ flat_items = []
182
+ for entry in results:
183
+ if entry.get("type") == "multi_turn":
184
+ flat_items.extend(entry.get("turns", []))
185
+ else:
186
+ flat_items.append(entry)
187
+ total_prompts = len(flat_items)
188
+
189
+ passed_prompts = sum(1 for item in flat_items if prompt_passed(item))
190
+ failed_prompt_count = total_prompts - passed_prompts
191
+ overall_pass_rate = (passed_prompts / total_prompts * 100) if total_prompts else 0
192
+ generated_utc = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
193
+
147
194
  html = [
148
- '<!DOCTYPE html>',
149
- '<html lang="en">',
150
- '<head>',
151
- ' <meta charset="UTF-8">',
152
- ' <title>M365 Copilot Agents Evaluation Scores Report</title>',
153
- ' <style>',
154
- ' body { font-family: Arial, sans-serif; margin: 2em; padding: 1.5em; background: #fafafa; }',
155
- ' h1 { margin-top: 0; color: #2c3e50; }',
156
- ' h2 { color: #34495e; margin-top: 2em; }',
157
- ' table { border-collapse: collapse; width: 100%; margin: 1.5em 0; }',
158
- ' th, td { border: 1px solid #ccc; padding: 10px 12px; text-align: left; vertical-align: top; }',
159
- ' th { background: #f4f4f4; font-weight: 600; }',
160
- ' details { padding: 0.75em 1em; border: 1px solid #ddd; border-radius: 6px; background: #fff; }',
161
- ' details summary { cursor: pointer; font-weight: 600; margin: -0.75em -1em 0.75em; padding: 0.75em 1em; background: #eef2f5; border-bottom: 1px solid #ddd; }',
162
- ' .pass { background: #d4edda; color: #155724; }',
163
- ' .fail { background: #f8d7da; color: #721c24; }',
164
- ' .score-details { font-size: 0.95em; color: #333; background: #f9f9f9; }',
165
- ' .score-details table { margin-top: 0.5em !important; }',
166
- ' .score-details th { background: #e9ecef; }',
167
- ' .aggregate-section { margin: 2em 0; }',
168
- ' .aggregate-section h2 { color: #34495e; margin-top: 0; }',
169
- ' .pass-rate-excellent { background: #d1f2eb; color: #186a3b; font-weight: bold; }',
170
- ' .pass-rate-good { background: #fef9e7; color: #7d6608; font-weight: bold; }',
171
- ' .pass-rate-poor { background: #fadbd8; color: #b03a2e; font-weight: bold; }',
172
- ' .individual-results { margin-top: 3em; }',
173
- ' .evaluator-badge { display: inline-block; padding: 2px 8px; margin: 2px; border-radius: 4px; font-size: 0.85em; background: #e8eaf6; color: #283593; }',
174
- ' </style>',
175
- '</head>',
176
- '<body>',
177
- ' <h1> M365 Copilot Agents Evaluation Report</h1>',
195
+ '<!DOCTYPE html>',
196
+ '<html lang="en">',
197
+ '<head>',
198
+ ' <meta charset="UTF-8">',
199
+ ' <meta name="viewport" content="width=device-width, initial-scale=1.0">',
200
+ ' <title>M365 Copilot Agents Evaluation Scores Report</title>',
201
+ ' <style>',
202
+ ' :root {',
203
+ ' --bg: #f6f7f9;',
204
+ ' --panel: #ffffff;',
205
+ ' --ink: #1f2937;',
206
+ ' --muted: #5b6473;',
207
+ ' --ok-bg: #e7f7ed;',
208
+ ' --ok-ink: #15603a;',
209
+ ' --bad-bg: #fdecec;',
210
+ ' --bad-ink: #8b1e2f;',
211
+ ' --border: #dde2ea;',
212
+ ' --bar-track: #e8edf5;',
213
+ ' --bar-fill: #2b6cb0;',
214
+ ' }',
215
+ ' * { box-sizing: border-box; }',
216
+ ' body { margin: 0; background: radial-gradient(circle at top right, #eef3ff 0%, var(--bg) 45%); color: var(--ink); font-family: "Segoe UI", Tahoma, sans-serif; }',
217
+ ' .container { max-width: 1100px; margin: 0 auto; padding: 24px 18px 40px; }',
218
+ ' h1 { margin: 0 0 8px; }',
219
+ ' .meta { color: var(--muted); margin-bottom: 20px; }',
220
+ ' .summary-banner { display: grid; grid-template-columns: repeat(4, minmax(140px, 1fr)); gap: 12px; margin: 16px 0 24px; }',
221
+ ' .summary-tile { background: var(--panel); border: 1px solid var(--border); border-radius: 12px; padding: 14px; }',
222
+ ' .summary-label { display: block; font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: .06em; }',
223
+ ' .summary-value { display: block; margin-top: 6px; font-size: 24px; font-weight: 700; }',
224
+ ' .section { background: var(--panel); border: 1px solid var(--border); border-radius: 12px; padding: 16px; margin-bottom: 18px; }',
225
+ ' .evaluator-row { margin: 12px 0; }',
226
+ ' .evaluator-head { display: flex; justify-content: space-between; gap: 8px; font-size: 14px; margin-bottom: 6px; }',
227
+ ' .progress-track { width: 100%; background: var(--bar-track); border-radius: 999px; overflow: hidden; height: 12px; }',
228
+ ' .progress-fill { height: 100%; background: var(--bar-fill); }',
229
+ ' .prompt-result-cards { display: grid; gap: 14px; }',
230
+ ' .prompt-card { border: 1px solid var(--border); border-radius: 12px; background: var(--panel); padding: 14px; overflow: hidden; overflow-wrap: break-word; }',
231
+ ' .status-chip { display: inline-block; padding: 3px 8px; border-radius: 999px; font-size: 12px; font-weight: 600; margin-bottom: 10px; }',
232
+ ' .status-pass { background: var(--ok-bg); color: var(--ok-ink); }',
233
+ ' .status-fail { background: var(--bad-bg); color: var(--bad-ink); }',
234
+ ' .prompt-card h3 { margin: 0 0 8px; font-size: 16px; }',
235
+ ' .kv { margin: 8px 0; }',
236
+ ' .kv > strong { display: block; min-width: 130px; color: var(--muted); margin-bottom: 4px; }',
237
+ ' .kv .md-content { padding-left: 4px; font-size: 14px; line-height: 1.5; }',
238
+ ' .kv .md-content p { margin: 4px 0; }',
239
+ ' .kv .md-content h1, .kv .md-content h2, .kv .md-content h3, .kv .md-content h4 { font-size: 14px; margin: 8px 0 4px; }',
240
+ ' .kv .md-content ul, .kv .md-content ol { margin: 4px 0; padding-left: 20px; }',
241
+ ' .kv .md-content li { margin: 2px 0; }',
242
+ ' .kv .md-content pre { background: #f4f6fa; padding: 8px; border-radius: 4px; overflow-x: auto; font-size: 13px; margin: 4px 0; }',
243
+ ' .kv .md-content code { font-size: 13px; background: #f4f6fa; padding: 1px 4px; border-radius: 3px; }',
244
+ ' .kv .md-content pre code { padding: 0; background: none; }',
245
+ ' .kv .md-content hr { border: none; border-top: 1px solid var(--border); margin: 6px 0; }',
246
+ ' .metric-table { width: 100%; border-collapse: collapse; margin-top: 10px; table-layout: fixed; }',
247
+ ' .metric-table th, .metric-table td { border: 1px solid var(--border); padding: 8px; text-align: left; vertical-align: top; }',
248
+ ' .metric-table th { background: #f4f6fa; }',
249
+ ' .metric-table .cell-pass { background: var(--ok-bg); color: var(--ok-ink); font-weight: 600; }',
250
+ ' .metric-table .cell-fail { background: var(--bad-bg); color: var(--bad-ink); font-weight: 600; }',
251
+ ' .evaluator-badge { display: inline-block; padding: 2px 8px; margin: 2px; border-radius: 4px; font-size: 0.85em; background: #e8eaf6; color: #283593; }',
252
+ ' .footer { margin-top: 20px; color: var(--muted); font-size: 13px; }',
253
+ ' @media (max-width: 760px) { .summary-banner { grid-template-columns: repeat(2, minmax(140px, 1fr)); } .kv strong { min-width: 90px; } }',
254
+ ' </style>',
255
+ '</head>',
256
+ '<body>',
257
+ ' <div class="container">',
258
+ ' <h1>M365 Copilot Agents Evaluation Report</h1>',
178
259
  ]
179
260
 
180
- # Add metadata section
181
261
  metadata_items = []
182
262
  if agent_name:
183
263
  metadata_items.append(f'<strong>Agent Name:</strong> {_escape(agent_name)}')
@@ -186,86 +266,149 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
186
266
  if cli_version:
187
267
  metadata_items.append(f'<strong>CLI Version:</strong> {_escape(cli_version)}')
188
268
  if metadata_items:
189
- html.append(f' <p style="color: #666; font-size: 0.95em;">{" &nbsp;|&nbsp; ".join(metadata_items)}</p>')
190
-
191
- # Add aggregate statistics if multiple results
192
- if len(results) > 1:
193
- aggregates = calculate_aggregate_statistics(results)
194
- if aggregates:
195
- html.append('<div class="aggregate-section">')
196
- html.append(f'<h2> Aggregate Statistics ({len(results)} prompts evaluated)</h2>')
197
-
198
- html.append('<table>')
199
- html.append('<tr><th>Metric</th><th>Prompts</th><th>Pass Rate</th><th>Passed</th><th>Failed</th><th>Avg Score</th><th>Threshold</th></tr>')
200
-
201
- for metric_name, stats in aggregates.items():
202
- pass_rate_class = 'pass-rate-excellent' if stats['pass_rate'] >= 80 else 'pass-rate-good' if stats['pass_rate'] >= 60 else 'pass-rate-poor'
203
- threshold_display = _escape(str(stats.get('threshold', 'N/A')))
204
- prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
205
- total_prompts = stats.get('total_prompts', len(results))
206
- html.append(f'''
207
- <tr>
208
- <td><strong>{_escape(metric_name)}</strong></td>
209
- <td>{prompts_evaluated}/{total_prompts}</td>
210
- <td class="{pass_rate_class}">{stats['pass_rate']:.1f}%</td>
211
- <td class="pass">{stats['pass_count']}</td>
212
- <td class="fail">{stats['fail_count']}</td>
213
- <td>{stats['avg_score']:.2f}</td>
214
- <td>{threshold_display}</td>
215
- </tr>
216
- ''')
217
-
218
- html.append('</table>')
269
+ html.append(f' <p class="meta">{" | ".join(metadata_items)}</p>')
270
+
271
+ html.append(' <section class="summary-banner" aria-label="summary banner">')
272
+ html.append(f' <div class="summary-tile"><span class="summary-label">Total Prompts</span><span class="summary-value">{total_prompts}</span></div>')
273
+ html.append(f' <div class="summary-tile"><span class="summary-label">Passed</span><span class="summary-value">{passed_prompts}</span></div>')
274
+ html.append(f' <div class="summary-tile"><span class="summary-label">Failed</span><span class="summary-value">{failed_prompt_count}</span></div>')
275
+ html.append(f' <div class="summary-tile"><span class="summary-label">Pass Rate</span><span class="summary-value">{overall_pass_rate:.1f}%</span></div>')
276
+ html.append(' </section>')
277
+
278
+ html.append(' <section class="section">')
279
+ html.append(' <h2>Aggregate Evaluator Statistics</h2>')
280
+ if aggregates:
281
+ for metric_name, stats in aggregates.items():
282
+ pass_rate = stats.get('pass_rate', 0)
283
+ prompts_evaluated = stats.get('prompts_evaluated', stats.get('total_evaluated', 0))
284
+ html.append('<div class="evaluator-row">')
285
+ avg_score = stats.get('avg_score', 0)
286
+ threshold = stats.get('threshold', 'N/A')
287
+ html.append(
288
+ f'<div class="evaluator-head"><strong>{_escape(metric_name)}</strong>'
289
+ f'<span>{pass_rate:.1f}% ({stats.get("pass_count", 0)} pass / {stats.get("fail_count", 0)} fail, {prompts_evaluated}/{total_prompts} prompts)'
290
+ f' &middot; Avg Score: {avg_score:.2f} &middot; Threshold: {_escape(str(threshold))}</span></div>'
291
+ )
292
+ html.append('<div class="progress-track" role="progressbar" aria-valuemin="0" aria-valuemax="100" aria-valuenow="{:.1f}" aria-label="{} pass rate">'.format(pass_rate, _escape(metric_name)))
293
+ html.append(f'<div class="progress-fill" style="width:{pass_rate:.1f}%"></div></div>')
219
294
  html.append('</div>')
295
+ else:
296
+ html.append(' <p>No evaluator aggregates available.</p>')
297
+ html.append(' </section>')
220
298
 
221
- # Individual results section
222
- html.append('<div class="individual-results">')
223
- html.append('<h2> Individual Results</h2>')
299
+ html.append(' <section class="section">')
300
+ html.append(' <h2>Prompt Results</h2>')
301
+ html.append(' <div class="prompt-result-cards">')
224
302
 
225
303
  for idx, entry in enumerate(results, 1):
226
- html.append(f'<h3>Prompt {idx}: {_escape(entry.get("prompt", ""))}</h3>')
227
-
228
- # Show evaluator badges for this prompt
229
- evaluators_ran = entry.get('evaluators_ran', [])
230
- if evaluators_ran:
231
- badges = ''.join(f'<span class="evaluator-badge">{_escape(e)}</span>' for e in evaluators_ran)
232
- html.append(f'<p>Evaluators: {badges}</p>')
233
-
234
- html.append('<table>')
235
- html.append('<tr><th>Response</th><td>{}</td></tr>'.format(markdown.markdown(_escape(entry.get("response", "")))))
236
- html.append('<tr><th>Expected Response</th><td>{}</td></tr>'.format(markdown.markdown(_escape(entry.get("expected_response", "")))))
237
- html.append('</table>')
238
-
239
- score_rows = extract_metric_rows(entry)
240
- if score_rows:
241
- score_df = pd.DataFrame(score_rows)
242
-
243
- def highlight_result(val):
244
- lv = str(val).lower()
245
- if lv == 'pass':
246
- return 'background-color: #d4edda; color: #155724;'
247
- elif lv == 'fail':
248
- return 'background-color: #f8d7da; color: #721c24;'
249
- return ''
250
-
251
- score_html = (
252
- score_df.style
253
- .map(highlight_result, subset=['Result'])
254
- .set_table_attributes('style="margin-top:1em;"')
255
- .hide(axis="index")
256
- .to_html()
257
- )
304
+ if entry.get("type") == "multi_turn":
305
+ # Multi-turn thread card
306
+ thread_name = _escape(entry.get("name", "Unnamed Thread"))
307
+ summary = entry.get("summary", {})
308
+ status = summary.get("overall_status", STATUS_UNKNOWN)
309
+ is_passed = status == STATUS_PASS
310
+ chip_class = 'status-pass' if is_passed else 'status-fail'
311
+ chip_text = 'PASSED' if is_passed else ('PARTIAL' if status == STATUS_PARTIAL else 'FAILED')
312
+
313
+ html.append(' <article class="prompt-card">')
314
+ html.append(f' <span class="status-chip {chip_class}">{chip_text}</span>')
315
+ html.append(f' <h3>Thread {idx}: {thread_name}</h3>')
316
+ html.append(f' <p>{summary.get("turns_passed", 0)}/{summary.get("turns_total", 0)} turns passed</p>')
317
+
318
+ for t_idx, turn in enumerate(entry.get("turns", []), 1):
319
+ turn_status = turn.get("status", STATUS_UNKNOWN)
320
+ turn_chip_class = 'status-pass' if turn_status == STATUS_PASS else 'status-fail'
321
+ turn_chip_text = {
322
+ STATUS_PASS: 'PASSED',
323
+ STATUS_FAIL: 'FAILED',
324
+ STATUS_ERROR: 'ERROR',
325
+ }.get(turn_status, turn_status.upper())
326
+
327
+ html.append(f' <div style="margin-left:16px;padding:8px 0;border-top:1px solid var(--border);">')
328
+ html.append(f' <span class="status-chip {turn_chip_class}">{turn_chip_text}</span>')
329
+ html.append(f' <strong>Turn {t_idx}:</strong> {_escape(turn.get("prompt", ""))}')
330
+
331
+ turn_evaluators = turn.get('evaluators_ran', [])
332
+ if turn_evaluators:
333
+ badges = ''.join(f'<span class="evaluator-badge">{_escape(e)}</span>' for e in turn_evaluators)
334
+ html.append(f' <p>Evaluators: {badges}</p>')
335
+
336
+ if turn.get("response"):
337
+ html.append(f' <div class="kv"><strong>Response:</strong><div class="md-content">{markdown.markdown(_escape(turn.get("response", "")))}</div></div>')
338
+ if turn.get("error"):
339
+ html.append(f' <p class="kv"><strong>Error:</strong> {_escape(turn["error"])}</p>')
340
+
341
+ turn_rows = extract_metric_rows(turn)
342
+ if turn_rows:
343
+ html.append(' <table class="metric-table">')
344
+ html.append(' <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
345
+ for row in turn_rows:
346
+ result_val = str(row.get("Result", "")).lower()
347
+ result_class = ' class="cell-pass"' if result_val == STATUS_PASS else ' class="cell-fail"' if result_val == STATUS_FAIL else ""
348
+ html.append(
349
+ '<tr>'
350
+ f'<td>{_escape(row.get("Metric", ""))}</td>'
351
+ f'<td{result_class}>{_escape(str(row.get("Result", "")))}</td>'
352
+ f'<td>{_escape(str(row.get("Score", "")))}</td>'
353
+ f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
354
+ f'<td>{_escape(str(row.get("Reason", "")))}</td>'
355
+ '</tr>'
356
+ )
357
+ html.append(' </table>')
358
+
359
+ html.append(' </div>')
360
+
361
+ html.append(' </article>')
362
+ else:
363
+ score_rows = extract_metric_rows(entry)
364
+ is_passed = prompt_passed(entry)
365
+ chip_class = 'status-pass' if is_passed else 'status-fail'
366
+ chip_text = 'PASSED' if is_passed else 'FAILED'
367
+
368
+ html.append(' <article class="prompt-card">')
369
+ html.append(f' <span class="status-chip {chip_class}">{chip_text}</span>')
370
+ html.append(f' <h3>Prompt {idx}: {_escape(entry.get("prompt", ""))}</h3>')
371
+
372
+ evaluators_ran = entry.get('evaluators_ran', [])
373
+ if evaluators_ran:
374
+ badges = ''.join(f'<span class="evaluator-badge">{_escape(e)}</span>' for e in evaluators_ran)
375
+ html.append(f' <p>Evaluators: {badges}</p>')
376
+
377
+ html.append(f' <div class="kv"><strong>Response:</strong><div class="md-content">{markdown.markdown(_escape(entry.get("response", "")))}</div></div>')
378
+ html.append(f' <div class="kv"><strong>Expected:</strong><div class="md-content">{markdown.markdown(_escape(entry.get("expected_response", "")))}</div></div>')
379
+
380
+ error_details = entry.get('error_details') or entry.get('errorDetails')
381
+ if error_details:
382
+ html.append(f' <p class="kv"><strong>Error Details:</strong> {_escape(error_details)}</p>')
383
+
384
+ if score_rows:
385
+ html.append(' <table class="metric-table">')
386
+ html.append(' <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
387
+ for row in score_rows:
388
+ result_val = str(row.get("Result", "")).lower()
389
+ result_class = ' class="cell-pass"' if result_val == STATUS_PASS else ' class="cell-fail"' if result_val == STATUS_FAIL else ""
390
+ html.append(
391
+ '<tr>'
392
+ f'<td>{_escape(row.get("Metric", ""))}</td>'
393
+ f'<td{result_class}>{_escape(str(row.get("Result", "")))}</td>'
394
+ f'<td>{_escape(str(row.get("Score", "")))}</td>'
395
+ f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
396
+ f'<td>{_escape(str(row.get("Reason", "")))}</td>'
397
+ '</tr>'
398
+ )
399
+ html.append(' </table>')
400
+
401
+ html.append(' </article>')
258
402
 
259
- html.append('<details open class="score-details"><summary>Show/Hide Evaluation Metric Scores</summary>')
260
- html.append(score_html)
261
- html.append('</details>')
403
+ if not results:
404
+ html.append(' <p>No prompt results found.</p>')
262
405
 
263
- html.append(f'<p><em>Entry {idx} of {len(results)}</em></p>')
264
- if idx < len(results): # Don't add HR after last item
265
- html.append('<hr>')
406
+ html.append(' </div>')
407
+ html.append(' </section>')
266
408
 
267
- html.append('</div>') # Close individual-results
268
- html.append('</body></html>')
269
- html.append('<p><small>Generated by M365 Copilot Agents Evaluation CLI.</small></p>')
409
+ html.append(f' <p class="footer">Generated by M365 Copilot Agents Evaluation CLI &mdash; <time datetime="{generated_utc}">{generated_utc} UTC</time></p>')
410
+ html.append(' </div>')
411
+ html.append('</body>')
412
+ html.append('</html>')
270
413
 
271
414
  return '\n'.join(html)