@microsoft/m365-copilot-eval 1.6.0-preview.1 → 1.7.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/package.json +2 -2
- package/schema/v1/eval-document.schema.json +144 -333
- package/schema/v1/examples/invalid/error-result-with-score.json +16 -0
- package/schema/v1/examples/invalid/missing-error-on-error.json +13 -0
- package/schema/v1/examples/valid/multi-turn-output.json +2 -0
- package/schema/v1/examples/valid/scenarios-with-mixed-errors.json +239 -0
- package/src/clients/cli/common.py +8 -14
- package/src/clients/cli/error_messages.py +91 -0
- package/src/clients/cli/evaluation_runner.py +108 -97
- package/src/clients/cli/evaluator_resolver.py +8 -33
- package/src/clients/cli/generate_report.py +125 -96
- package/src/clients/cli/readme.md +1 -1
- package/src/clients/cli/result_writer.py +129 -110
- package/src/clients/cli/status_derivation.py +91 -0
- package/src/clients/node-js/config/default.js +1 -1
- package/src/clients/node-js/lib/env-loader.js +20 -13
|
@@ -2,6 +2,7 @@ import html as html_module
|
|
|
2
2
|
import markdown
|
|
3
3
|
from common import METRIC_IDS, STATUS_PASS, STATUS_FAIL, STATUS_ERROR, STATUS_PARTIAL, STATUS_UNKNOWN, pascal_case_to_title
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
|
+
from evaluator_resolver import EVALUATOR_REGISTRY
|
|
5
6
|
|
|
6
7
|
def calculate_aggregate_statistics(results):
|
|
7
8
|
"""Calculate aggregate statistics across all evaluation results.
|
|
@@ -36,6 +37,7 @@ def calculate_aggregate_statistics(results):
|
|
|
36
37
|
scores = []
|
|
37
38
|
pass_count = 0
|
|
38
39
|
fail_count = 0
|
|
40
|
+
error_count = 0
|
|
39
41
|
threshold_value = None
|
|
40
42
|
prompts_evaluated = 0
|
|
41
43
|
|
|
@@ -49,19 +51,20 @@ def calculate_aggregate_statistics(results):
|
|
|
49
51
|
prompts_evaluated += 1
|
|
50
52
|
try:
|
|
51
53
|
score = parsed_data.get(metric_id)
|
|
52
|
-
|
|
53
54
|
result_status = parsed_data.get('result')
|
|
54
|
-
|
|
55
55
|
threshold = parsed_data.get('threshold')
|
|
56
56
|
|
|
57
57
|
if score is not None:
|
|
58
58
|
scores.append(float(score))
|
|
59
59
|
|
|
60
60
|
if result_status:
|
|
61
|
-
|
|
61
|
+
status = str(result_status).lower()
|
|
62
|
+
if status == STATUS_PASS:
|
|
62
63
|
pass_count += 1
|
|
63
|
-
elif
|
|
64
|
+
elif status == STATUS_FAIL:
|
|
64
65
|
fail_count += 1
|
|
66
|
+
elif status == STATUS_ERROR:
|
|
67
|
+
error_count += 1
|
|
65
68
|
|
|
66
69
|
if threshold is not None and threshold_value is None:
|
|
67
70
|
threshold_value = threshold
|
|
@@ -69,17 +72,33 @@ def calculate_aggregate_statistics(results):
|
|
|
69
72
|
except (ValueError, TypeError):
|
|
70
73
|
continue
|
|
71
74
|
|
|
72
|
-
|
|
75
|
+
# Surface evaluators that ran in any form — including those whose only
|
|
76
|
+
# attempts errored. Suppressing error-only evaluators would hide them
|
|
77
|
+
# from the aggregate report (SC-001).
|
|
78
|
+
if scores or pass_count > 0 or fail_count > 0 or error_count > 0:
|
|
73
79
|
avg_score = sum(scores) / len(scores) if scores else 0
|
|
80
|
+
# Per-evaluator pass rate is "agreement among completed evaluations" —
|
|
81
|
+
# errors are surfaced separately as a count, not folded into the rate.
|
|
74
82
|
total_evaluated = pass_count + fail_count
|
|
75
83
|
pass_rate = (pass_count / total_evaluated * 100) if total_evaluated > 0 else 0
|
|
76
84
|
|
|
85
|
+
# Defensive fallback: if no per-entry threshold was recorded
|
|
86
|
+
# (shouldn't happen — both successful and errored runtime entries
|
|
87
|
+
# carry it — but guard against malformed input), use the registry
|
|
88
|
+
# default. Evaluators with no registry default (e.g. ExactMatch)
|
|
89
|
+
# legitimately have threshold=None.
|
|
90
|
+
if threshold_value is None:
|
|
91
|
+
registry_entry = EVALUATOR_REGISTRY.get(eval_name)
|
|
92
|
+
if registry_entry is not None:
|
|
93
|
+
threshold_value = registry_entry.default_threshold
|
|
94
|
+
|
|
77
95
|
aggregates[display_name] = {
|
|
78
96
|
'total_prompts': len(flat_results),
|
|
79
97
|
'prompts_evaluated': prompts_evaluated,
|
|
80
98
|
'total_evaluated': total_evaluated,
|
|
81
99
|
'pass_count': pass_count,
|
|
82
100
|
'fail_count': fail_count,
|
|
101
|
+
'error_count': error_count,
|
|
83
102
|
'pass_rate': pass_rate,
|
|
84
103
|
'avg_score': avg_score,
|
|
85
104
|
'threshold': threshold_value,
|
|
@@ -127,7 +146,13 @@ def extract_metric_rows(entry):
|
|
|
127
146
|
score_val = pick(metric_obj, [metric_id])
|
|
128
147
|
result_val = pick(metric_obj, ['result'])
|
|
129
148
|
threshold_val = pick(metric_obj, ['threshold'])
|
|
130
|
-
|
|
149
|
+
# Errored entries carry the per-evaluator failure description in `error`
|
|
150
|
+
# (e.g. "Evaluator failed: Connection timeout"). Surface it in the Reason
|
|
151
|
+
# column so HTML readers see why the evaluator couldn't produce a result.
|
|
152
|
+
if result_val == STATUS_ERROR:
|
|
153
|
+
reason_val = metric_obj.get('error', '')
|
|
154
|
+
else:
|
|
155
|
+
reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason'])
|
|
131
156
|
|
|
132
157
|
rows.append({
|
|
133
158
|
'Metric': display_name,
|
|
@@ -138,35 +163,29 @@ def extract_metric_rows(entry):
|
|
|
138
163
|
})
|
|
139
164
|
return rows
|
|
140
165
|
|
|
141
|
-
|
|
142
|
-
""
|
|
166
|
+
# Status value -> CSS class applied to the status chip on each card.
_CHIP_CLASSES = {
    STATUS_PASS: "status-pass",
    STATUS_FAIL: "status-fail",
    STATUS_PARTIAL: "status-partial",
    STATUS_ERROR: "status-error",
}


def _chip_class(status):
    """Map a status value to its chip CSS class.

    The status is stringified and lower-cased before the lookup, matching
    how the aggregate statistics and metric-table code compare values
    against the ``STATUS_*`` constants. This keeps a value such as
    ``"PASS"`` from being styled as an error, and means a non-string
    (or unhashable) status cannot raise during the dictionary lookup.

    Args:
        status: Status value taken from a result entry, thread summary or turn.

    Returns:
        The CSS class name for the chip. Unknown statuses fall back to
        ``"status-error"``.
    """
    return _CHIP_CLASSES.get(str(status).lower(), "status-error")
|
|
152
177
|
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
178
|
+
|
|
179
|
+
def classify_attempt(entry):
    """Return the status used to classify an attempt or a thread.

    For an un-flattened multi-turn thread (``entry["type"] == "multi_turn"``)
    the value comes from ``entry["summary"]["overall_status"]``. For any other
    entry (a single-turn item or a per-turn entry from a flattened thread) it
    comes from ``entry["status"]``.

    Args:
        entry: Result dictionary for one attempt, turn or thread.

    Returns:
        The lower-cased status string (normally one of pass, fail, partial,
        error), or ``STATUS_UNKNOWN`` when the status is missing or empty.
    """
    if entry.get("type") == "multi_turn":
        # ``summary`` may be present but null; treat that like a missing summary
        # instead of failing on ``None.get``.
        summary = entry.get("summary") or {}
        status = summary.get("overall_status")
    else:
        status = entry.get("status")

    # A key that exists with a null/empty value must not leak out: callers
    # upper-case the result for display and use it as a key into the counts.
    if not status:
        return STATUS_UNKNOWN

    # Normalise case so the value lines up with the lower-case STATUS_* keys
    # the rest of the report compares against.
    return str(status).lower()
|
|
170
189
|
|
|
171
190
|
def _escape(text):
|
|
172
191
|
"""HTML-escape user-controlled content to prevent XSS."""
|
|
@@ -174,6 +193,35 @@ def _escape(text):
|
|
|
174
193
|
return ""
|
|
175
194
|
return html_module.escape(str(text))
|
|
176
195
|
|
|
196
|
+
|
|
197
|
+
# Result value -> CSS class applied to the Result cell of a metric table.
# Results without an entry here get no class attribute at all.
_CELL_CLASSES = {
    STATUS_PASS: "cell-pass",
    STATUS_FAIL: "cell-fail",
    STATUS_ERROR: "cell-error",
}


def _render_metric_table(html, rows):
    """Append a metric-table block to ``html``; do nothing when ``rows`` is empty.

    Args:
        html: List of HTML fragments being accumulated; extended in place.
        rows: Iterable of row dictionaries read via the keys ``Metric``,
            ``Result``, ``Score``, ``Threshold`` and ``Reason``.
    """
    if rows:
        html.append(' <table class="metric-table">')
        html.append(' <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
        for metric_row in rows:
            result_text = str(metric_row.get("Result", ""))
            # The class lookup is case-insensitive; the displayed text keeps
            # its original casing.
            css_name = _CELL_CLASSES.get(result_text.lower())
            if css_name:
                class_attr = f' class="{css_name}"'
            else:
                class_attr = ""
            cells = [
                f'<td>{_escape(metric_row.get("Metric", ""))}</td>',
                f'<td{class_attr}>{_escape(result_text)}</td>',
                f'<td>{_escape(str(metric_row.get("Score", "")))}</td>',
                f'<td>{_escape(str(metric_row.get("Threshold", "")))}</td>',
                f'<td>{_escape(str(metric_row.get("Reason", "")))}</td>',
            ]
            html.append('<tr>' + ''.join(cells) + '</tr>')
        html.append(' </table>')
|
|
224
|
+
|
|
177
225
|
def generate_html_report(results, agent_name=None, agent_id=None, cli_version=None):
|
|
178
226
|
aggregates = calculate_aggregate_statistics(results)
|
|
179
227
|
|
|
@@ -186,9 +234,14 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
|
|
|
186
234
|
flat_items.append(entry)
|
|
187
235
|
total_prompts = len(flat_items)
|
|
188
236
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
237
|
+
counts = {STATUS_PASS: 0, STATUS_FAIL: 0, STATUS_PARTIAL: 0, STATUS_ERROR: 0}
|
|
238
|
+
for item in flat_items:
|
|
239
|
+
c = classify_attempt(item)
|
|
240
|
+
if c in counts:
|
|
241
|
+
counts[c] += 1
|
|
242
|
+
incomplete_count = counts[STATUS_PARTIAL] + counts[STATUS_ERROR]
|
|
243
|
+
decisive_count = counts[STATUS_PASS] + counts[STATUS_FAIL]
|
|
244
|
+
overall_pass_rate = (counts[STATUS_PASS] / decisive_count * 100) if decisive_count else 0
|
|
192
245
|
generated_utc = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
|
193
246
|
|
|
194
247
|
html = [
|
|
@@ -208,6 +261,10 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
|
|
|
208
261
|
' --ok-ink: #15603a;',
|
|
209
262
|
' --bad-bg: #fdecec;',
|
|
210
263
|
' --bad-ink: #8b1e2f;',
|
|
264
|
+
' --warn-bg: #fff4e0;',
|
|
265
|
+
' --warn-ink: #8a5a00;',
|
|
266
|
+
' --neutral-bg: #ececec;',
|
|
267
|
+
' --neutral-ink: #4a4a4a;',
|
|
211
268
|
' --border: #dde2ea;',
|
|
212
269
|
' --bar-track: #e8edf5;',
|
|
213
270
|
' --bar-fill: #2b6cb0;',
|
|
@@ -217,7 +274,7 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
|
|
|
217
274
|
' .container { max-width: 1100px; margin: 0 auto; padding: 24px 18px 40px; }',
|
|
218
275
|
' h1 { margin: 0 0 8px; }',
|
|
219
276
|
' .meta { color: var(--muted); margin-bottom: 20px; }',
|
|
220
|
-
' .summary-banner { display: grid; grid-template-columns: repeat(
|
|
277
|
+
' .summary-banner { display: grid; grid-template-columns: repeat(5, minmax(120px, 1fr)); gap: 12px; margin: 16px 0 24px; }',
|
|
221
278
|
' .summary-tile { background: var(--panel); border: 1px solid var(--border); border-radius: 12px; padding: 14px; }',
|
|
222
279
|
' .summary-label { display: block; font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: .06em; }',
|
|
223
280
|
' .summary-value { display: block; margin-top: 6px; font-size: 24px; font-weight: 700; }',
|
|
@@ -231,6 +288,8 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
|
|
|
231
288
|
' .status-chip { display: inline-block; padding: 3px 8px; border-radius: 999px; font-size: 12px; font-weight: 600; margin-bottom: 10px; }',
|
|
232
289
|
' .status-pass { background: var(--ok-bg); color: var(--ok-ink); }',
|
|
233
290
|
' .status-fail { background: var(--bad-bg); color: var(--bad-ink); }',
|
|
291
|
+
' .status-partial { background: var(--warn-bg); color: var(--warn-ink); }',
|
|
292
|
+
' .status-error { background: var(--neutral-bg); color: var(--neutral-ink); }',
|
|
234
293
|
' .prompt-card h3 { margin: 0 0 8px; font-size: 16px; }',
|
|
235
294
|
' .kv { margin: 8px 0; }',
|
|
236
295
|
' .kv > strong { display: block; min-width: 130px; color: var(--muted); margin-bottom: 4px; }',
|
|
@@ -248,9 +307,10 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
|
|
|
248
307
|
' .metric-table th { background: #f4f6fa; }',
|
|
249
308
|
' .metric-table .cell-pass { background: var(--ok-bg); color: var(--ok-ink); font-weight: 600; }',
|
|
250
309
|
' .metric-table .cell-fail { background: var(--bad-bg); color: var(--bad-ink); font-weight: 600; }',
|
|
310
|
+
' .metric-table .cell-error { background: var(--neutral-bg); color: var(--neutral-ink); font-weight: 600; }',
|
|
251
311
|
' .evaluator-badge { display: inline-block; padding: 2px 8px; margin: 2px; border-radius: 4px; font-size: 0.85em; background: #e8eaf6; color: #283593; }',
|
|
252
312
|
' .footer { margin-top: 20px; color: var(--muted); font-size: 13px; }',
|
|
253
|
-
' @media (max-width: 760px) { .summary-banner { grid-template-columns: repeat(2, minmax(
|
|
313
|
+
' @media (max-width: 760px) { .summary-banner { grid-template-columns: repeat(2, minmax(120px, 1fr)); } .kv strong { min-width: 90px; } }',
|
|
254
314
|
' </style>',
|
|
255
315
|
'</head>',
|
|
256
316
|
'<body>',
|
|
@@ -269,9 +329,10 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
|
|
|
269
329
|
html.append(f' <p class="meta">{" | ".join(metadata_items)}</p>')
|
|
270
330
|
|
|
271
331
|
html.append(' <section class="summary-banner" aria-label="summary banner">')
|
|
272
|
-
html.append(f' <div class="summary-tile"><span class="summary-label">Total
|
|
273
|
-
html.append(f' <div class="summary-tile"><span class="summary-label">Passed</span><span class="summary-value">{
|
|
274
|
-
html.append(f' <div class="summary-tile"><span class="summary-label">Failed</span><span class="summary-value">{
|
|
332
|
+
html.append(f' <div class="summary-tile"><span class="summary-label">Total</span><span class="summary-value">{total_prompts}</span></div>')
|
|
333
|
+
html.append(f' <div class="summary-tile"><span class="summary-label">Passed</span><span class="summary-value">{counts[STATUS_PASS]}</span></div>')
|
|
334
|
+
html.append(f' <div class="summary-tile"><span class="summary-label">Failed</span><span class="summary-value">{counts[STATUS_FAIL]}</span></div>')
|
|
335
|
+
html.append(f' <div class="summary-tile"><span class="summary-label">Incomplete</span><span class="summary-value">{incomplete_count}</span></div>')
|
|
275
336
|
html.append(f' <div class="summary-tile"><span class="summary-label">Pass Rate</span><span class="summary-value">{overall_pass_rate:.1f}%</span></div>')
|
|
276
337
|
html.append(' </section>')
|
|
277
338
|
|
|
@@ -283,11 +344,14 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
|
|
|
283
344
|
prompts_evaluated = stats.get('prompts_evaluated', stats.get('total_evaluated', 0))
|
|
284
345
|
html.append('<div class="evaluator-row">')
|
|
285
346
|
avg_score = stats.get('avg_score', 0)
|
|
286
|
-
|
|
347
|
+
threshold_val = stats.get('threshold')
|
|
348
|
+
threshold_str = "N/A" if threshold_val is None else str(threshold_val)
|
|
349
|
+
error_count = stats.get('error_count', 0)
|
|
350
|
+
error_clause = f' / {error_count} error' if error_count else ''
|
|
287
351
|
html.append(
|
|
288
352
|
f'<div class="evaluator-head"><strong>{_escape(metric_name)}</strong>'
|
|
289
|
-
f'<span>{pass_rate:.1f}% ({stats.get("pass_count", 0)} pass / {stats.get("fail_count", 0)} fail, {prompts_evaluated}/{total_prompts} prompts)'
|
|
290
|
-
f' · Avg Score: {avg_score:.2f} · Threshold: {_escape(
|
|
353
|
+
f'<span>{pass_rate:.1f}% ({stats.get("pass_count", 0)} pass / {stats.get("fail_count", 0)} fail{error_clause}, {prompts_evaluated}/{total_prompts} prompts)'
|
|
354
|
+
f' · Avg Score: {avg_score:.2f} · Threshold: {_escape(threshold_str)}</span></div>'
|
|
291
355
|
)
|
|
292
356
|
html.append('<div class="progress-track" role="progressbar" aria-valuemin="0" aria-valuemax="100" aria-valuenow="{:.1f}" aria-label="{} pass rate">'.format(pass_rate, _escape(metric_name)))
|
|
293
357
|
html.append(f'<div class="progress-fill" style="width:{pass_rate:.1f}%"></div></div>')
|
|
@@ -305,27 +369,18 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
|
|
|
305
369
|
# Multi-turn thread card
|
|
306
370
|
thread_name = _escape(entry.get("name", "Unnamed Thread"))
|
|
307
371
|
summary = entry.get("summary", {})
|
|
308
|
-
|
|
309
|
-
is_passed = status == STATUS_PASS
|
|
310
|
-
chip_class = 'status-pass' if is_passed else 'status-fail'
|
|
311
|
-
chip_text = 'PASSED' if is_passed else ('PARTIAL' if status == STATUS_PARTIAL else 'FAILED')
|
|
372
|
+
thread_status = summary.get("overall_status", STATUS_UNKNOWN)
|
|
312
373
|
|
|
313
374
|
html.append(' <article class="prompt-card">')
|
|
314
|
-
html.append(f' <span class="status-chip {
|
|
375
|
+
html.append(f' <span class="status-chip {_chip_class(thread_status)}">{thread_status.upper()}</span>')
|
|
315
376
|
html.append(f' <h3>Thread {idx}: {thread_name}</h3>')
|
|
316
377
|
html.append(f' <p>{summary.get("turns_passed", 0)}/{summary.get("turns_total", 0)} turns passed</p>')
|
|
317
378
|
|
|
318
379
|
for t_idx, turn in enumerate(entry.get("turns", []), 1):
|
|
319
380
|
turn_status = turn.get("status", STATUS_UNKNOWN)
|
|
320
|
-
turn_chip_class = 'status-pass' if turn_status == STATUS_PASS else 'status-fail'
|
|
321
|
-
turn_chip_text = {
|
|
322
|
-
STATUS_PASS: 'PASSED',
|
|
323
|
-
STATUS_FAIL: 'FAILED',
|
|
324
|
-
STATUS_ERROR: 'ERROR',
|
|
325
|
-
}.get(turn_status, turn_status.upper())
|
|
326
381
|
|
|
327
382
|
html.append(f' <div style="margin-left:16px;padding:8px 0;border-top:1px solid var(--border);">')
|
|
328
|
-
html.append(f' <span class="status-chip {
|
|
383
|
+
html.append(f' <span class="status-chip {_chip_class(turn_status)}">{turn_status.upper()}</span>')
|
|
329
384
|
html.append(f' <strong>Turn {t_idx}:</strong> {_escape(turn.get("prompt", ""))}')
|
|
330
385
|
|
|
331
386
|
turn_evaluators = turn.get('evaluators_ran', [])
|
|
@@ -335,38 +390,24 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
|
|
|
335
390
|
|
|
336
391
|
if turn.get("response"):
|
|
337
392
|
html.append(f' <div class="kv"><strong>Response:</strong><div class="md-content">{markdown.markdown(_escape(turn.get("response", "")))}</div></div>')
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
result_val = str(row.get("Result", "")).lower()
|
|
347
|
-
result_class = ' class="cell-pass"' if result_val == STATUS_PASS else ' class="cell-fail"' if result_val == STATUS_FAIL else ""
|
|
348
|
-
html.append(
|
|
349
|
-
'<tr>'
|
|
350
|
-
f'<td>{_escape(row.get("Metric", ""))}</td>'
|
|
351
|
-
f'<td{result_class}>{_escape(str(row.get("Result", "")))}</td>'
|
|
352
|
-
f'<td>{_escape(str(row.get("Score", "")))}</td>'
|
|
353
|
-
f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
|
|
354
|
-
f'<td>{_escape(str(row.get("Reason", "")))}</td>'
|
|
355
|
-
'</tr>'
|
|
356
|
-
)
|
|
357
|
-
html.append(' </table>')
|
|
393
|
+
turn_error = turn.get("error")
|
|
394
|
+
if turn_error:
|
|
395
|
+
html.append(
|
|
396
|
+
f' <p class="kv" data-error-code="{_escape(turn_error.get("code", ""))}">'
|
|
397
|
+
f'<strong>Error:</strong> {_escape(turn_error.get("message", ""))}</p>'
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
_render_metric_table(html, extract_metric_rows(turn))
|
|
358
401
|
|
|
359
402
|
html.append(' </div>')
|
|
360
403
|
|
|
361
404
|
html.append(' </article>')
|
|
362
405
|
else:
|
|
363
406
|
score_rows = extract_metric_rows(entry)
|
|
364
|
-
|
|
365
|
-
chip_class = 'status-pass' if is_passed else 'status-fail'
|
|
366
|
-
chip_text = 'PASSED' if is_passed else 'FAILED'
|
|
407
|
+
item_status = classify_attempt(entry)
|
|
367
408
|
|
|
368
409
|
html.append(' <article class="prompt-card">')
|
|
369
|
-
html.append(f' <span class="status-chip {
|
|
410
|
+
html.append(f' <span class="status-chip {_chip_class(item_status)}">{item_status.upper()}</span>')
|
|
370
411
|
html.append(f' <h3>Prompt {idx}: {_escape(entry.get("prompt", ""))}</h3>')
|
|
371
412
|
|
|
372
413
|
evaluators_ran = entry.get('evaluators_ran', [])
|
|
@@ -377,26 +418,14 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
|
|
|
377
418
|
html.append(f' <div class="kv"><strong>Response:</strong><div class="md-content">{markdown.markdown(_escape(entry.get("response", "")))}</div></div>')
|
|
378
419
|
html.append(f' <div class="kv"><strong>Expected:</strong><div class="md-content">{markdown.markdown(_escape(entry.get("expected_response", "")))}</div></div>')
|
|
379
420
|
|
|
380
|
-
|
|
381
|
-
if
|
|
382
|
-
html.append(
|
|
421
|
+
item_error = entry.get('error')
|
|
422
|
+
if item_error:
|
|
423
|
+
html.append(
|
|
424
|
+
f' <p class="kv" data-error-code="{_escape(item_error.get("code", ""))}">'
|
|
425
|
+
f'<strong>Error:</strong> {_escape(item_error.get("message", ""))}</p>'
|
|
426
|
+
)
|
|
383
427
|
|
|
384
|
-
|
|
385
|
-
html.append(' <table class="metric-table">')
|
|
386
|
-
html.append(' <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
|
|
387
|
-
for row in score_rows:
|
|
388
|
-
result_val = str(row.get("Result", "")).lower()
|
|
389
|
-
result_class = ' class="cell-pass"' if result_val == STATUS_PASS else ' class="cell-fail"' if result_val == STATUS_FAIL else ""
|
|
390
|
-
html.append(
|
|
391
|
-
'<tr>'
|
|
392
|
-
f'<td>{_escape(row.get("Metric", ""))}</td>'
|
|
393
|
-
f'<td{result_class}>{_escape(str(row.get("Result", "")))}</td>'
|
|
394
|
-
f'<td>{_escape(str(row.get("Score", "")))}</td>'
|
|
395
|
-
f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
|
|
396
|
-
f'<td>{_escape(str(row.get("Reason", "")))}</td>'
|
|
397
|
-
'</tr>'
|
|
398
|
-
)
|
|
399
|
-
html.append(' </table>')
|
|
428
|
+
_render_metric_table(html, score_rows)
|
|
400
429
|
|
|
401
430
|
html.append(' </article>')
|
|
402
431
|
|
|
@@ -36,7 +36,7 @@ AZURE_AI_API_KEY="<azure-openai-key>"
|
|
|
36
36
|
AZURE_AI_API_VERSION="2024-12-01-preview"
|
|
37
37
|
AZURE_AI_MODEL_NAME="gpt-4o-mini"
|
|
38
38
|
|
|
39
|
-
# Your Tenant
|
|
39
|
+
# Your Tenant ID (or use TEAMS_APP_TENANT_ID from ATK .env.local)
|
|
40
40
|
TENANT_ID="<aad-tenant-id>"
|
|
41
41
|
|
|
42
42
|
# Optional: default agent id (overridable via --m365-agent-id)
|