@microsoft/m365-copilot-eval 1.2.0-preview.1 → 1.3.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -4
- package/package.json +3 -2
- package/schema/CHANGELOG.md +8 -0
- package/schema/v1/eval-document.schema.json +117 -1
- package/schema/v1/examples/valid/comprehensive.json +27 -2
- package/schema/version.json +2 -2
- package/src/clients/cli/cli_logging/__init__.py +0 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +55 -0
- package/src/clients/cli/cli_logging/logging_utils.py +145 -0
- package/src/clients/cli/common.py +51 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
- package/src/clients/cli/evaluator_resolver.py +150 -0
- package/src/clients/cli/generate_report.py +130 -110
- package/src/clients/cli/main.py +545 -236
- package/src/clients/cli/readme.md +14 -7
- package/src/clients/cli/requirements.txt +1 -0
- package/src/clients/cli/response_extractor.py +32 -14
- package/src/clients/node-js/bin/runevals.js +58 -28
- package/src/clients/node-js/config/default.js +1 -1
package/src/clients/cli/evaluator_resolver.py (new file)

@@ -0,0 +1,150 @@
+"""Evaluator resolution module for per-prompt evaluator configuration.
+
+Resolves which evaluators to run on each prompt by merging prompt-level config
+with file-level defaults and system defaults, following extend/replace modes.
+"""
+
+import difflib
+import logging
+from typing import Any, Dict, Optional, Tuple
+
+from common import (
+    RELEVANCE,
+    COHERENCE,
+    GROUNDEDNESS,
+    TOOL_CALL_ACCURACY,
+    CITATIONS,
+    EXACT_MATCH,
+    PARTIAL_MATCH,
+    REQUIRES_AZURE_OPENAI,
+    REQUIRES_TOOL_DEFINITIONS,
+    SYSTEM_DEFAULT_EVALUATORS,
+    RegistryEntry,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# Static registry of available evaluators per data-model.md
+EVALUATOR_REGISTRY: Dict[str, RegistryEntry] = {
+    RELEVANCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
+    COHERENCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
+    GROUNDEDNESS: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
+    TOOL_CALL_ACCURACY: RegistryEntry(type="tool", requires=[REQUIRES_AZURE_OPENAI, REQUIRES_TOOL_DEFINITIONS], default_threshold=3),
+    CITATIONS: RegistryEntry(type="non-llm", requires=[], default_threshold=1),
+    EXACT_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=None),
+    PARTIAL_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=0.5),
+}
+
+
+def validate_evaluator_names(evaluator_map: Dict[str, Any]) -> None:
+    """Validate that all evaluator names in the map exist in the registry.
+
+    Raises ValueError with categorized valid names and
+    'Did you mean?' suggestions for close matches.
+    """
+    invalid_names = [name for name in evaluator_map if name not in EVALUATOR_REGISTRY]
+    if not invalid_names:
+        return
+
+    # Categorize valid evaluators for the error message
+    llm_evals = [n for n, r in EVALUATOR_REGISTRY.items() if r.type == "llm"]
+    tool_evals = [n for n, r in EVALUATOR_REGISTRY.items() if r.type == "tool"]
+    non_llm_evals = [n for n, r in EVALUATOR_REGISTRY.items() if r.type == "non-llm"]
+
+    lines = []
+    for name in invalid_names:
+        lines.append(f'Unknown evaluator "{name}".')
+        close = difflib.get_close_matches(name, EVALUATOR_REGISTRY.keys(), n=1, cutoff=0.5)
+        if close:
+            lines.append(f'Did you mean "{close[0]}"?')
+
+    lines.append("")
+    lines.append("Valid evaluators are:")
+    lines.append(f" - {', '.join(llm_evals)} (LLM-based)")
+    lines.append(f" - {', '.join(tool_evals)} (tool evaluation)")
+    lines.append(f" - {', '.join(non_llm_evals)} (non-LLM)")
+
+    raise ValueError("\n".join(lines))
+
+
+def check_prerequisites(
+    evaluator_name: str,
+    available_context: Dict[str, bool],
+) -> Tuple[bool, Optional[str]]:
+    """Check if prerequisites for an evaluator are available.
+
+    Returns (True, None) if all prerequisites are met, or
+    (False, warning_message) if a prerequisite is missing.
+    """
+    registry_entry = EVALUATOR_REGISTRY.get(evaluator_name)
+    if not registry_entry:
+        return False, f"Unknown evaluator: {evaluator_name}"
+
+    for req in registry_entry.requires:
+        if not available_context.get(req, False):
+            msg = (
+                f"Skipping evaluator '{evaluator_name}': "
+                f"missing prerequisite '{req}'"
+            )
+            return False, msg
+
+    return True, None
+
+
+def resolve_default_evaluators(file_defaults: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """Resolve effective default evaluators, falling back to system defaults.
+
+    Precedence: file-level defaults > system defaults.
+    An explicit empty dict means "no default evaluators".
+    """
+    # File-level defaults (including explicit empty dict)
+    if file_defaults is not None:
+        return file_defaults
+
+    # System defaults
+    return {name: {} for name in SYSTEM_DEFAULT_EVALUATORS}
+
+
+def resolve_evaluators_for_prompt(
+    prompt_evaluators: Optional[Dict[str, Any]],
+    evaluators_mode: str,
+    prompt: str,
+    default_evaluators: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Resolve which evaluators to run for a single prompt.
+
+    Args:
+        prompt_evaluators: Per-prompt evaluator config (None if not specified).
+        evaluators_mode: How to combine with defaults ("extend" or "replace").
+        prompt: The prompt text (used in warning messages).
+        default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
+
+    Returns:
+        Resolved EvaluatorMap (dict of evaluator_name -> options).
+    """
+    # No prompt-level config → use defaults
+    if prompt_evaluators is None:
+        return dict(default_evaluators)
+
+    if evaluators_mode == "replace":
+        if not prompt_evaluators:
+            logger.warning(
+                "Empty evaluators with 'replace' mode for prompt: '%s'. "
+                "No evaluators will run.",
+                prompt[:80],
+            )
+        return dict(prompt_evaluators)
+
+    # mode == "extend": merge defaults with prompt overrides (prompt wins on conflict)
+    merged = dict(default_evaluators)
+    merged.update(prompt_evaluators)
+    return merged
+
+
+def get_evaluator_threshold(evaluator_name: str, options: Dict[str, Any]) -> Optional[float]:
+    """Get the threshold for an evaluator, with option override support."""
+    if "threshold" in options:
+        return options["threshold"]
+    entry = EVALUATOR_REGISTRY.get(evaluator_name)
+    return entry.default_threshold if entry else None
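To make the resolution semantics concrete, here is a minimal sketch of the extend/replace merge behavior using plain dicts. The evaluator names and option shapes below are illustrative examples, not values taken from the package:

```python
# Illustrative only: mirrors the semantics of resolve_evaluators_for_prompt.
defaults = {"Relevance": {}, "Coherence": {}}  # resolved file/system defaults

# "extend": defaults merged with the prompt's evaluators;
# the prompt's options win when the same evaluator appears in both.
prompt_cfg = {"Relevance": {"threshold": 4}, "ExactMatch": {}}
merged = dict(defaults)
merged.update(prompt_cfg)
assert merged == {"Relevance": {"threshold": 4}, "Coherence": {}, "ExactMatch": {}}

# "replace": only the prompt's evaluators run; an explicit empty map
# means no evaluators run for that prompt (the CLI logs a warning).
replaced = dict(prompt_cfg)
assert "Coherence" not in replaced
```

Threshold resolution follows the same precedence: an explicit `threshold` in an evaluator's options wins, otherwise `get_evaluator_threshold` falls back to the registry's `default_threshold`.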
package/src/clients/cli/generate_report.py

@@ -1,76 +1,75 @@
+import html as html_module
 import json
 import markdown
+from common import METRIC_IDS, pascal_case_to_title
 import pandas as pd
 from pathlib import Path
 
 def calculate_aggregate_statistics(results):
-    """Calculate aggregate statistics across all evaluation results.
+    """Calculate aggregate statistics across all evaluation results.
+
+    Scans ALL results (not just the first) to discover which metrics were used,
+    correctly handling per-prompt evaluator variation. Each metric reports
+    prompts_evaluated (how many prompts it actually ran on) and total_prompts.
+    """
     if not results:
         return {}
-
-    #
-
-
-
+
+    # Discover all metric keys across all results
+    all_metric_keys = set()
+    for result in results:
+        all_metric_keys.update(result.get('results', {}).keys())
+
     aggregates = {}
-
-    for
-
-
-
-        metric_name = metric_key[:-6]  # Remove '_score' suffix
-        metric_display_name = metric_name.replace('_', ' ').title()
-
+
+    for eval_name in sorted(all_metric_keys):
+        display_name = pascal_case_to_title(eval_name)
+        metric_id = METRIC_IDS.get(eval_name, eval_name.lower())
+
         scores = []
         pass_count = 0
         fail_count = 0
         threshold_value = None
-
+        prompts_evaluated = 0
+
         for result in results:
-            metric_data = result.get('results', {}).get(
-            if metric_data:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        if
-
-
-
-
-
-
-
-
-
-
-
-                    if threshold is not None and threshold_value is None:
-                        threshold_value = threshold
-
-            except (json.JSONDecodeError, ValueError, TypeError):
-                continue
-
-        if scores:
-            avg_score = sum(scores) / len(scores)
+            metric_data = result.get('results', {}).get(eval_name)
+            if metric_data is None:
+                continue  # This metric did not run for this prompt
+
+            prompts_evaluated += 1
+            try:
+                parsed_data = json.loads(metric_data) if isinstance(metric_data, str) else metric_data
+
+                score = parsed_data.get(metric_id)
+
+                result_status = parsed_data.get('result')
+
+                threshold = parsed_data.get('threshold')
+
+                if score is not None:
+                    scores.append(float(score))
+
+                if result_status:
+                    if str(result_status).lower() == 'pass':
+                        pass_count += 1
+                    elif str(result_status).lower() == 'fail':
+                        fail_count += 1
+
+                if threshold is not None and threshold_value is None:
+                    threshold_value = threshold
+
+            except (json.JSONDecodeError, ValueError, TypeError):
+                continue
+
+        if scores or pass_count > 0 or fail_count > 0:
+            avg_score = sum(scores) / len(scores) if scores else 0
             total_evaluated = pass_count + fail_count
             pass_rate = (pass_count / total_evaluated * 100) if total_evaluated > 0 else 0
-
-            aggregates[
+
+            aggregates[display_name] = {
                 'total_prompts': len(results),
+                'prompts_evaluated': prompts_evaluated,
                 'total_evaluated': total_evaluated,
                 'pass_count': pass_count,
                 'fail_count': fail_count,

@@ -79,7 +78,7 @@ def calculate_aggregate_statistics(results):
                 'threshold': threshold_value,
                 'scores': scores
             }
-
+
     return aggregates
 
 def parse_score(score_str):

@@ -100,9 +99,9 @@ def format_score(score):
 
 def extract_metric_rows(entry):
     """
-    Build generic metric rows from
+    Build generic metric rows from evaluation results.
    Each row has: Metric, Result, Score, Threshold, Reason.
-
+    Omits metrics that did not run (None values) for this prompt.
    """
    rows = []
 

@@ -112,29 +111,22 @@ def extract_metric_rows(entry):
                 return d[k]
         return ''
 
-
-
-
-
-
-
-        return
-    # fallback to top-level flat structure
-    for k, v in e.items():
-        if isinstance(k, str) and k.endswith('_score'):
-            yield k, v
-
-    for key, raw in iter_score_fields(entry):
-        metric_id = key[:-6]  # strip "_score"
+    results_container = entry.get('results', {}) if isinstance(entry, dict) else {}
+
+    for eval_name, raw in results_container.items():
+        if raw is None:
+            continue  # Skip metrics that did not run for this prompt
+
         metric_obj = parse_score(raw) if isinstance(raw, (str, bytes)) else (raw or {})
 
-        display_name =
+        display_name = pascal_case_to_title(eval_name)
+        metric_id = METRIC_IDS.get(eval_name, eval_name.lower())
 
         # Candidate key patterns inside the parsed metric object
-        score_val = pick(metric_obj, [metric_id
-        result_val = pick(metric_obj, [
-        threshold_val = pick(metric_obj, [
-        reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason'
+        score_val = pick(metric_obj, [metric_id])
+        result_val = pick(metric_obj, ['result'])
+        threshold_val = pick(metric_obj, ['threshold'])
+        reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason'])
 
         rows.append({
             'Metric': display_name,

@@ -145,7 +137,13 @@ def extract_metric_rows(entry):
         })
     return rows
 
-def
+def _escape(text):
+    """HTML-escape user-controlled content to prevent XSS."""
+    if text is None:
+        return ""
+    return html_module.escape(str(text))
+
+def generate_html_report(results, agent_name=None, agent_id=None, cli_version=None):
     html = [
         '<!DOCTYPE html>',
         '<html lang="en">',

@@ -172,29 +170,43 @@ def generate_html_report(results):
         ' .pass-rate-good { background: #fef9e7; color: #7d6608; font-weight: bold; }',
         ' .pass-rate-poor { background: #fadbd8; color: #b03a2e; font-weight: bold; }',
         ' .individual-results { margin-top: 3em; }',
+        ' .evaluator-badge { display: inline-block; padding: 2px 8px; margin: 2px; border-radius: 4px; font-size: 0.85em; background: #e8eaf6; color: #283593; }',
         ' </style>',
         '</head>',
         '<body>',
         ' <h1> M365 Copilot Agents Evaluation Report</h1>',
     ]
 
+    # Add metadata section
+    metadata_items = []
+    if agent_name:
+        metadata_items.append(f'<strong>Agent Name:</strong> {_escape(agent_name)}')
+    if agent_id:
+        metadata_items.append(f'<strong>Agent ID:</strong> {_escape(agent_id)}')
+    if cli_version:
+        metadata_items.append(f'<strong>CLI Version:</strong> {_escape(cli_version)}')
+    if metadata_items:
+        html.append(f' <p style="color: #666; font-size: 0.95em;">{" | ".join(metadata_items)}</p>')
+
     # Add aggregate statistics if multiple results
     if len(results) > 1:
         aggregates = calculate_aggregate_statistics(results)
         if aggregates:
            html.append('<div class="aggregate-section">')
            html.append(f'<h2> Aggregate Statistics ({len(results)} prompts evaluated)</h2>')
-
-            # Create aggregate table with same style as individual results
+
            html.append('<table>')
-            html.append('<tr><th>Metric</th><th>Pass Rate</th><th>Passed</th><th>Failed</th><th>Avg Score</th><th>Threshold</th></tr>')
-
+            html.append('<tr><th>Metric</th><th>Prompts</th><th>Pass Rate</th><th>Passed</th><th>Failed</th><th>Avg Score</th><th>Threshold</th></tr>')
+
            for metric_name, stats in aggregates.items():
                pass_rate_class = 'pass-rate-excellent' if stats['pass_rate'] >= 80 else 'pass-rate-good' if stats['pass_rate'] >= 60 else 'pass-rate-poor'
-                threshold_display = stats.get('threshold', 'N/A')
+                threshold_display = _escape(str(stats.get('threshold', 'N/A')))
+                prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                total_prompts = stats.get('total_prompts', len(results))
                html.append(f'''
                <tr>
-                    <td><strong>{metric_name}</strong></td>
+                    <td><strong>{_escape(metric_name)}</strong></td>
+                    <td>{prompts_evaluated}/{total_prompts}</td>
                    <td class="{pass_rate_class}">{stats['pass_rate']:.1f}%</td>
                    <td class="pass">{stats['pass_count']}</td>
                    <td class="fail">{stats['fail_count']}</td>

@@ -202,7 +214,7 @@ def generate_html_report(results):
                    <td>{threshold_display}</td>
                </tr>
                ''')
-
+
            html.append('</table>')
            html.append('</div>')
 

@@ -211,34 +223,42 @@ def generate_html_report(results):
    html.append('<h2> Individual Results</h2>')
 
    for idx, entry in enumerate(results, 1):
-        html.append(f'<h3>Prompt {idx}: {entry.get("prompt", "")}</h3>')
+        html.append(f'<h3>Prompt {idx}: {_escape(entry.get("prompt", ""))}</h3>')
+
+        # Show evaluator badges for this prompt
+        evaluators_ran = entry.get('evaluators_ran', [])
+        if evaluators_ran:
+            badges = ''.join(f'<span class="evaluator-badge">{_escape(e)}</span>' for e in evaluators_ran)
+            html.append(f'<p>Evaluators: {badges}</p>')
+
        html.append('<table>')
-        html.append('<tr><th>Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("response", ""))))
-        html.append('<tr><th>Expected Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("expected_response", ""))))
+        html.append('<tr><th>Response</th><td>{}</td></tr>'.format(markdown.markdown(_escape(entry.get("response", "")))))
+        html.append('<tr><th>Expected Response</th><td>{}</td></tr>'.format(markdown.markdown(_escape(entry.get("expected_response", "")))))
        html.append('</table>')
 
        score_rows = extract_metric_rows(entry)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if score_rows:
+            score_df = pd.DataFrame(score_rows)
+
+            def highlight_result(val):
+                lv = str(val).lower()
+                if lv == 'pass':
+                    return 'background-color: #d4edda; color: #155724;'
+                elif lv == 'fail':
+                    return 'background-color: #f8d7da; color: #721c24;'
+                return ''
+
+            score_html = (
+                score_df.style
+                .map(highlight_result, subset=['Result'])
+                .set_table_attributes('style="margin-top:1em;"')
+                .hide(axis="index")
+                .to_html()
+            )
+
+            html.append('<details open class="score-details"><summary>Show/Hide Evaluation Metric Scores</summary>')
+            html.append(score_html)
+            html.append('</details>')
 
        html.append(f'<p><em>Entry {idx} of {len(results)}</em></p>')
        if idx < len(results):  # Don't add HR after last item