@microsoft/m365-copilot-eval 1.5.0-preview.1 → 1.7.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -1
- package/package.json +4 -3
- package/schema/CHANGELOG.md +7 -0
- package/schema/v1/eval-document.schema.json +144 -333
- package/schema/v1/examples/invalid/error-result-with-score.json +16 -0
- package/schema/v1/examples/invalid/missing-error-on-error.json +13 -0
- package/schema/v1/examples/valid/multi-turn-output.json +2 -0
- package/schema/v1/examples/valid/scenarios-with-mixed-errors.json +239 -0
- package/schema/version.json +1 -1
- package/src/clients/cli/api_clients/A2A/a2a_client.py +57 -10
- package/src/clients/cli/auth/auth_handler.py +21 -1
- package/src/clients/cli/common.py +8 -14
- package/src/clients/cli/error_messages.py +91 -0
- package/src/clients/cli/evaluation_runner.py +108 -97
- package/src/clients/cli/evaluator_resolver.py +8 -33
- package/src/clients/cli/generate_report.py +125 -96
- package/src/clients/cli/main.py +2 -1
- package/src/clients/cli/readme.md +1 -1
- package/src/clients/cli/result_writer.py +129 -110
- package/src/clients/cli/status_derivation.py +91 -0
- package/src/clients/node-js/bin/runevals.js +31 -9
- package/src/clients/node-js/config/default.js +1 -1
- package/src/clients/node-js/lib/env-loader.js +20 -13
- package/src/clients/node-js/lib/python-runtime.js +137 -65
- package/src/clients/node-js/lib/venv-manager.js +3 -2
- package/src/clients/node-js/lib/version-check.js +268 -0
package/src/clients/cli/evaluation_runner.py

@@ -1,6 +1,5 @@
 """Core evaluation pipeline — evaluator dispatch, retry, parallel execution."""

-import json
 import time
 from dataclasses import dataclass, field
 from enum import Enum
@@ -27,8 +26,6 @@ from common import (
     CITATIONS,
     EXACT_MATCH,
     PARTIAL_MATCH,
-    REQUIRES_AZURE_OPENAI,
-    REQUIRES_TOOL_DEFINITIONS,
     METRIC_IDS,
     MAX_ATTEMPTS,
     MAX_CONCURRENCY,
@@ -37,17 +34,17 @@ from common import (
     STATUS_FAIL,
     STATUS_ERROR,
     STATUS_PARTIAL,
-    STATUS_UNKNOWN,
     MAX_TURNS_PER_THREAD,
     LONG_THREAD_WARNING_THRESHOLD,
     RunConfig,
 )
+from error_messages import agent_request_failed, evaluator_failed, turn_skipped
+from status_derivation import rollup_thread_status, status_for_response
 from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
 from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
 from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
 from evaluator_resolver import (
     validate_evaluator_names,
-    check_prerequisites,
     resolve_evaluators_for_prompt,
     get_evaluator_threshold,
 )
@@ -113,7 +110,12 @@ def detect_item_type(item: dict) -> ItemType:


 def _decorate_metric(metric_id: str, data, threshold: Optional[float] = None) -> Dict[str, Any]:
-    """Augment raw evaluator output with standardized threshold + pass/fail result.
+    """Augment raw evaluator output with standardized threshold + pass/fail result.
+
+    Raises ValueError if the SDK returned a malformed result (no numeric score
+    under ``metric_id``). The outer try/except in :func:`_run_evaluators_for_item`
+    catches it and emits a standard ``evaluator_failed`` error entry.
+    """
     pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
     payload = {}
     if isinstance(data, dict):
@@ -125,12 +127,12 @@ def _decorate_metric(metric_id: str, data, threshold: Optional[float] = None) ->
     if isinstance(data, dict):
         if metric_id in data:
             score_val = data[metric_id]
-            if isinstance(score_val, (int, float)):
-
-
-
-
-
+            if not isinstance(score_val, (int, float)):
+                raise ValueError(
+                    f"non-numeric score from evaluator (metric_id={metric_id!r}, score={score_val!r})"
+                )
+            payload['threshold'] = pass_threshold
+            payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
     return payload


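For orientation, the reworked `_decorate_metric` contract can be exercised like this (illustrative values only; real payloads carry additional SDK fields, the flat import path is assumed from the sibling-module imports above, and the STATUS_PASS/STATUS_FAIL strings are assumed to be "pass"/"fail"):

```python
# Illustrative sketch of the contract shown in the hunk above; not package code.
from evaluation_runner import _decorate_metric   # assumed import path

raw = {"relevance": 4.0}                          # well-formed SDK result
decorated = _decorate_metric("relevance", raw, threshold=3)
assert decorated["threshold"] == 3
assert decorated["result"] == "pass"              # 4.0 >= 3 (assumes STATUS_PASS == "pass")

bad = {"relevance": "N/A"}                        # non-numeric score
# _decorate_metric("relevance", bad, threshold=3) now raises ValueError instead of
# silently skipping; _run_evaluators_for_item converts that into an error entry.
```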
@@ -141,36 +143,20 @@ def _run_evaluators_for_item(
     enhanced_response: Dict[str, Any],
     resolved_evaluators: Dict[str, Any],
     model_config: AzureOpenAIModelConfiguration,
-
-
-) -> Tuple[Dict[str, Optional[str]], List[str]]:
+    context_label: str = "",
+) -> Tuple[Dict[str, Dict[str, Any]], List[str]]:
     """Run resolved evaluators against a single item/turn.

-
+    Each value in results_dict is a decorated metric dict on success or an
+    errored entry ``{result: "error", error: "Evaluator failed: <exc.message>", threshold}``
+    on crash. The ``threshold`` is included on errored entries so the aggregate
+    report can still display it; the persisted ErroredScore shape strips it
+    out at write time (see ``_as_errored_score`` in result_writer).
     """
-
-        m365_agent_id and enhanced_response.get("tool_definitions")
-    )
-    available_context = {
-        REQUIRES_AZURE_OPENAI: has_azure_openai,
-        REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
-    }
-
-    results_dict: Dict[str, Optional[str]] = {}
+    results_dict: Dict[str, Dict[str, Any]] = {}
     evaluators_ran: List[str] = []

     for eval_name, eval_options in resolved_evaluators.items():
-        can_run, warn_msg = check_prerequisites(eval_name, available_context)
-        if not can_run:
-            if warn_msg:
-                emit_structured_log(
-                    "warning",
-                    f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
-                    operation=Operation.EVALUATE,
-                )
-            results_dict[eval_name] = None
-            continue
-
         threshold = get_evaluator_threshold(eval_name, eval_options)

         try:
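The `_as_errored_score` helper named in that docstring lives in result_writer.py (+129 -110 in the file list), which is not part of this excerpt. A minimal sketch of the strip it describes, assuming the persisted ErroredScore keeps only `result` and `error`:

```python
# Sketch only: inferred from the docstring above, not the shipped result_writer code.
def _as_errored_score(entry: dict) -> dict:
    """Reduce an in-memory errored entry to the persisted ErroredScore shape."""
    # In memory:  {"result": "error", "error": "Evaluator failed: ...", "threshold": 3}
    # Persisted:  threshold (and any score field) is dropped; the error text is kept verbatim.
    return {"result": entry["result"], "error": entry["error"]}
```

This lines up with the new invalid schema example `error-result-with-score.json` in the file list: an error result is not allowed to carry a score.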
@@ -215,33 +201,39 @@ def _run_evaluators_for_item(

             evaluators_ran.append(eval_name)
         except Exception as e:
+            # Full exception detail goes to the log stream (FR-009). Persisted
+            # output gets the scrubbed text from error_messages.evaluator_failed
+            # — exception.message only, never repr / class name / traceback.
+            where = f" on response for {context_label}" if context_label else ""
             emit_structured_log(
                 "error",
-                f"Evaluator '{eval_name}' crashed
+                f"Evaluator '{eval_name}' crashed{where}: {e}",
                 operation=Operation.EVALUATE,
             )
-
+            exc_msg = getattr(e, "message", None) or str(e)
+            results_dict[eval_name] = {
+                "result": STATUS_ERROR,
+                "error": evaluator_failed(exc_msg),
+                "threshold": threshold,
+            }

     return results_dict, evaluators_ran


-def
-    """
-
-
-
-
-        return False
-    return True
+def _collect_evaluator_results(results_dict: Dict[str, Dict[str, Any]]) -> List[str]:
+    """Extract per-evaluator ``result`` values (one of pass/fail/error) for status derivation."""
+    return [
+        d["result"] for d in results_dict.values()
+        if d.get("result") in (STATUS_PASS, STATUS_FAIL, STATUS_ERROR)
+    ]


 def _evaluate_multi_turn_responses(
     turns: List[Dict],
-    m365_agent_id: Optional[str],
     effective_log_level: str,
     default_evaluators: Dict[str, Any],
     model_config: AzureOpenAIModelConfiguration,
-
+    thread_name: str = "",
 ) -> Tuple[List[Dict], Dict]:
     """Run per-turn evaluations and build evaluated turn results with summary.

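`status_for_response` and `rollup_thread_status` come from the new status_derivation.py (+91 lines), which is not included in this excerpt; only their call sites are visible. One plausible sketch of the response-level derivation, consistent with those call sites (the branch ordering, the treatment of evaluator errors, and the error wording are all assumptions):

```python
# Plausible sketch only: the shipped rules live in status_derivation.py (not shown).
# Status strings are assumed to mirror common.STATUS_PASS/FAIL/ERROR/PARTIAL.
from typing import List, Optional, Tuple

def status_for_response(result_values: List[str]) -> Tuple[str, Optional[str]]:
    """Derive a response status, plus an error message when the status is 'error'."""
    if not result_values:
        return "error", "No evaluators produced a result."   # assumed wording
    if "error" in result_values:
        return "error", "One or more evaluators failed."     # assumed wording
    if all(v == "pass" for v in result_values):
        return "pass", None
    if all(v == "fail" for v in result_values):
        return "fail", None
    return "partial", None
```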
@@ -251,8 +243,6 @@ def _evaluate_multi_turn_responses(
     and optionally error. Does not mutate the input turns.
     """
     evaluated_turns: List[Dict] = []
-    turns_passed = 0
-    turns_failed = 0

     for i, turn in enumerate(turns):
         evaluated_turn: Dict[str, Any] = {
@@ -268,9 +258,10 @@ def _evaluate_multi_turn_responses(
         evaluated_turn["evaluators_mode"] = turn["evaluators_mode"]

         if turn.get("status") == STATUS_ERROR:
+            # Request-failure or downstream-skip turn — error already set upstream.
             evaluated_turn["status"] = STATUS_ERROR
-
-
+            if "error" in turn:
+                evaluated_turn["error"] = turn["error"]
             evaluated_turns.append(evaluated_turn)
             continue

@@ -282,16 +273,22 @@ def _evaluate_multi_turn_responses(
             turn.get("prompt", ""), default_evaluators,
         )

+        thread_part = f" of '{thread_name}'" if thread_name else ""
+        turn_label = f"turn {i + 1}/{len(turns)}{thread_part}"
         results_dict, evaluators_ran = _run_evaluators_for_item(
             turn.get("prompt", ""), actual_response, turn.get("expected_response", ""),
-            enhanced_response, resolved, model_config,
+            enhanced_response, resolved, model_config,
+            context_label=turn_label,
         )

-
+        evaluator_result_values = _collect_evaluator_results(results_dict)
+        status, error_obj = status_for_response(evaluator_result_values)

         evaluated_turn["results"] = results_dict
         evaluated_turn["evaluators_ran"] = evaluators_ran
-        evaluated_turn["status"] =
+        evaluated_turn["status"] = status
+        if error_obj is not None:
+            evaluated_turn["error"] = error_obj

         if effective_log_level == "debug":
             emit_structured_log(
@@ -302,26 +299,17 @@ def _evaluate_multi_turn_responses(
                 operation=Operation.EVALUATE,
             )

-        if all_passed:
-            turns_passed += 1
-        else:
-            turns_failed += 1
-
         evaluated_turns.append(evaluated_turn)

-
-
-        overall_status = STATUS_PASS
-    elif turns_failed == turns_total:
-        overall_status = STATUS_FAIL
-    else:
-        overall_status = STATUS_PARTIAL
-
+    turn_statuses = [t.get("status", STATUS_ERROR) for t in evaluated_turns]
+    turns_total = len(evaluated_turns)
     summary = {
         "turns_total": turns_total,
-        "turns_passed":
-        "turns_failed":
-        "
+        "turns_passed": sum(1 for s in turn_statuses if s == STATUS_PASS),
+        "turns_failed": sum(1 for s in turn_statuses if s == STATUS_FAIL),
+        "turns_partial": sum(1 for s in turn_statuses if s == STATUS_PARTIAL),
+        "turns_errored": sum(1 for s in turn_statuses if s == STATUS_ERROR),
+        "overall_status": rollup_thread_status(turn_statuses),
     }

     return evaluated_turns, summary
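`rollup_thread_status` replaces the inline pass/fail/partial rollup removed above and additionally has to account for errored turns. Its body is in status_derivation.py, not in this diff; a sketch consistent with the removed logic and with the all-errored worker-crash path later in this file (every other rule is an assumption):

```python
# Plausible sketch only: actual rollup rules are in status_derivation.py (not shown).
from typing import List

def rollup_thread_status(turn_statuses: List[str]) -> str:
    """Collapse per-turn statuses into a thread-level overall_status."""
    if turn_statuses and all(s == "error" for s in turn_statuses):
        return "error"      # matches the worker-crash path, where every turn errors
    if turn_statuses and all(s == "pass" for s in turn_statuses):
        return "pass"       # old rule: all turns passed
    if turn_statuses and all(s == "fail" for s in turn_statuses):
        return "fail"       # old rule: all turns failed
    return "partial"        # old rule: mixed outcomes (assumed to absorb errors too)
```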
@@ -330,10 +318,8 @@ def _evaluate_multi_turn_responses(
 def _evaluate_single_response(
     enhanced_response: Dict[str, Any],
     eval_item: Dict,
-    m365_agent_id: Optional[str],
     effective_log_level: str,
     model_config: AzureOpenAIModelConfiguration,
-    has_azure_openai: bool,
     default_evaluators: Dict[str, Any],
 ) -> Dict[str, Any]:
     """Run all evaluators for a single prompt/response pair and return the result dict."""
@@ -348,10 +334,14 @@ def _evaluate_single_response(

     results_dict, evaluators_ran = _run_evaluators_for_item(
         prompt, actual_response_text, expected_response, enhanced_response,
-        resolved, model_config,
+        resolved, model_config,
+        context_label=f"prompt '{prompt[:60]}'" if prompt else "",
     )

-
+    evaluator_result_values = _collect_evaluator_results(results_dict)
+    status, error_obj = status_for_response(evaluator_result_values)
+
+    evaluation_result: Dict[str, Any] = {
         "prompt": prompt,
         "response": enhanced_response.get(
             "display_response_text", actual_response_text
@@ -359,7 +349,10 @@
         "expected_response": expected_response,
         "evaluators_ran": evaluators_ran,
         "results": results_dict,
+        "status": status,
     }
+    if error_obj is not None:
+        evaluation_result["error"] = error_obj

     if "evaluators" in eval_item:
         evaluation_result["evaluators"] = eval_item["evaluators"]
@@ -485,7 +478,7 @@ def run_pipeline(
                     "evaluators_ran": [],
                     "results": {},
                     "status": STATUS_ERROR,
-                    "
+                    "error": agent_request_failed(getattr(exc, "message", None) or str(exc)),
                 }

            delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
@@ -493,9 +486,8 @@

        # Phase B: Evaluate response
        return _evaluate_single_response(
-            response, eval_item, config.
-            pipeline.model_config, pipeline.
-            pipeline.default_evaluators,
+            response, eval_item, config.effective_log_level,
+            pipeline.model_config, pipeline.default_evaluators,
        )

    def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
@@ -522,7 +514,7 @@
         conversation_context = None
         conversation_id = None
         enriched_turns: List[Dict[str, Any]] = []
-
+        failure_exception: Optional[Exception] = None

         for i, turn in enumerate(turns):
             prompt = turn["prompt"]
@@ -557,28 +549,34 @@
                         continue

                     # All other errors: stop the thread
+                    if status == 429:
+                        note = ""  # 429 retries were exhausted; the attempt count is enough.
+                    else:
+                        status_part = f"HTTP {status}" if status else "this error"
+                        note = f" ({status_part} is not retried in multi-turn to avoid duplicate turns in the conversation)"
                     emit_structured_log(
                         "error",
-                        f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
+                        f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s){note}: {exc}",
                         operation=Operation.SEND_PROMPT,
                     )
-
+                    failure_exception = exc
                     break

-            if
-            #
+            if failure_exception is not None:
+                # Failing turn carries the cause; downstream turns are skipped.
+                exc_msg = getattr(failure_exception, "message", None) or str(failure_exception)
                 enriched_turns.append({
                     **turn,
                     "response": "",
                     "status": STATUS_ERROR,
-                    "error":
+                    "error": agent_request_failed(exc_msg),
                 })
                 for j in range(i + 1, len(turns)):
                     enriched_turns.append({
                         **turns[j],
                         "response": "",
                         "status": STATUS_ERROR,
-                        "error":
+                        "error": turn_skipped(),
                     })
                 break

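`agent_request_failed`, `evaluator_failed`, and `turn_skipped` come from the new error_messages.py (+91 lines), which is not shown here. From the `_run_evaluators_for_item` docstring ("Evaluator failed: <exc.message>") and these call sites, they appear to be small message formatters; a sketch under that assumption (the agent-failure and skip wording is invented for illustration):

```python
# Sketch only: evaluator_failed's prefix is taken from the docstring earlier in this
# diff; the other two message texts are assumptions, not the shipped wording.
def evaluator_failed(message: str) -> str:
    return f"Evaluator failed: {message}"

def agent_request_failed(message: str) -> str:
    return f"Agent request failed: {message}"

def turn_skipped() -> str:
    return "Turn skipped: an earlier turn in this thread failed."
```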
@@ -596,10 +594,10 @@

         # Phase B: Run per-turn evaluations
         evaluated_turns, summary = _evaluate_multi_turn_responses(
-            enriched_turns, config.
+            enriched_turns, config.effective_log_level,
             pipeline.default_evaluators,
+            thread_name=thread_name,
             model_config=pipeline.model_config,
-            has_azure_openai=pipeline.has_azure_openai,
         )

         return {
@@ -621,21 +619,34 @@
         if wr.error:
             idx = wr.index
             item = eval_items[idx]
+            exc_msg = getattr(wr.error, "message", None) or str(wr.error)
+            cause_error = agent_request_failed(exc_msg)
             if item_types[idx] == ItemType.MULTI_TURN:
+                # Worker raised before any turn ran. Turn 1 carries the cause;
+                # remaining turns are downstream-skipped. All turns errored →
+                # thread overall_status="error".
+                turns = item.get("turns", [])
+                turn_dicts = []
+                for j, t in enumerate(turns):
+                    turn_dicts.append({
+                        **t,
+                        "response": "",
+                        "results": {},
+                        "status": STATUS_ERROR,
+                        "error": cause_error if j == 0 else turn_skipped(),
+                    })
                 ordered_results.append({
                     "type": "multi_turn",
                     "name": item.get("name", ""),
-                    "turns":
-                        {**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
-                        for t in item.get("turns", [])
-                    ],
+                    "turns": turn_dicts,
                     "summary": {
-                        "turns_total": len(
+                        "turns_total": len(turns),
                         "turns_passed": 0,
-                        "turns_failed":
-                        "
+                        "turns_failed": 0,
+                        "turns_partial": 0,
+                        "turns_errored": len(turns),
+                        "overall_status": STATUS_ERROR,
                     },
-                    "error": str(wr.error),
                 })
             else:
                 ordered_results.append({
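Concretely, following the construction above, a two-turn thread whose worker raised before any turn ran would be recorded roughly like this (illustrative values; the error texts depend on the error_messages helpers, which are not shown):

```python
# Illustrative shape only: mirrors the dict built in the hunk above for a two-turn thread.
errored_thread = {
    "type": "multi_turn",
    "name": "example-thread",                                    # hypothetical name
    "turns": [
        {"prompt": "...", "response": "", "results": {},
         "status": "error", "error": "<agent_request_failed(...) text>"},  # turn 1: the cause
        {"prompt": "...", "response": "", "results": {},
         "status": "error", "error": "<turn_skipped() text>"},             # turn 2: downstream skip
    ],
    "summary": {
        "turns_total": 2, "turns_passed": 0, "turns_failed": 0,
        "turns_partial": 0, "turns_errored": 2, "overall_status": "error",
    },
}
```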
@@ -645,7 +656,7 @@
                     "evaluators_ran": [],
                     "results": {},
                     "status": STATUS_ERROR,
-                    "
+                    "error": cause_error,
                 })
             else:
                 ordered_results.append(wr.value)

package/src/clients/cli/evaluator_resolver.py

@@ -6,7 +6,7 @@ with file-level defaults and system defaults, following extend/replace modes.

 import difflib
 import logging
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional

 from common import (
     RELEVANCE,
@@ -16,7 +16,6 @@ from common import (
     CITATIONS,
     EXACT_MATCH,
     PARTIAL_MATCH,
-    REQUIRES_AZURE_OPENAI,
     SYSTEM_DEFAULT_EVALUATORS,
     RegistryEntry,
 )
@@ -26,13 +25,13 @@ logger = logging.getLogger(__name__)

 # Static registry of available evaluators per data-model.md
 EVALUATOR_REGISTRY: Dict[str, RegistryEntry] = {
-    RELEVANCE: RegistryEntry(type="llm",
-    COHERENCE: RegistryEntry(type="llm",
-    GROUNDEDNESS: RegistryEntry(type="llm",
-    SIMILARITY: RegistryEntry(type="llm",
-    CITATIONS: RegistryEntry(type="non-llm",
-    EXACT_MATCH: RegistryEntry(type="non-llm",
-    PARTIAL_MATCH: RegistryEntry(type="non-llm",
+    RELEVANCE: RegistryEntry(type="llm", default_threshold=3),
+    COHERENCE: RegistryEntry(type="llm", default_threshold=3),
+    GROUNDEDNESS: RegistryEntry(type="llm", default_threshold=3),
+    SIMILARITY: RegistryEntry(type="llm", default_threshold=3),
+    CITATIONS: RegistryEntry(type="non-llm", default_threshold=1),
+    EXACT_MATCH: RegistryEntry(type="non-llm", default_threshold=None),
+    PARTIAL_MATCH: RegistryEntry(type="non-llm", default_threshold=0.5),
 }


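`RegistryEntry` is defined in common.py, which is not part of this excerpt. With `check_prerequisites` removed below and `requires` no longer consulted anywhere, the entry plausibly reduces to an evaluator type plus a default threshold; a sketch with the field names inferred from the keyword arguments above:

```python
# Sketch only: the real dataclass lives in common.py; fields are inferred from the
# RegistryEntry(...) calls in EVALUATOR_REGISTRY above.
from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)
class RegistryEntry:
    type: str                                   # "llm" or "non-llm"
    default_threshold: Optional[float] = None   # None: no threshold (e.g. exact_match)
```

Presumably `get_evaluator_threshold` falls back to `default_threshold` when a prompt or file-level override does not supply one, though that function is not shown in this diff.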
@@ -71,30 +70,6 @@ def validate_evaluator_names(evaluator_map: Dict[str, Any]) -> None:
     raise ValueError("\n".join(lines))


-def check_prerequisites(
-    evaluator_name: str,
-    available_context: Dict[str, bool],
-) -> Tuple[bool, Optional[str]]:
-    """Check if prerequisites for an evaluator are available.
-
-    Returns (True, None) if all prerequisites are met, or
-    (False, warning_message) if a prerequisite is missing.
-    """
-    registry_entry = EVALUATOR_REGISTRY.get(evaluator_name)
-    if not registry_entry:
-        return False, f"Unknown evaluator: {evaluator_name}"
-
-    for req in registry_entry.requires:
-        if not available_context.get(req, False):
-            msg = (
-                f"Skipping evaluator '{evaluator_name}': "
-                f"missing prerequisite '{req}'"
-            )
-            return False, msg
-
-    return True, None
-
-
 def resolve_default_evaluators(file_defaults: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
     """Resolve effective default evaluators, falling back to system defaults.
