@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.4.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +140 -101
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +8 -0
  4. package/schema/v1/eval-document.schema.json +256 -8
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/comprehensive.json +27 -2
  11. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  12. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  13. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  14. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  15. package/schema/version.json +2 -2
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
  18. package/src/clients/cli/api_clients/REST/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
  20. package/src/clients/cli/api_clients/__init__.py +3 -0
  21. package/src/clients/cli/api_clients/base_agent_client.py +78 -0
  22. package/src/clients/cli/cli_logging/__init__.py +0 -0
  23. package/src/clients/cli/cli_logging/console_diagnostics.py +107 -0
  24. package/src/clients/cli/cli_logging/logging_utils.py +144 -0
  25. package/src/clients/cli/common.py +62 -0
  26. package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
  27. package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
  28. package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
  29. package/src/clients/cli/evaluator_resolver.py +150 -0
  30. package/src/clients/cli/generate_report.py +347 -184
  31. package/src/clients/cli/main.py +1288 -481
  32. package/src/clients/cli/parallel_executor.py +57 -0
  33. package/src/clients/cli/readme.md +14 -7
  34. package/src/clients/cli/requirements.txt +1 -1
  35. package/src/clients/cli/response_extractor.py +30 -14
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +134 -41
  40. package/src/clients/node-js/config/default.js +5 -1
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +11 -16
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ from abc import ABC, abstractmethod
5
+ from datetime import datetime
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ import tzlocal
9
+
10
+
11
class BaseAgentClient(ABC):
    """Common interface implemented by every agent API client.

    Concrete clients must implement ``fetch_available_agents`` and
    ``send_prompt``. ``resolve_agent`` is an optional hook with a no-op
    default. Two cached static helpers report the local time zone.
    """

    @abstractmethod
    def fetch_available_agents(self) -> List[Dict[str, Any]]:
        """List the agents the configured user can access.

        Clients that cannot enumerate agents should return an empty list.
        """

    @abstractmethod
    def send_prompt(
        self,
        prompt: str,
        agent_id: str | None = None,
        conversation_context: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
        """Send one prompt and return the response plus conversation context.

        Pass ``conversation_context=None`` for single-turn usage or for the
        first turn of a conversation; for later turns pass the context that
        the previous call returned.

        Args:
            prompt: The prompt string to send.
            agent_id: Optional agent ID to target.
            conversation_context: Opaque context dict from a previous turn,
                or None for the first turn / single-turn usage.

        Returns:
            Tuple of (enhanced_response_dict, conversation_context).
            Hand the context to the next turn of a multi-turn conversation,
            or discard it for single-turn usage. Its structure is
            implementation-specific:
              - Sydney/REST: {"conversation_id": str}
              - A2A: {"context_id": str}
            The context is None when no conversation state is established.
        """

    def resolve_agent(self, agent_id: str) -> None:
        """Pre-resolve the agent endpoint; called once before the pipeline starts.

        The base implementation does nothing. Subclasses may override it to
        cache agent discovery.
        """
        return None

    @staticmethod
    @functools.lru_cache(maxsize=1)
    def _get_iana_timezone_name() -> str:
        """Return the local time zone name reported by ``tzlocal`` (cached)."""
        try:
            zone_name = tzlocal.get_localzone_name()
        except Exception:
            # Fallback path: stringify the zone object instead.
            # NOTE(review): presumably covers tzlocal builds where
            # get_localzone_name is unavailable or fails -- confirm.
            zone_name = str(tzlocal.get_localzone())
        return zone_name

    @staticmethod
    @functools.lru_cache(maxsize=1)
    def _get_location_info() -> Dict[str, Any]:
        """Return ``{"timeZoneOffset": hours, "timeZone": name}`` for this host.

        The result is computed once and cached, so every caller receives the
        same dict object and the offset is not refreshed afterwards.
        """
        local_now = datetime.now().astimezone()
        delta = local_now.utcoffset()
        if delta is None:
            whole_hours = 0
        else:
            # Floor division: fractional offsets round toward negative
            # infinity (e.g. +05:30 -> 5, -03:30 -> -4).
            whole_hours = int(delta.total_seconds() // 3600)
        return {
            "timeZoneOffset": whole_hours,
            "timeZone": BaseAgentClient._get_iana_timezone_name(),
        }
78
+
File without changes
@@ -0,0 +1,107 @@
1
+ import json
2
+ import logging
3
+ import sys
4
+ from collections import OrderedDict
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from cli_logging.logging_utils import (
8
+ STRUCTURED_LOG_FIELDS,
9
+ Operation,
10
+ format_structured_log_entry,
11
+ )
12
+
13
# ANSI escape sequences keyed by lower-case log level name. An empty string
# means the level is printed without any colouring.
_ANSI_COLORS = {
    "debug": "\033[2m",  # dim
    "info": "",  # default
    "warning": "\033[33m",  # yellow
    "error": "\033[31m",  # red
}
# Escape sequence appended after a coloured record to reset terminal attributes.
_ANSI_RESET = "\033[0m"
20
+
21
+
22
def format_diagnostic_record(record: Dict[str, Any]) -> OrderedDict:
    """Project *record* onto the structured-log fields, in canonical order.

    Only the fields named in STRUCTURED_LOG_FIELDS are kept. A field missing
    from *record* becomes None, except "is-redacted", which becomes False.
    """
    return OrderedDict(
        (name, record.get(name, False if name == "is-redacted" else None))
        for name in STRUCTURED_LOG_FIELDS
    )
28
+
29
+
30
def serialize_diagnostic_record(record: Dict[str, Any]) -> str:
    """Return *record* as a JSON string with fields in canonical order.

    Non-ASCII characters are written as-is rather than escaped.
    """
    canonical = format_diagnostic_record(record)
    return json.dumps(canonical, ensure_ascii=False)
32
+
33
+
34
def format_console_record(record: Dict[str, Any], max_message_length: int = 250) -> str:
    """Format a diagnostic record for human-readable TTY output with ANSI colors.

    The line has the shape ``[HH:MM:SS] LEVEL message (id=value | ...)`` and is
    wrapped in the colour escape configured for the record's level, if any.

    Args:
        record: Structured log record. The keys read are "timestamp", "level",
            "message", "request-id", "conversation-id" and "message-id".
            Missing keys and keys holding None are both tolerated.
        max_message_length: Messages longer than this are cut to this many
            characters and suffixed with an ellipsis.

    Returns:
        The formatted single-line string.
    """
    # Use `or` fallbacks so a key that is present but None behaves like a
    # missing key; calling len() on None would raise TypeError.
    timestamp = record.get("timestamp") or ""
    message = record.get("message") or ""
    level_name = record.get("level") or "info"

    # Extract HH:MM:SS from an ISO timestamp; shorter values are shown verbatim.
    time_part = timestamp[11:19] if len(timestamp) >= 19 else timestamp
    if len(message) > max_message_length:
        message = message[:max_message_length] + "…"

    # Only correlation ids that are actually set are appended.
    ids = [
        f"{key}={record[key]}"
        for key in ("request-id", "conversation-id", "message-id")
        if record.get(key)
    ]
    id_suffix = f" ({' | '.join(ids)})" if ids else ""

    color = _ANSI_COLORS.get(level_name.lower(), "")
    # No reset sequence is emitted when no colour was applied.
    reset = _ANSI_RESET if color else ""
    return f"{color}[{time_part}] {level_name.upper()} {message}{id_suffix}{reset}"
54
+
55
+
56
def render_diagnostic(record: Dict[str, Any]) -> str:
    """Render *record* for the console or as JSON.

    The human-readable coloured format is used when stdout is a terminal;
    otherwise the record is serialized to JSON.
    """
    formatter = format_console_record if sys.stdout.isatty() else serialize_diagnostic_record
    return formatter(record)
61
+
62
+
63
def emit_structured_log(
    level: str,
    message: str,
    operation: str = Operation.EVALUATE,
    *,
    logger: logging.Logger,
    diagnostic_records: Optional[List[Dict[str, Any]]] = None,
    run_context: Optional[Dict[str, Any]] = None,
) -> None:
    """Build a structured log entry, optionally record it, and log it.

    The entry is produced by format_structured_log_entry, appended to
    diagnostic_records when that list is supplied, and finally written to
    *logger* using render_diagnostic (TTY-friendly text or JSON).

    Args:
        level: One of "debug", "info", "warning", "error". Names that the
            logging module does not define fall back to INFO.
        message: Human-readable log message.
        operation: The CLI operation step (e.g. Operation.SEND_PROMPT). Only
            used when run_context is not supplied.
        logger: Logger to emit through.
        diagnostic_records: If provided, the structured entry is appended here.
        run_context: Full run context override (request-id, conversation-id,
            message-id). Defaults to nulls with the given operation.
    """
    numeric_level = getattr(logging, level.upper(), logging.INFO)
    collecting = diagnostic_records is not None

    # Nothing to do when nobody collects records and the logger would drop
    # the message anyway.
    if not collecting and not logger.isEnabledFor(numeric_level):
        return

    if run_context:
        context = run_context
    else:
        context = {
            "request-id": None,
            "conversation-id": None,
            "message-id": None,
            "operation": operation,
        }

    entry = format_structured_log_entry(
        level=level,
        message=message,
        logger_name=logger.name,
        run_context=context,
    )
    if collecting:
        diagnostic_records.append(entry)

    try:
        logger.log(numeric_level, render_diagnostic(entry))
    except Exception:
        # Best effort: a failure while rendering or logging is ignored.
        pass
@@ -0,0 +1,144 @@
1
+ import logging
2
+ import re
3
+ from datetime import datetime, timezone
4
+ from enum import Enum
5
+ from typing import Any, Dict, List, Optional, Tuple
6
+
7
+
8
class LogLevel(str, Enum):
    """Log level enum. Inherits from str so comparisons like level == "debug" work."""
    # Member values are the lower-case level names; ALLOWED_LOG_LEVELS and
    # LOG_LEVEL_MAP in this module are derived from these members.
    DEBUG = "debug"
    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
14
+
15
+
16
class Operation(str, Enum):
    """CLI operation steps for structured log entries."""
    # Each value is the kebab-case string form of the step. Because the enum
    # inherits from str, members can be used wherever a plain string is expected.
    SETUP = "setup"
    AUTHENTICATE = "authenticate"
    VALIDATE_ENV = "validate-env"
    LOAD_PROMPTS = "load-prompts"
    FETCH_AGENTS = "fetch-agents"
    SEND_PROMPT = "send-prompt"
    EVALUATE = "evaluate"
    WRITE_OUTPUT = "write-output"
26
+
27
+
28
# Accepted level strings, in LogLevel declaration order.
ALLOWED_LOG_LEVELS = tuple(level.value for level in LogLevel)
# Maps each LogLevel member to the matching numeric level of the stdlib
# logging module.
LOG_LEVEL_MAP = {
    LogLevel.DEBUG: logging.DEBUG,
    LogLevel.INFO: logging.INFO,
    LogLevel.WARNING: logging.WARNING,
    LogLevel.ERROR: logging.ERROR,
}

# Field names of a structured log record. The tuple order is the canonical
# field order for code that iterates it.
STRUCTURED_LOG_FIELDS = (
    "timestamp",
    "level",
    "operation",
    "request-id",
    "conversation-id",
    "message-id",
    "logger",
    "message",
    "is-redacted",
)
47
+
48
+
49
def normalize_log_level(value: Optional[str]) -> Optional[str]:
    """Return *value* stripped of surrounding whitespace and lower-cased.

    None is passed through unchanged.
    """
    return None if value is None else value.strip().lower()
53
+
54
+
55
def resolve_log_level(
    log_level_values: Optional[List[str]],
) -> Tuple[Optional[str], Optional[str]]:
    """Pick the effective log level from the values given for --log-level.

    Args:
        log_level_values: Every value supplied for the option, in order, or
            None / an empty list when the option was not given.

    Returns:
        ``(level, None)`` on success, where level is "info" when no value was
        supplied, or ``(None, error_message)`` when the chosen value is not
        one of the allowed levels.
    """
    if not log_level_values:
        return "info", None

    # Use the last value provided (aligns with Node.js wrapper behavior).
    candidate = normalize_log_level(log_level_values[-1])
    if candidate in ALLOWED_LOG_LEVELS:
        return candidate, None

    return (
        None,
        "Invalid value for --log-level. Supported values are: "
        "debug, info, warning, error.",
    )
72
+
73
+
74
def utc_iso_timestamp() -> str:
    """Return the current time as a timezone-aware UTC ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
76
+
77
+
78
def build_run_context(
    operation: str = "evaluate",
    request_id: Optional[str] = None,
    conversation_id: Optional[str] = None,
    message_id: Optional[str] = None,
) -> Dict[str, Optional[str]]:
    """Assemble the run-context dict attached to structured log entries.

    The returned dict uses the hyphenated keys "request-id",
    "conversation-id", "message-id" and "operation".
    """
    context: Dict[str, Optional[str]] = {}
    context["request-id"] = request_id
    context["conversation-id"] = conversation_id
    context["message-id"] = message_id
    context["operation"] = operation
    return context
90
+
91
+
92
+ _SECRET_PATTERNS = [
93
+ re.compile(r"(?i)(api[_-]?key\s*[:=]\s*)([^\s,;]+)"),
94
+ re.compile(r"(?i)(token\s*[:=]\s*)([^\s,;]+)"),
95
+ re.compile(r"(?i)(authorization\s*[:=]\s*bearer\s+)([^\s,;]+)"),
96
+ re.compile(r"(?i)(password\s*[:=]\s*)([^\s,;]+)"),
97
+ ]
98
+
99
+
100
+ def redact_sensitive_content(message: Optional[str]) -> Tuple[str, bool]:
101
+ if message is None:
102
+ return "", False
103
+
104
+ redacted = message
105
+ changed = False
106
+ for pattern in _SECRET_PATTERNS:
107
+ updated = pattern.sub(r"\1***REDACTED***", redacted)
108
+ if updated != redacted:
109
+ changed = True
110
+ redacted = updated
111
+
112
+ # Fallback: match strings 32+ chars containing mixed case and digits
113
+ # (likely a credential/token) that weren't already caught above.
114
+ if (
115
+ "***REDACTED***" not in redacted
116
+ and re.search(
117
+ r"(?=[A-Za-z0-9_\-]*[A-Z])(?=[A-Za-z0-9_\-]*[a-z])"
118
+ r"(?=[A-Za-z0-9_\-]*[0-9])[A-Za-z0-9_\-]{32,}",
119
+ redacted,
120
+ )
121
+ ):
122
+ return "[REDACTED]", True
123
+
124
+ return redacted, changed
125
+
126
+
127
def format_structured_log_entry(
    level: str,
    message: str,
    logger_name: str,
    run_context: Dict[str, Optional[str]],
) -> Dict[str, Any]:
    """Build one structured log record.

    The message is passed through redact_sensitive_content first. The level is
    normalized and falls back to "info" when empty. The timestamp is taken at
    call time, and the correlation ids and operation are copied from
    *run_context* (None when absent).

    Returns:
        Dict with the keys "level", "message", "logger", "timestamp",
        "request-id", "conversation-id", "message-id", "operation" and
        "is-redacted".
    """
    safe_message, is_redacted = redact_sensitive_content(message)
    entry: Dict[str, Any] = {
        "level": normalize_log_level(level) or "info",
        "message": safe_message,
        "logger": logger_name,
        "timestamp": utc_iso_timestamp(),
    }
    for key in ("request-id", "conversation-id", "message-id", "operation"):
        entry[key] = run_context.get(key)
    entry["is-redacted"] = is_redacted
    return entry
@@ -0,0 +1,62 @@
1
+ """Shared types and constants for the CLI."""
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+ from typing import List, Optional
6
+
7
+
8
def pascal_case_to_title(eval_name: str) -> str:
    """Convert PascalCase evaluator name to space-separated display name.

    A space is inserted at every word boundary:
      - after a lower-case letter or digit that is followed by an upper-case
        letter ("ToolCallAccuracy" → "Tool Call Accuracy", "F1Score" → "F1 Score");
      - between an acronym and the capitalized word that follows it
        ("HTTPServerError" → "HTTP Server Error").

    Names without such boundaries, including the empty string, are returned
    unchanged.
    """
    # Both alternatives are zero-width, so no characters are removed.
    return re.sub(
        r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])',
        ' ',
        eval_name,
    )
14
+
15
# Canonical evaluator name constants (PascalCase identifiers; these are the
# keys of METRIC_IDS below)
RELEVANCE = "Relevance"
COHERENCE = "Coherence"
GROUNDEDNESS = "Groundedness"
TOOL_CALL_ACCURACY = "ToolCallAccuracy"
CITATIONS = "Citations"
EXACT_MATCH = "ExactMatch"
PARTIAL_MATCH = "PartialMatch"

# Prerequisite constants (identifiers for things an evaluator may need)
REQUIRES_AZURE_OPENAI = "azure_openai"
REQUIRES_TOOL_DEFINITIONS = "tool_definitions"

# Evaluation status constants
# Outcome statuses (agent responded, evaluators ran):
STATUS_PASS = "pass"  # All evaluators scored above threshold
STATUS_FAIL = "fail"  # At least one evaluator scored below threshold
# Error state (evaluation couldn't complete):
STATUS_ERROR = "error"  # API call failed / response couldn't be obtained
# Thread-level aggregate status (multi-turn only):
STATUS_PARTIAL = "partial"  # Some turns passed, some did not
# Fallback for missing status:
STATUS_UNKNOWN = "unknown"

# System defaults when no file-level or env-level defaults are configured
SYSTEM_DEFAULT_EVALUATORS = [
    RELEVANCE,
    COHERENCE,
]


# Mapping from evaluator name to the key used in evaluator output dicts
# (snake_case form of the evaluator name)
METRIC_IDS = {
    RELEVANCE: "relevance",
    COHERENCE: "coherence",
    GROUNDEDNESS: "groundedness",
    TOOL_CALL_ACCURACY: "tool_call_accuracy",
    CITATIONS: "citations",
    EXACT_MATCH: "exact_match",
    PARTIAL_MATCH: "partial_match",
}
56
+
57
+
58
@dataclass
class RegistryEntry:
    """Static metadata describing one evaluator."""
    type: str  # "llm", "tool", or "non-llm"
    requires: List[str]  # prerequisite identifiers; empty when there are none
    default_threshold: Optional[float]  # None when no default threshold is defined
@@ -17,8 +17,8 @@ from typing import Dict, Any, Optional
17
17
  class CitationFormat(Enum):
18
18
  """Enum for different citation formats supported by the evaluator."""
19
19
  OAI_UNICODE = "oai_unicode" # New format: \ue200cite\ue202turn{X}search{Y}\ue201
20
- LEGACY_BRACKET = "legacy_bracket" # Old format: [^i^]
21
- AUTO = "auto" # Automatically detect both formats
20
+ LEGACY_BRACKET = "bracket" # Old format: [^i^]
21
+ AUTO = "mixed" # Automatically detect both formats
22
22
 
23
23
 
24
24
  class CitationsEvaluator:
@@ -141,7 +141,7 @@ class CitationsEvaluator:
141
141
 
142
142
  results = {
143
143
  "citation_format": self.citation_format.value,
144
- "score": total_citations,
144
+ "citations": total_citations,
145
145
  "result": "pass" if total_citations > 0 else "fail",
146
146
  "threshold": 1,
147
147
  "reason": " ".join(reason_parts)
@@ -1,8 +1,6 @@
1
- from azure.ai.evaluation import evaluate
2
-
3
1
  class ExactMatchEvaluator:
4
- def __init__(self):
5
- pass
2
+ def __init__(self, case_sensitive=False):
3
+ self.case_sensitive = case_sensitive
6
4
 
7
5
  def __call__(self, *, response: str, expected_answer: str, **kwargs):
8
6
  if response is None or response.strip() == "":
@@ -11,15 +9,17 @@ class ExactMatchEvaluator:
11
9
  if expected_answer is None:
12
10
  raise ValueError("Expected answer cannot be None.")
13
11
 
14
- # Case-sensitive exact match (mimics C# StringComparison.InvariantCulture)
15
- is_match = response.strip() == expected_answer.strip()
12
+ resp = response.strip()
13
+ exp = expected_answer.strip()
14
+
15
+ if not self.case_sensitive:
16
+ resp = resp.lower()
17
+ exp = exp.lower()
18
+
19
+ is_match = resp == exp
16
20
 
17
21
  return {
18
22
  "exact_match": 1.0 if is_match else 0.0,
19
- "exact_match_result": "pass" if is_match else "fail",
20
- "exact_match_threshold": 1.0,
23
+ "result": "pass" if is_match else "fail",
21
24
  "exact_match_reason": "Exact match found" if is_match else "No exact match found"
22
25
  }
23
-
24
-
25
- exact_match_evaluator = ExactMatchEvaluator()
@@ -1,5 +1,3 @@
1
- from azure.ai.evaluation import evaluate
2
-
3
1
  class PartialMatchEvaluator:
4
2
  def __init__(self, case_sensitive=False):
5
3
  self.case_sensitive = case_sensitive
@@ -25,15 +23,7 @@ class PartialMatchEvaluator:
25
23
  else:
26
24
  score = 0.0
27
25
 
28
- threshold = 0.5 # 50% match threshold
29
- is_pass = score >= threshold
30
-
31
26
  return {
32
27
  "partial_match": score,
33
- "partial_match_result": "pass" if is_pass else "fail",
34
- "partial_match_threshold": threshold,
35
- "partial_match_reason": f"Match score: {score:.3f} ({'above' if is_pass else 'below'} threshold {threshold})"
28
+ "partial_match_reason": f"Match score: {score:.3f}"
36
29
  }
37
-
38
-
39
- partial_match_evaluator = PartialMatchEvaluator(case_sensitive=False)
@@ -0,0 +1,150 @@
1
+ """Evaluator resolution module for per-prompt evaluator configuration.
2
+
3
+ Resolves which evaluators to run on each prompt by merging prompt-level config
4
+ with file-level defaults and system defaults, following extend/replace modes.
5
+ """
6
+
7
+ import difflib
8
+ import logging
9
+ from typing import Any, Dict, Optional, Tuple
10
+
11
+ from common import (
12
+ RELEVANCE,
13
+ COHERENCE,
14
+ GROUNDEDNESS,
15
+ TOOL_CALL_ACCURACY,
16
+ CITATIONS,
17
+ EXACT_MATCH,
18
+ PARTIAL_MATCH,
19
+ REQUIRES_AZURE_OPENAI,
20
+ REQUIRES_TOOL_DEFINITIONS,
21
+ SYSTEM_DEFAULT_EVALUATORS,
22
+ RegistryEntry,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
# Static registry of available evaluators per data-model.md
# Keys are the canonical evaluator names. Each entry records the evaluator
# category, the prerequisites it needs, and its default threshold (None for
# ExactMatch, which defines no default).
EVALUATOR_REGISTRY: Dict[str, RegistryEntry] = {
    RELEVANCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
    COHERENCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
    GROUNDEDNESS: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
    TOOL_CALL_ACCURACY: RegistryEntry(type="tool", requires=[REQUIRES_AZURE_OPENAI, REQUIRES_TOOL_DEFINITIONS], default_threshold=3),
    CITATIONS: RegistryEntry(type="non-llm", requires=[], default_threshold=1),
    EXACT_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=None),
    PARTIAL_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=0.5),
}
38
+
39
+
40
def validate_evaluator_names(evaluator_map: Dict[str, Any]) -> None:
    """Ensure every evaluator name in *evaluator_map* is registered.

    Raises:
        ValueError: If at least one name is unknown. The message lists each
            unknown name, adds a 'Did you mean?' hint when a close match
            exists, and ends with the valid names grouped by category.
    """
    unknown = [name for name in evaluator_map if name not in EVALUATOR_REGISTRY]
    if not unknown:
        return

    def names_of(kind: str) -> str:
        # Comma-separated registry names of one category, in registry order.
        return ", ".join(
            name for name, entry in EVALUATOR_REGISTRY.items() if entry.type == kind
        )

    report = []
    for bad_name in unknown:
        report.append(f'Unknown evaluator "{bad_name}".')
        suggestion = difflib.get_close_matches(
            bad_name, EVALUATOR_REGISTRY.keys(), n=1, cutoff=0.5
        )
        if suggestion:
            report.append(f'Did you mean "{suggestion[0]}"?')

    report.extend(
        [
            "",
            "Valid evaluators are:",
            f" - {names_of('llm')} (LLM-based)",
            f" - {names_of('tool')} (tool evaluation)",
            f" - {names_of('non-llm')} (non-LLM)",
        ]
    )
    raise ValueError("\n".join(report))
69
+
70
+
71
def check_prerequisites(
    evaluator_name: str,
    available_context: Dict[str, bool],
) -> Tuple[bool, Optional[str]]:
    """Report whether an evaluator's prerequisites are satisfied.

    Args:
        evaluator_name: Name to look up in the evaluator registry.
        available_context: Maps prerequisite identifiers to availability;
            an absent identifier counts as unavailable.

    Returns:
        ``(True, None)`` when every prerequisite is available, otherwise
        ``(False, message)`` naming the first missing prerequisite, or
        stating that the evaluator is unknown.
    """
    entry = EVALUATOR_REGISTRY.get(evaluator_name)
    if not entry:
        return False, f"Unknown evaluator: {evaluator_name}"

    for requirement in entry.requires:
        if available_context.get(requirement, False):
            continue
        return False, (
            f"Skipping evaluator '{evaluator_name}': "
            f"missing prerequisite '{requirement}'"
        )

    return True, None
93
+
94
+
95
def resolve_default_evaluators(file_defaults: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Return the default evaluator map to use for a run.

    File-level defaults take precedence over system defaults and are returned
    as given (the same object). An explicit empty dict therefore means "no
    default evaluators". Only when *file_defaults* is None are the system
    defaults used, each with empty options.
    """
    if file_defaults is None:
        # System defaults: every evaluator gets its own empty options dict.
        return {name: {} for name in SYSTEM_DEFAULT_EVALUATORS}

    # File-level defaults (including explicit empty dict)
    return file_defaults
107
+
108
+
109
def resolve_evaluators_for_prompt(
    prompt_evaluators: Optional[Dict[str, Any]],
    evaluators_mode: str,
    prompt: str,
    default_evaluators: Dict[str, Any],
) -> Dict[str, Any]:
    """Work out the evaluator map that applies to one prompt.

    Args:
        prompt_evaluators: Per-prompt evaluator config (None if not specified).
        evaluators_mode: "replace" uses only the prompt's evaluators; any
            other value is handled as "extend", i.e. defaults merged with the
            prompt's evaluators.
        prompt: The prompt text (only its first 80 characters are used, in a
            warning message).
        default_evaluators: Resolved default evaluators (from
            resolve_default_evaluators).

    Returns:
        A new dict mapping evaluator name to options. The option values are
        not copied.
    """
    # No prompt-level config: fall back to a copy of the defaults.
    if prompt_evaluators is None:
        return dict(default_evaluators)

    if evaluators_mode != "replace":
        # Extend: start from the defaults, prompt entries win on conflict.
        combined = dict(default_evaluators)
        combined.update(prompt_evaluators)
        return combined

    if not prompt_evaluators:
        logger.warning(
            "Empty evaluators with 'replace' mode for prompt: '%s'. "
            "No evaluators will run.",
            prompt[:80],
        )
    return dict(prompt_evaluators)
143
+
144
+
145
def get_evaluator_threshold(evaluator_name: str, options: Dict[str, Any]) -> Optional[float]:
    """Return the threshold that applies to an evaluator.

    A "threshold" key in *options* always wins, even when its value is None.
    Otherwise the registry default is returned, or None when the evaluator
    is not registered.
    """
    if "threshold" not in options:
        registered = EVALUATOR_REGISTRY.get(evaluator_name)
        return None if not registered else registered.default_threshold
    return options["threshold"]