@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.4.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -101
- package/package.json +7 -4
- package/schema/CHANGELOG.md +8 -0
- package/schema/v1/eval-document.schema.json +256 -8
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/comprehensive.json +27 -2
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
- package/src/clients/cli/api_clients/REST/__init__.py +3 -0
- package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +78 -0
- package/src/clients/cli/cli_logging/__init__.py +0 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +107 -0
- package/src/clients/cli/cli_logging/logging_utils.py +144 -0
- package/src/clients/cli/common.py +62 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
- package/src/clients/cli/evaluator_resolver.py +150 -0
- package/src/clients/cli/generate_report.py +347 -184
- package/src/clients/cli/main.py +1288 -481
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/readme.md +14 -7
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +30 -14
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +134 -41
- package/src/clients/node-js/config/default.js +5 -1
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +11 -16
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import tzlocal
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseAgentClient(ABC):
    """Abstract base class for agent API clients.

    Subclasses must implement ``fetch_available_agents`` and ``send_prompt``;
    ``resolve_agent`` is an optional hook. Two static helpers describe the
    local timezone.
    """

    @abstractmethod
    def fetch_available_agents(self) -> List[Dict[str, Any]]:
        """Return the list of agents accessible to the configured user.

        Implementations that do not support agent enumeration should
        return an empty list.
        """

    @abstractmethod
    def send_prompt(
        self,
        prompt: str,
        agent_id: str | None = None,
        conversation_context: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
        """Send a single prompt and return the response with conversation context.

        For single-turn usage, pass conversation_context=None.
        For multi-turn usage, pass the context returned from the previous turn.

        Args:
            prompt: The prompt string to send.
            agent_id: Optional agent ID to target.
            conversation_context: Opaque context dict from a previous turn,
                or None for the first turn / single-turn usage.

        Returns:
            Tuple of (enhanced_response_dict, conversation_context).
            The conversation_context should be passed to the next turn
            in a multi-turn conversation, or discarded for single-turn.
            The context structure is implementation-specific:
              - Sydney/REST: {"conversation_id": str}
              - A2A: {"context_id": str}
            Returns None as context when no conversation state is established.
        """

    def resolve_agent(self, agent_id: str) -> None:
        """Pre-resolve agent endpoint. Called once before pipeline starts.

        Default is no-op. Subclasses may override to cache agent discovery.
        """

    @staticmethod
    @functools.lru_cache(maxsize=1)
    def _get_iana_timezone_name() -> str:
        """Return the local timezone name reported by ``tzlocal`` (cached)."""
        try:
            return tzlocal.get_localzone_name()
        except Exception:
            # Fall back to the string form of the zone object when the
            # name lookup itself fails.
            return str(tzlocal.get_localzone())

    @staticmethod
    def _get_location_info() -> Dict[str, Any]:
        """Return the local UTC offset (whole hours) and timezone name.

        The dict is rebuilt on every call rather than memoised: a cached
        dict would be one shared mutable object handed to every caller, and
        the offset would stay frozen at its first value even if the local
        UTC offset later changes (e.g. a DST transition). Only the timezone
        name lookup remains cached.

        Returns:
            ``{"timeZoneOffset": int, "timeZone": str}``; the offset is the
            floor of the UTC offset in hours, or 0 when no offset is known.
        """
        now = datetime.now().astimezone()
        utc_offset = now.utcoffset()
        offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
        return {
            "timeZoneOffset": offset_hours,
            "timeZone": BaseAgentClient._get_iana_timezone_name(),
        }
|
|
78
|
+
|
|
File without changes
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import sys
|
|
4
|
+
from collections import OrderedDict
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
from cli_logging.logging_utils import (
|
|
8
|
+
STRUCTURED_LOG_FIELDS,
|
|
9
|
+
Operation,
|
|
10
|
+
format_structured_log_entry,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
# ANSI escape prefixes keyed by lowercase log level; an empty string means
# the terminal's default rendering is used.
_ANSI_COLORS = dict(
    debug="\033[2m",  # dim
    info="",  # default
    warning="\033[33m",  # yellow
    error="\033[31m",  # red
)
# Escape sequence that restores default terminal attributes.
_ANSI_RESET = "\033[0m"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def format_diagnostic_record(record: Dict[str, Any]) -> OrderedDict:
    """Project *record* onto the structured-log fields, in canonical order.

    Fields missing from *record* default to ``None``, except ``"is-redacted"``
    which defaults to ``False``. Keys outside STRUCTURED_LOG_FIELDS are dropped.
    """
    return OrderedDict(
        (name, record.get(name, False if name == "is-redacted" else None))
        for name in STRUCTURED_LOG_FIELDS
    )
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def serialize_diagnostic_record(record: Dict[str, Any]) -> str:
    """Return *record* as a JSON string with fields in canonical order.

    Non-ASCII characters are emitted as-is rather than escaped.
    """
    ordered_fields = format_diagnostic_record(record)
    return json.dumps(ordered_fields, ensure_ascii=False)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def format_console_record(record: Dict[str, Any], max_message_length: int = 250) -> str:
    """Format a diagnostic record for human-readable TTY output with ANSI colors.

    Args:
        record: Diagnostic record; the keys read are "timestamp", "level",
            "message", "request-id", "conversation-id" and "message-id".
        max_message_length: Messages longer than this are cut to this many
            characters and suffixed with an ellipsis.

    Returns:
        A single line of the form ``[HH:MM:SS] LEVEL message (ids)``, wrapped
        in a color escape and reset when the level has a color assigned.
    """
    # dict.get() only applies its default when the key is absent. Records may
    # carry an explicit None for any field, so normalise with `or` to avoid
    # calling len() on None.
    ts = record.get("timestamp") or ""
    # Extract HH:MM:SS from ISO timestamp
    time_part = ts[11:19] if len(ts) >= 19 else ts

    level_name = record.get("level") or "info"

    message = record.get("message") or ""
    if len(message) > max_message_length:
        message = message[:max_message_length] + "…"

    # Only correlation ids that are actually set are shown.
    ids = []
    for key in ("request-id", "conversation-id", "message-id"):
        val = record.get(key)
        if val:
            ids.append(f"{key}={val}")
    id_suffix = f" ({' | '.join(ids)})" if ids else ""

    color = _ANSI_COLORS.get(level_name.lower(), "")
    # No reset sequence is needed when nothing was colored.
    reset = _ANSI_RESET if color else ""
    return f"{color}[{time_part}] {level_name.upper()} {message}{id_suffix}{reset}"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def render_diagnostic(record: Dict[str, Any]) -> str:
    """Render *record* as colored console text on a TTY, otherwise as JSON.

    The choice is made by checking whether ``sys.stdout`` is a terminal.
    """
    formatter = format_console_record if sys.stdout.isatty() else serialize_diagnostic_record
    return formatter(record)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def emit_structured_log(
    level: str,
    message: str,
    operation: str = Operation.EVALUATE,
    *,
    logger: logging.Logger,
    diagnostic_records: Optional[List[Dict[str, Any]]] = None,
    run_context: Optional[Dict[str, Any]] = None,
) -> None:
    """Build a structured log entry, optionally record it, and log it.

    The entry is produced by format_structured_log_entry, appended to
    diagnostic_records when that list is supplied, and then written through
    *logger* using render_diagnostic (TTY-friendly or JSON).

    Args:
        level: One of "debug", "info", "warning", "error".
        message: Human-readable log message.
        operation: The CLI operation step (e.g. Operation.SEND_PROMPT).
            Only used when run_context is not supplied.
        logger: Logger to emit through.
        diagnostic_records: If provided, the structured entry is appended here.
        run_context: Full run context override (request-id, conversation-id,
            message-id). Defaults to nulls with the given operation.
    """
    # Unrecognised level names fall back to INFO.
    numeric_level = getattr(logging, level.upper(), logging.INFO)

    # Nothing to do when the entry would neither be recorded nor logged.
    wants_record = diagnostic_records is not None
    if not wants_record and not logger.isEnabledFor(numeric_level):
        return

    if run_context:
        context = run_context
    else:
        context = {
            "request-id": None,
            "conversation-id": None,
            "message-id": None,
            "operation": operation,
        }

    entry = format_structured_log_entry(
        level=level,
        message=message,
        logger_name=logger.name,
        run_context=context,
    )
    if wants_record:
        diagnostic_records.append(entry)

    try:
        logger.log(numeric_level, render_diagnostic(entry))
    except Exception:
        # Best-effort: a failure while rendering or logging is ignored.
        pass
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class LogLevel(str, Enum):
    """Supported log levels.

    Members are also ``str`` instances, so a member compares equal to its
    lowercase name (e.g. ``LogLevel.DEBUG == "debug"``).
    """

    DEBUG = "debug"
    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Operation(str, Enum):
    """Names of the CLI operation steps recorded in structured log entries.

    Members are also ``str`` instances whose values are kebab-case labels.
    """

    SETUP = "setup"
    AUTHENTICATE = "authenticate"
    VALIDATE_ENV = "validate-env"
    LOAD_PROMPTS = "load-prompts"
    FETCH_AGENTS = "fetch-agents"
    SEND_PROMPT = "send-prompt"
    EVALUATE = "evaluate"
    WRITE_OUTPUT = "write-output"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Lowercase level names accepted by the CLI, in LogLevel declaration order.
ALLOWED_LOG_LEVELS = tuple(member.value for member in LogLevel)

# Translation from LogLevel members to the stdlib logging numeric levels.
LOG_LEVEL_MAP = {
    LogLevel.DEBUG: logging.DEBUG,
    LogLevel.INFO: logging.INFO,
    LogLevel.WARNING: logging.WARNING,
    LogLevel.ERROR: logging.ERROR,
}

# Field names of a structured log entry, in their canonical output order.
STRUCTURED_LOG_FIELDS = (
    "timestamp",
    "level",
    "operation",
    "request-id",
    "conversation-id",
    "message-id",
    "logger",
    "message",
    "is-redacted",
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def normalize_log_level(value: Optional[str]) -> Optional[str]:
    """Return *value* stripped of surrounding whitespace and lowercased.

    ``None`` is passed through unchanged.
    """
    return None if value is None else value.strip().lower()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def resolve_log_level(
    log_level_values: Optional[List[str]],
) -> Tuple[Optional[str], Optional[str]]:
    """Pick the effective log level from the supplied --log-level values.

    Args:
        log_level_values: All values given for the option, or None/empty
            when the option was not used.

    Returns:
        ``(level, None)`` on success, where level is "info" when no value was
        given, otherwise the normalized last value. ``(None, error_message)``
        when the last value is not an allowed level.
    """
    if not log_level_values:
        return "info", None

    # Use the last value provided (aligns with Node.js wrapper behavior).
    candidate = normalize_log_level(log_level_values[-1])
    if candidate in ALLOWED_LOG_LEVELS:
        return candidate, None

    error = (
        "Invalid value for --log-level. Supported values are: "
        "debug, info, warning, error."
    )
    return None, error
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def utc_iso_timestamp() -> str:
    """Return the current time as a timezone-aware UTC ISO 8601 string."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def build_run_context(
    operation: str = "evaluate",
    request_id: Optional[str] = None,
    conversation_id: Optional[str] = None,
    message_id: Optional[str] = None,
) -> Dict[str, Optional[str]]:
    """Assemble the run-context dict attached to structured log entries.

    Returns:
        A dict with the keys "request-id", "conversation-id", "message-id"
        and "operation" (in that order), holding the given arguments.
    """
    context: Dict[str, Optional[str]] = {}
    context["request-id"] = request_id
    context["conversation-id"] = conversation_id
    context["message-id"] = message_id
    context["operation"] = operation
    return context
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Keyed secrets: group 1 is the label (kept), group 2 is the value (replaced).
_SECRET_PATTERNS = [
    re.compile(r"(?i)(api[_-]?key\s*[:=]\s*)([^\s,;]+)"),
    re.compile(r"(?i)(token\s*[:=]\s*)([^\s,;]+)"),
    re.compile(r"(?i)(authorization\s*[:=]\s*bearer\s+)([^\s,;]+)"),
    re.compile(r"(?i)(password\s*[:=]\s*)([^\s,;]+)"),
]

# A run of 32+ token characters containing an uppercase letter, a lowercase
# letter and a digit (likely a credential/token).
_HIGH_ENTROPY_TOKEN = re.compile(
    r"(?=[A-Za-z0-9_\-]*[A-Z])(?=[A-Za-z0-9_\-]*[a-z])"
    r"(?=[A-Za-z0-9_\-]*[0-9])[A-Za-z0-9_\-]{32,}"
)


def redact_sensitive_content(message: Optional[str]) -> Tuple[str, bool]:
    """Mask likely secrets in *message*.

    Values following an api-key / token / authorization-bearer / password
    label are replaced with ``***REDACTED***``. If a long mixed-character
    token still remains afterwards, the whole message is replaced with
    ``[REDACTED]``.

    Args:
        message: Text to sanitise, or None.

    Returns:
        ``(safe_message, was_changed)``; ``("", False)`` for a None message.
    """
    if message is None:
        return "", False

    redacted = message
    changed = False
    for pattern in _SECRET_PATTERNS:
        updated = pattern.sub(r"\1***REDACTED***", redacted)
        if updated != redacted:
            changed = True
            redacted = updated

    # Fallback for bare credentials that carry no recognisable label. It runs
    # even when a keyed pattern already matched, since a message can hold both
    # a labelled secret and an unlabelled one. Placeholders are blanked out
    # with a space first so they neither count as a token nor glue their
    # neighbours together into one.
    residual = redacted.replace("***REDACTED***", " ")
    if _HIGH_ENTROPY_TOKEN.search(residual):
        return "[REDACTED]", True

    return redacted, changed
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def format_structured_log_entry(
    level: str,
    message: str,
    logger_name: str,
    run_context: Dict[str, Optional[str]],
) -> Dict[str, Any]:
    """Build one structured log entry as a plain dict.

    The message is passed through redact_sensitive_content, the level is
    normalized (falling back to "info"), and a UTC timestamp is attached.

    Args:
        level: Log level name.
        message: Raw log message.
        logger_name: Name of the emitting logger.
        run_context: Source of "request-id", "conversation-id", "message-id"
            and "operation"; missing keys become None.

    Returns:
        The entry dict, including an "is-redacted" flag.
    """
    safe_message, is_redacted = redact_sensitive_content(message)
    entry: Dict[str, Any] = {
        "level": normalize_log_level(level) or "info",
        "message": safe_message,
        "logger": logger_name,
        "timestamp": utc_iso_timestamp(),
    }
    # Correlation fields are copied straight from the run context.
    for key in ("request-id", "conversation-id", "message-id", "operation"):
        entry[key] = run_context.get(key)
    entry["is-redacted"] = is_redacted
    return entry
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Shared types and constants for the CLI."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def pascal_case_to_title(eval_name: str) -> str:
    """Insert a space at every lowercase-to-uppercase boundary in *eval_name*.

    e.g., "ToolCallAccuracy" → "Tool Call Accuracy"
    """
    # Zero-width split points: preceded by a-z, followed by A-Z.
    words = re.split(r'(?<=[a-z])(?=[A-Z])', eval_name)
    return ' '.join(words)
|
|
14
|
+
|
|
15
|
+
# --- Canonical evaluator names -------------------------------------------
RELEVANCE = "Relevance"
COHERENCE = "Coherence"
GROUNDEDNESS = "Groundedness"
TOOL_CALL_ACCURACY = "ToolCallAccuracy"
CITATIONS = "Citations"
EXACT_MATCH = "ExactMatch"
PARTIAL_MATCH = "PartialMatch"

# --- Prerequisite identifiers --------------------------------------------
REQUIRES_AZURE_OPENAI = "azure_openai"
REQUIRES_TOOL_DEFINITIONS = "tool_definitions"

# --- Evaluation statuses --------------------------------------------------
# Outcomes when the agent responded and evaluators ran:
STATUS_PASS = "pass"  # All evaluators scored above threshold
STATUS_FAIL = "fail"  # At least one evaluator scored below threshold
# Evaluation could not complete:
STATUS_ERROR = "error"  # API call failed / response couldn't be obtained
# Thread-level aggregate, multi-turn only:
STATUS_PARTIAL = "partial"  # Some turns passed, some did not
# Used when no status is present:
STATUS_UNKNOWN = "unknown"

# Evaluators applied when neither file-level nor env-level defaults exist.
SYSTEM_DEFAULT_EVALUATORS = [RELEVANCE, COHERENCE]


# Evaluator name -> key under which that evaluator reports its score.
METRIC_IDS = {
    RELEVANCE: "relevance",
    COHERENCE: "coherence",
    GROUNDEDNESS: "groundedness",
    TOOL_CALL_ACCURACY: "tool_call_accuracy",
    CITATIONS: "citations",
    EXACT_MATCH: "exact_match",
    PARTIAL_MATCH: "partial_match",
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class RegistryEntry:
    """Static description of one evaluator."""

    # Evaluator category: "llm", "tool", or "non-llm".
    type: str
    # Prerequisite identifiers that must be available for the evaluator.
    requires: List[str]
    # Threshold used when none is configured; None means no default.
    default_threshold: Optional[float]
|
|
@@ -17,8 +17,8 @@ from typing import Dict, Any, Optional
|
|
|
17
17
|
class CitationFormat(Enum):
|
|
18
18
|
"""Enum for different citation formats supported by the evaluator."""
|
|
19
19
|
OAI_UNICODE = "oai_unicode" # New format: \ue200cite\ue202turn{X}search{Y}\ue201
|
|
20
|
-
LEGACY_BRACKET = "
|
|
21
|
-
AUTO = "
|
|
20
|
+
LEGACY_BRACKET = "bracket" # Old format: [^i^]
|
|
21
|
+
AUTO = "mixed" # Automatically detect both formats
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class CitationsEvaluator:
|
|
@@ -141,7 +141,7 @@ class CitationsEvaluator:
|
|
|
141
141
|
|
|
142
142
|
results = {
|
|
143
143
|
"citation_format": self.citation_format.value,
|
|
144
|
-
"
|
|
144
|
+
"citations": total_citations,
|
|
145
145
|
"result": "pass" if total_citations > 0 else "fail",
|
|
146
146
|
"threshold": 1,
|
|
147
147
|
"reason": " ".join(reason_parts)
|
|
@@ -1,8 +1,6 @@
|
|
|
1
|
-
from azure.ai.evaluation import evaluate
|
|
2
|
-
|
|
3
1
|
class ExactMatchEvaluator:
|
|
4
|
-
def __init__(self):
|
|
5
|
-
|
|
2
|
+
def __init__(self, case_sensitive=False):
|
|
3
|
+
self.case_sensitive = case_sensitive
|
|
6
4
|
|
|
7
5
|
def __call__(self, *, response: str, expected_answer: str, **kwargs):
|
|
8
6
|
if response is None or response.strip() == "":
|
|
@@ -11,15 +9,17 @@ class ExactMatchEvaluator:
|
|
|
11
9
|
if expected_answer is None:
|
|
12
10
|
raise ValueError("Expected answer cannot be None.")
|
|
13
11
|
|
|
14
|
-
|
|
15
|
-
|
|
12
|
+
resp = response.strip()
|
|
13
|
+
exp = expected_answer.strip()
|
|
14
|
+
|
|
15
|
+
if not self.case_sensitive:
|
|
16
|
+
resp = resp.lower()
|
|
17
|
+
exp = exp.lower()
|
|
18
|
+
|
|
19
|
+
is_match = resp == exp
|
|
16
20
|
|
|
17
21
|
return {
|
|
18
22
|
"exact_match": 1.0 if is_match else 0.0,
|
|
19
|
-
"
|
|
20
|
-
"exact_match_threshold": 1.0,
|
|
23
|
+
"result": "pass" if is_match else "fail",
|
|
21
24
|
"exact_match_reason": "Exact match found" if is_match else "No exact match found"
|
|
22
25
|
}
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
exact_match_evaluator = ExactMatchEvaluator()
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
from azure.ai.evaluation import evaluate
|
|
2
|
-
|
|
3
1
|
class PartialMatchEvaluator:
|
|
4
2
|
def __init__(self, case_sensitive=False):
|
|
5
3
|
self.case_sensitive = case_sensitive
|
|
@@ -25,15 +23,7 @@ class PartialMatchEvaluator:
|
|
|
25
23
|
else:
|
|
26
24
|
score = 0.0
|
|
27
25
|
|
|
28
|
-
threshold = 0.5 # 50% match threshold
|
|
29
|
-
is_pass = score >= threshold
|
|
30
|
-
|
|
31
26
|
return {
|
|
32
27
|
"partial_match": score,
|
|
33
|
-
"
|
|
34
|
-
"partial_match_threshold": threshold,
|
|
35
|
-
"partial_match_reason": f"Match score: {score:.3f} ({'above' if is_pass else 'below'} threshold {threshold})"
|
|
28
|
+
"partial_match_reason": f"Match score: {score:.3f}"
|
|
36
29
|
}
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
partial_match_evaluator = PartialMatchEvaluator(case_sensitive=False)
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Evaluator resolution module for per-prompt evaluator configuration.
|
|
2
|
+
|
|
3
|
+
Resolves which evaluators to run on each prompt by merging prompt-level config
|
|
4
|
+
with file-level defaults and system defaults, following extend/replace modes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import difflib
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any, Dict, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from common import (
|
|
12
|
+
RELEVANCE,
|
|
13
|
+
COHERENCE,
|
|
14
|
+
GROUNDEDNESS,
|
|
15
|
+
TOOL_CALL_ACCURACY,
|
|
16
|
+
CITATIONS,
|
|
17
|
+
EXACT_MATCH,
|
|
18
|
+
PARTIAL_MATCH,
|
|
19
|
+
REQUIRES_AZURE_OPENAI,
|
|
20
|
+
REQUIRES_TOOL_DEFINITIONS,
|
|
21
|
+
SYSTEM_DEFAULT_EVALUATORS,
|
|
22
|
+
RegistryEntry,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Static registry of available evaluators per data-model.md
EVALUATOR_REGISTRY: Dict[str, RegistryEntry] = {
    # LLM-scored evaluators
    RELEVANCE: RegistryEntry(
        type="llm",
        requires=[REQUIRES_AZURE_OPENAI],
        default_threshold=3,
    ),
    COHERENCE: RegistryEntry(
        type="llm",
        requires=[REQUIRES_AZURE_OPENAI],
        default_threshold=3,
    ),
    GROUNDEDNESS: RegistryEntry(
        type="llm",
        requires=[REQUIRES_AZURE_OPENAI],
        default_threshold=3,
    ),
    # Tool evaluation
    TOOL_CALL_ACCURACY: RegistryEntry(
        type="tool",
        requires=[REQUIRES_AZURE_OPENAI, REQUIRES_TOOL_DEFINITIONS],
        default_threshold=3,
    ),
    # Evaluators with no prerequisites
    CITATIONS: RegistryEntry(type="non-llm", requires=[], default_threshold=1),
    EXACT_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=None),
    PARTIAL_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=0.5),
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def validate_evaluator_names(evaluator_map: Dict[str, Any]) -> None:
    """Reject evaluator names that are not present in EVALUATOR_REGISTRY.

    Args:
        evaluator_map: Mapping whose keys are evaluator names.

    Raises:
        ValueError: If any key is unknown. The message lists each unknown
            name, a 'Did you mean?' suggestion when a close match exists,
            and the valid names grouped by evaluator type.
    """
    unknown = [candidate for candidate in evaluator_map if candidate not in EVALUATOR_REGISTRY]
    if not unknown:
        return

    # Group the registered names by type for the "valid evaluators" section.
    by_type: Dict[str, list] = {"llm": [], "tool": [], "non-llm": []}
    for registered_name, registered in EVALUATOR_REGISTRY.items():
        if registered.type in by_type:
            by_type[registered.type].append(registered_name)

    report = []
    for bad_name in unknown:
        report.append(f'Unknown evaluator "{bad_name}".')
        suggestion = difflib.get_close_matches(bad_name, EVALUATOR_REGISTRY.keys(), n=1, cutoff=0.5)
        if suggestion:
            report.append(f'Did you mean "{suggestion[0]}"?')

    report.extend(
        [
            "",
            "Valid evaluators are:",
            f" - {', '.join(by_type['llm'])} (LLM-based)",
            f" - {', '.join(by_type['tool'])} (tool evaluation)",
            f" - {', '.join(by_type['non-llm'])} (non-LLM)",
        ]
    )

    raise ValueError("\n".join(report))
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def check_prerequisites(
    evaluator_name: str,
    available_context: Dict[str, bool],
) -> Tuple[bool, Optional[str]]:
    """Report whether every prerequisite of an evaluator is available.

    Args:
        evaluator_name: Name to look up in EVALUATOR_REGISTRY.
        available_context: Prerequisite identifier -> availability flag;
            identifiers missing from the mapping count as unavailable.

    Returns:
        ``(True, None)`` when all prerequisites are met, otherwise
        ``(False, message)`` naming the unknown evaluator or the first
        missing prerequisite.
    """
    entry = EVALUATOR_REGISTRY.get(evaluator_name)
    if not entry:
        return False, f"Unknown evaluator: {evaluator_name}"

    # First requirement (in declaration order) that is not available, if any.
    missing = next(
        (req for req in entry.requires if not available_context.get(req, False)),
        None,
    )
    if missing is None:
        return True, None

    warning = (
        f"Skipping evaluator '{evaluator_name}': "
        f"missing prerequisite '{missing}'"
    )
    return False, warning
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def resolve_default_evaluators(file_defaults: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Return the default evaluator map to apply.

    Precedence: file-level defaults > system defaults.
    An explicit empty dict means "no default evaluators" and is returned
    as given; only ``None`` triggers the system defaults.
    """
    if file_defaults is None:
        # System defaults, each with an empty options dict of its own.
        return {evaluator: {} for evaluator in SYSTEM_DEFAULT_EVALUATORS}

    # File-level defaults are handed back as the same object.
    return file_defaults
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def resolve_evaluators_for_prompt(
    prompt_evaluators: Optional[Dict[str, Any]],
    evaluators_mode: str,
    prompt: str,
    default_evaluators: Dict[str, Any],
) -> Dict[str, Any]:
    """Work out the evaluator map that applies to one prompt.

    Args:
        prompt_evaluators: Per-prompt evaluator config (None if not specified).
        evaluators_mode: How to combine with defaults; "replace" uses only the
            prompt's evaluators, any other value merges them over the defaults.
        prompt: The prompt text (used in warning messages).
        default_evaluators: Resolved default evaluators (from resolve_default_evaluators).

    Returns:
        A new dict of evaluator_name -> options; the inputs are not modified.
    """
    # Nothing configured on the prompt: fall back to a copy of the defaults.
    if prompt_evaluators is None:
        return dict(default_evaluators)

    # Extend: defaults first, prompt entries override on name conflict.
    if evaluators_mode != "replace":
        return {**default_evaluators, **prompt_evaluators}

    # Replace: the prompt's own map is the whole answer, even when empty.
    if not prompt_evaluators:
        logger.warning(
            "Empty evaluators with 'replace' mode for prompt: '%s'. "
            "No evaluators will run.",
            prompt[:80],
        )
    return dict(prompt_evaluators)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def get_evaluator_threshold(evaluator_name: str, options: Dict[str, Any]) -> Optional[float]:
    """Return the threshold to use for an evaluator.

    A "threshold" key in *options* wins (whatever its value); otherwise the
    registry default for *evaluator_name* is used, or None when the
    evaluator is not registered.
    """
    if "threshold" not in options:
        registered = EVALUATOR_REGISTRY.get(evaluator_name)
        if not registered:
            return None
        return registered.default_threshold
    return options["threshold"]
|