@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +135 -100
- package/package.json +7 -4
- package/schema/CHANGELOG.md +7 -0
- package/schema/v1/eval-document.schema.json +143 -11
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +77 -0
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
- package/src/clients/cli/cli_logging/logging_utils.py +0 -1
- package/src/clients/cli/common.py +64 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/generate_report.py +272 -129
- package/src/clients/cli/main.py +157 -1174
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +4 -603
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +82 -20
- package/src/clients/node-js/config/default.js +12 -11
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +14 -20
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Shared CLI logger instance and structured-log convenience wrapper.
|
|
2
|
+
|
|
3
|
+
Every module in the CLI layer that needs to emit diagnostics imports from here
|
|
4
|
+
instead of main.py, which avoids circular-import issues.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import sys
|
|
9
|
+
from typing import Any, Dict, List
|
|
10
|
+
|
|
11
|
+
from cli_logging.console_diagnostics import emit_structured_log as _emit_structured_log
|
|
12
|
+
from cli_logging.logging_utils import LOG_LEVEL_MAP, Operation
|
|
13
|
+
|
|
14
|
+
CLI_LOGGER_NAME = "m365.eval.cli"
|
|
15
|
+
CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
|
|
16
|
+
DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def configure_cli_logging(effective_log_level: str) -> None:
    """Prepare the shared CLI logger for console output.

    On the first call (when ``CLI_LOGGER`` has no handlers yet) a stdout
    handler that prints the bare message is attached and propagation to
    ancestor loggers is switched off. The logger level is then set from
    ``LOG_LEVEL_MAP``.

    Args:
        effective_log_level: Key into ``LOG_LEVEL_MAP``; an unknown key
            raises ``KeyError``.
    """
    # NOTE(review): the one-time setup below is assumed to cover both the
    # handler and ``propagate``; the level is applied on every call.
    needs_handler = not CLI_LOGGER.handlers
    if needs_handler:
        message_only = logging.Formatter("%(message)s")
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.setFormatter(message_only)
        CLI_LOGGER.addHandler(stdout_handler)
        CLI_LOGGER.propagate = False

    resolved_level = LOG_LEVEL_MAP[effective_log_level]
    CLI_LOGGER.setLevel(resolved_level)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
    """Emit a structured log entry through the shared CLI logger.

    Convenience wrapper that forwards to the console-diagnostics
    implementation, binding this module's ``CLI_LOGGER`` and
    ``DIAGNOSTIC_RECORDS`` so callers only supply the level, message and
    operation.

    Args:
        level: Log level name passed through unchanged.
        message: Log message passed through unchanged.
        operation: CLI operation step; defaults to ``Operation.EVALUATE``.
    """
    shared_sinks = {
        "logger": CLI_LOGGER,
        "diagnostic_records": DIAGNOSTIC_RECORDS,
    }
    _emit_structured_log(level, message, operation, **shared_sinks)
|
|
@@ -1,9 +1,15 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import logging
|
|
2
3
|
import sys
|
|
3
4
|
from collections import OrderedDict
|
|
4
|
-
from typing import Any, Dict
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
5
6
|
|
|
6
|
-
from cli_logging.logging_utils import
|
|
7
|
+
from cli_logging.logging_utils import (
|
|
8
|
+
STRUCTURED_LOG_FIELDS,
|
|
9
|
+
Operation,
|
|
10
|
+
format_structured_log_entry,
|
|
11
|
+
redact_sensitive_content,
|
|
12
|
+
)
|
|
7
13
|
|
|
8
14
|
_ANSI_COLORS = {
|
|
9
15
|
"debug": "\033[2m", # dim
|
|
@@ -53,3 +59,51 @@ def render_diagnostic(record: Dict[str, Any]) -> str:
|
|
|
53
59
|
if sys.stdout.isatty():
|
|
54
60
|
return format_console_record(record)
|
|
55
61
|
return serialize_diagnostic_record(record)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def emit_structured_log(
    level: str,
    message: str,
    operation: str = Operation.EVALUATE,
    *,
    logger: logging.Logger,
    diagnostic_records: Optional[List[Dict[str, Any]]] = None,
    run_context: Optional[Dict[str, Any]] = None,
) -> None:
    """Emit a structured log entry.

    Formats via format_structured_log_entry, optionally appends to
    diagnostic_records, then logs via render_diagnostic (TTY-friendly or JSON)
    after passing the rendered text through redact_sensitive_content.

    Emission is best-effort: a failure while rendering, redacting or logging
    never propagates to the caller, but it is reported on stderr (exception
    type only) instead of being silently discarded.

    Args:
        level: One of "debug", "info", "warning", "error". Any other value
            is logged at INFO.
        message: Human-readable log message.
        operation: The CLI operation step (e.g. Operation.SEND_PROMPT).
        logger: Logger to emit through.
        diagnostic_records: If provided, the structured entry is appended here
            even when the logger is not enabled for ``level``.
        run_context: Full run context override (request-id, conversation-id,
            message-id). Defaults to nulls with the given operation.
    """
    # Resolve the numeric level once. getattr() can return a non-level
    # attribute of the logging module (e.g. a format string), which would make
    # isEnabledFor() raise, so anything that is not an int falls back to INFO.
    log_level_int = getattr(logging, level.upper(), logging.INFO)
    if not isinstance(log_level_int, int):
        log_level_int = logging.INFO

    # Nothing to record and nothing to print: skip the formatting work.
    if diagnostic_records is None and not logger.isEnabledFor(log_level_int):
        return

    context = run_context or {
        "request-id": None,
        "conversation-id": None,
        "message-id": None,
        "operation": operation,
    }
    entry = format_structured_log_entry(
        level=level,
        message=message,
        logger_name=logger.name,
        run_context=context,
    )
    if diagnostic_records is not None:
        diagnostic_records.append(entry)

    try:
        rendered, _ = redact_sensitive_content(render_diagnostic(entry))
        logger.log(log_level_int, rendered)
    except Exception as exc:
        # Logging must never break the run, but a dropped entry should be
        # visible. Only the exception type is printed: the exception text could
        # echo log content that has not been redacted.
        print(
            f"{logger.name}: failed to emit structured log entry "
            f"({type(exc).__name__})",
            file=sys.stderr,
        )
|
|
@@ -4,6 +4,22 @@ import re
|
|
|
4
4
|
from dataclasses import dataclass
|
|
5
5
|
from typing import List, Optional
|
|
6
6
|
|
|
7
|
+
MAX_CONCURRENCY = 5
|
|
8
|
+
MAX_ATTEMPTS = 4 # Initial attempt + 3 retries
|
|
9
|
+
MAX_TURNS_PER_THREAD = 20
|
|
10
|
+
LONG_THREAD_WARNING_THRESHOLD = 10
|
|
11
|
+
DEFAULT_PASS_THRESHOLD = 3
|
|
12
|
+
|
|
13
|
+
# ── Environment variable name constants ──────────────────────────────
|
|
14
|
+
ENV_AZURE_AI_OPENAI_ENDPOINT = "AZURE_AI_OPENAI_ENDPOINT"
|
|
15
|
+
ENV_AZURE_AI_API_KEY = "AZURE_AI_API_KEY"
|
|
16
|
+
ENV_AZURE_AI_API_VERSION = "AZURE_AI_API_VERSION"
|
|
17
|
+
ENV_AZURE_AI_MODEL_NAME = "AZURE_AI_MODEL_NAME"
|
|
18
|
+
ENV_TENANT_ID = "TENANT_ID"
|
|
19
|
+
ENV_WORK_IQ_A2A_ENDPOINT = "WORK_IQ_A2A_ENDPOINT"
|
|
20
|
+
ENV_WORK_IQ_A2A_CLIENT_ID = "WORK_IQ_A2A_CLIENT_ID"
|
|
21
|
+
ENV_WORK_IQ_A2A_SCOPES = "WORK_IQ_A2A_SCOPES"
|
|
22
|
+
|
|
7
23
|
|
|
8
24
|
def pascal_case_to_title(eval_name: str) -> str:
|
|
9
25
|
"""Convert PascalCase evaluator name to space-separated display name.
|
|
@@ -12,10 +28,12 @@ def pascal_case_to_title(eval_name: str) -> str:
|
|
|
12
28
|
"""
|
|
13
29
|
return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', eval_name)
|
|
14
30
|
|
|
31
|
+
|
|
15
32
|
# Canonical evaluator name constants
|
|
16
33
|
RELEVANCE = "Relevance"
|
|
17
34
|
COHERENCE = "Coherence"
|
|
18
35
|
GROUNDEDNESS = "Groundedness"
|
|
36
|
+
SIMILARITY = "Similarity"
|
|
19
37
|
TOOL_CALL_ACCURACY = "ToolCallAccuracy"
|
|
20
38
|
CITATIONS = "Citations"
|
|
21
39
|
EXACT_MATCH = "ExactMatch"
|
|
@@ -25,6 +43,17 @@ PARTIAL_MATCH = "PartialMatch"
|
|
|
25
43
|
REQUIRES_AZURE_OPENAI = "azure_openai"
|
|
26
44
|
REQUIRES_TOOL_DEFINITIONS = "tool_definitions"
|
|
27
45
|
|
|
46
|
+
# Evaluation status constants
|
|
47
|
+
# Outcome statuses (agent responded, evaluators ran):
|
|
48
|
+
STATUS_PASS = "pass" # All evaluators scored above threshold
|
|
49
|
+
STATUS_FAIL = "fail" # At least one evaluator scored below threshold
|
|
50
|
+
# Error state (evaluation couldn't complete):
|
|
51
|
+
STATUS_ERROR = "error" # API call failed / response couldn't be obtained
|
|
52
|
+
# Thread-level aggregate status (multi-turn only):
|
|
53
|
+
STATUS_PARTIAL = "partial" # Some turns passed, some did not
|
|
54
|
+
# Fallback for missing status:
|
|
55
|
+
STATUS_UNKNOWN = "unknown"
|
|
56
|
+
|
|
28
57
|
# System defaults when no file-level or env-level defaults are configured
|
|
29
58
|
SYSTEM_DEFAULT_EVALUATORS = [
|
|
30
59
|
RELEVANCE,
|
|
@@ -37,6 +66,7 @@ METRIC_IDS = {
|
|
|
37
66
|
RELEVANCE: "relevance",
|
|
38
67
|
COHERENCE: "coherence",
|
|
39
68
|
GROUNDEDNESS: "groundedness",
|
|
69
|
+
SIMILARITY: "similarity",
|
|
40
70
|
TOOL_CALL_ACCURACY: "tool_call_accuracy",
|
|
41
71
|
CITATIONS: "citations",
|
|
42
72
|
EXACT_MATCH: "exact_match",
|
|
@@ -49,3 +79,37 @@ class RegistryEntry:
|
|
|
49
79
|
type: str # "llm", "tool", or "non-llm"
|
|
50
80
|
requires: List[str]
|
|
51
81
|
default_threshold: Optional[float]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass(frozen=True)
class RunConfig:
    """Immutable, typed bundle of runtime options shared between CLI modules.

    Build one from parsed command-line arguments with
    ``RunConfig.from_namespace(args)``; because instances are frozen, derive
    variants with ``dataclasses.replace(config, field=value)``.
    """

    # Values copied verbatim from the argparse namespace by from_namespace().
    prompts: Optional[List[str]] = None
    expected: Optional[List[str]] = None
    prompts_file: Optional[str] = None
    interactive: bool = False
    m365_agent_id: Optional[str] = None
    output: Optional[str] = None
    log_level: Optional[List[str]] = None
    # Not read from the namespace; keeps this default unless replaced later.
    effective_log_level: str = "info"
    signout: bool = False
    concurrency: int = MAX_CONCURRENCY

    @classmethod
    def from_namespace(cls, args) -> "RunConfig":
        """Create a RunConfig from an argparse.Namespace.

        Every attribute listed below must exist on ``args``; a missing one
        raises ``AttributeError``.
        """
        namespace_fields = (
            "prompts",
            "expected",
            "prompts_file",
            "interactive",
            "m365_agent_id",
            "output",
            "log_level",
            "signout",
            "concurrency",
        )
        values = {name: getattr(args, name) for name in namespace_fields}
        return cls(**values)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Environment validation and URL security checks."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import urllib.parse
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from cli_logging.cli_logger import emit_structured_log
|
|
9
|
+
from cli_logging.logging_utils import Operation
|
|
10
|
+
from common import (
|
|
11
|
+
ENV_AZURE_AI_OPENAI_ENDPOINT,
|
|
12
|
+
ENV_AZURE_AI_API_KEY,
|
|
13
|
+
ENV_AZURE_AI_API_VERSION,
|
|
14
|
+
ENV_AZURE_AI_MODEL_NAME,
|
|
15
|
+
ENV_WORK_IQ_A2A_ENDPOINT,
|
|
16
|
+
ENV_WORK_IQ_A2A_CLIENT_ID,
|
|
17
|
+
ENV_WORK_IQ_A2A_SCOPES,
|
|
18
|
+
ENV_TENANT_ID,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Allowed endpoints for URL validation
|
|
23
|
+
ALLOWED_ENDPOINTS = [
|
|
24
|
+
'substrate.office.com',
|
|
25
|
+
'graph.microsoft.com',
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def validate_environment() -> None:
    """Check that every required environment variable is set and non-empty.

    When one or more are missing, an error-level structured log naming them
    is emitted (operation ``Operation.VALIDATE_ENV``) and the process exits
    with status 1. Returns normally when nothing is missing.
    """
    missing_vars = []
    for name in (
        ENV_AZURE_AI_OPENAI_ENDPOINT,
        ENV_AZURE_AI_API_KEY,
        ENV_AZURE_AI_API_VERSION,
        ENV_AZURE_AI_MODEL_NAME,
        ENV_WORK_IQ_A2A_ENDPOINT,
        ENV_WORK_IQ_A2A_CLIENT_ID,
        ENV_WORK_IQ_A2A_SCOPES,
        ENV_TENANT_ID,
    ):
        # Unset and empty-string values are both treated as missing.
        if not os.environ.get(name):
            missing_vars.append(name)

    if not missing_vars:
        return

    problem = (
        "Missing required environment variables: "
        f"{', '.join(missing_vars)}. Please ensure your .env file"
        " contains all required configuration."
    )
    emit_structured_log("error", problem, operation=Operation.VALIDATE_ENV)
    sys.exit(1)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _is_allowed_host(parsed: urllib.parse.ParseResult, allowed_domains: List[str]) -> bool:
    """Return True when the URL's host matches the allow-list.

    The match ignores letter case and an explicit default HTTPS port
    (``:443``). Embedded credentials and any other explicit port never match.
    """
    # "user@host" forms are refused outright: the part before "@" is easy to
    # mistake for the real host when reading a URL.
    if "@" in parsed.netloc:
        return False
    try:
        port = parsed.port  # validated lazily; raises ValueError for junk
    except ValueError:
        return False
    if port not in (None, 443):
        return False
    allowed_hosts = {domain.lower() for domain in allowed_domains}
    # .hostname is already lower-cased and stripped of the port.
    return (parsed.hostname or "") in allowed_hosts


def validate_endpoint_url(url: str, allowed_domains: List[str]) -> None:
    """Validate URL against security requirements.

    The URL must parse, use the ``https`` scheme, point at a host in
    ``allowed_domains`` and carry no fragment. Host names are compared
    case-insensitively and an explicit ``:443`` is accepted, since both
    denote the same endpoint as the bare allow-listed name.

    Args:
        url: Endpoint URL to check.
        allowed_domains: Host names the URL may point at.

    Raises:
        ValueError: If any of the checks fails.
    """
    try:
        parsed = urllib.parse.urlparse(url)
    except (ValueError, TypeError, AttributeError) as e:
        # urlparse raises ValueError for malformed input (e.g. a broken IPv6
        # literal) and TypeError/AttributeError for non-string arguments.
        raise ValueError(f"Invalid URL format: {url}") from e

    # Checked separately from the HTTPS rule so these get a distinct message.
    if parsed.scheme in ('javascript', 'data'):
        raise ValueError(f"Dangerous URL scheme detected: {parsed.scheme}")

    if parsed.scheme != 'https':
        raise ValueError(f"Only HTTPS URLs are allowed, got: {parsed.scheme}")

    # An exact netloc match is kept first so every URL accepted before is
    # still accepted; the normalized comparison only widens it to equivalent
    # spellings of the same host.
    if parsed.netloc not in allowed_domains and not _is_allowed_host(parsed, allowed_domains):
        raise ValueError(f"Domain not in allowed list: {parsed.netloc}")

    if parsed.fragment:
        raise ValueError("Fragment URLs are not allowed")
|