@microsoft/m365-copilot-eval 1.4.0-preview.1 → 1.6.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -4
- package/package.json +4 -3
- package/schema/CHANGELOG.md +14 -0
- package/schema/v1/eval-document.schema.json +3 -3
- package/schema/version.json +1 -1
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +96 -30
- package/src/clients/cli/api_clients/base_agent_client.py +0 -1
- package/src/clients/cli/auth/auth_handler.py +21 -1
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +3 -1
- package/src/clients/cli/common.py +53 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/main.py +130 -1676
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/response_extractor.py +4 -601
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/node-js/bin/runevals.js +34 -13
- package/src/clients/node-js/config/default.js +8 -11
- package/src/clients/node-js/lib/env-loader.js +3 -4
- package/src/clients/node-js/lib/python-runtime.js +137 -65
- package/src/clients/node-js/lib/venv-manager.js +3 -2
- package/src/clients/node-js/lib/version-check.js +268 -0
- package/src/clients/cli/api_clients/REST/__init__.py +0 -3
- package/src/clients/cli/api_clients/REST/sydney_client.py +0 -204
|
@@ -4,6 +4,22 @@ import re
|
|
|
4
4
|
from dataclasses import dataclass
|
|
5
5
|
from typing import List, Optional
|
|
6
6
|
|
|
7
|
+
MAX_CONCURRENCY = 5
|
|
8
|
+
MAX_ATTEMPTS = 4 # Initial attempt + 3 retries
|
|
9
|
+
MAX_TURNS_PER_THREAD = 20
|
|
10
|
+
LONG_THREAD_WARNING_THRESHOLD = 10
|
|
11
|
+
DEFAULT_PASS_THRESHOLD = 3
|
|
12
|
+
|
|
13
|
+
# ── Environment variable name constants ──────────────────────────────
|
|
14
|
+
ENV_AZURE_AI_OPENAI_ENDPOINT = "AZURE_AI_OPENAI_ENDPOINT"
|
|
15
|
+
ENV_AZURE_AI_API_KEY = "AZURE_AI_API_KEY"
|
|
16
|
+
ENV_AZURE_AI_API_VERSION = "AZURE_AI_API_VERSION"
|
|
17
|
+
ENV_AZURE_AI_MODEL_NAME = "AZURE_AI_MODEL_NAME"
|
|
18
|
+
ENV_TENANT_ID = "TENANT_ID"
|
|
19
|
+
ENV_WORK_IQ_A2A_ENDPOINT = "WORK_IQ_A2A_ENDPOINT"
|
|
20
|
+
ENV_WORK_IQ_A2A_CLIENT_ID = "WORK_IQ_A2A_CLIENT_ID"
|
|
21
|
+
ENV_WORK_IQ_A2A_SCOPES = "WORK_IQ_A2A_SCOPES"
|
|
22
|
+
|
|
7
23
|
|
|
8
24
|
def pascal_case_to_title(eval_name: str) -> str:
|
|
9
25
|
"""Convert PascalCase evaluator name to space-separated display name.
|
|
@@ -12,10 +28,12 @@ def pascal_case_to_title(eval_name: str) -> str:
|
|
|
12
28
|
"""
|
|
13
29
|
return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', eval_name)
|
|
14
30
|
|
|
31
|
+
|
|
15
32
|
# Canonical evaluator name constants
|
|
16
33
|
RELEVANCE = "Relevance"
|
|
17
34
|
COHERENCE = "Coherence"
|
|
18
35
|
GROUNDEDNESS = "Groundedness"
|
|
36
|
+
SIMILARITY = "Similarity"
|
|
19
37
|
TOOL_CALL_ACCURACY = "ToolCallAccuracy"
|
|
20
38
|
CITATIONS = "Citations"
|
|
21
39
|
EXACT_MATCH = "ExactMatch"
|
|
@@ -48,6 +66,7 @@ METRIC_IDS = {
|
|
|
48
66
|
RELEVANCE: "relevance",
|
|
49
67
|
COHERENCE: "coherence",
|
|
50
68
|
GROUNDEDNESS: "groundedness",
|
|
69
|
+
SIMILARITY: "similarity",
|
|
51
70
|
TOOL_CALL_ACCURACY: "tool_call_accuracy",
|
|
52
71
|
CITATIONS: "citations",
|
|
53
72
|
EXACT_MATCH: "exact_match",
|
|
@@ -60,3 +79,37 @@ class RegistryEntry:
|
|
|
60
79
|
type: str # "llm", "tool", or "non-llm"
|
|
61
80
|
requires: List[str]
|
|
62
81
|
default_threshold: Optional[float]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass(frozen=True)
class RunConfig:
    """Frozen bundle of runtime options handed between CLI modules.

    Build one from parsed command-line arguments with
    ``RunConfig.from_namespace(args)``. Instances are immutable, so derive a
    modified copy with ``dataclasses.replace(config, field=value)``.
    """

    prompts: Optional[List[str]] = None
    expected: Optional[List[str]] = None
    prompts_file: Optional[str] = None
    interactive: bool = False
    m365_agent_id: Optional[str] = None
    output: Optional[str] = None
    log_level: Optional[List[str]] = None
    effective_log_level: str = "info"
    signout: bool = False
    concurrency: int = MAX_CONCURRENCY

    @classmethod
    def from_namespace(cls, args) -> "RunConfig":
        """Create a RunConfig from the attributes of an argparse.Namespace.

        ``effective_log_level`` is not read from ``args``; it keeps its
        field default.
        """
        # Attributes copied verbatim from ``args``; each one must exist on
        # the namespace, otherwise AttributeError propagates to the caller.
        copied_fields = (
            "prompts",
            "expected",
            "prompts_file",
            "interactive",
            "m365_agent_id",
            "output",
            "log_level",
            "signout",
            "concurrency",
        )
        return cls(**{name: getattr(args, name) for name in copied_fields})
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Environment validation and URL security checks."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import urllib.parse
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from cli_logging.cli_logger import emit_structured_log
|
|
9
|
+
from cli_logging.logging_utils import Operation
|
|
10
|
+
from common import (
|
|
11
|
+
ENV_AZURE_AI_OPENAI_ENDPOINT,
|
|
12
|
+
ENV_AZURE_AI_API_KEY,
|
|
13
|
+
ENV_AZURE_AI_API_VERSION,
|
|
14
|
+
ENV_AZURE_AI_MODEL_NAME,
|
|
15
|
+
ENV_WORK_IQ_A2A_ENDPOINT,
|
|
16
|
+
ENV_WORK_IQ_A2A_CLIENT_ID,
|
|
17
|
+
ENV_WORK_IQ_A2A_SCOPES,
|
|
18
|
+
ENV_TENANT_ID,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Allowed endpoints for URL validation
|
|
23
|
+
ALLOWED_ENDPOINTS = [
|
|
24
|
+
'substrate.office.com',
|
|
25
|
+
'graph.microsoft.com',
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def validate_environment() -> None:
    """Check that every required environment variable is set and non-empty.

    When one or more are missing (or empty), a structured error naming them
    is logged and the process exits with status 1. Otherwise returns None.
    """
    unset_names = []
    for name in (
        ENV_AZURE_AI_OPENAI_ENDPOINT,
        ENV_AZURE_AI_API_KEY,
        ENV_AZURE_AI_API_VERSION,
        ENV_AZURE_AI_MODEL_NAME,
        ENV_WORK_IQ_A2A_ENDPOINT,
        ENV_WORK_IQ_A2A_CLIENT_ID,
        ENV_WORK_IQ_A2A_SCOPES,
        ENV_TENANT_ID,
    ):
        # An empty string counts as missing, same as an absent variable.
        if not os.environ.get(name):
            unset_names.append(name)

    if not unset_names:
        return

    listed = ', '.join(unset_names)
    emit_structured_log(
        "error",
        f"Missing required environment variables: {listed}. Please ensure "
        "your .env file contains all required configuration.",
        operation=Operation.VALIDATE_ENV,
    )
    sys.exit(1)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def validate_endpoint_url(url: str, allowed_domains: List[str]) -> None:
    """Validate that *url* is an HTTPS URL on an allow-listed host.

    The host is matched case-insensitively against ``allowed_domains``, and
    an explicit default port (``:443``) is treated the same as no port, so
    equivalent spellings of an allowed endpoint are not rejected.

    Args:
        url: The endpoint URL to check.
        allowed_domains: Host names the URL is permitted to point at.

    Raises:
        ValueError: If the URL cannot be parsed, uses a scheme other than
            ``https``, embeds credentials, uses a non-default or malformed
            port, targets a host outside ``allowed_domains``, or carries a
            fragment.
    """
    try:
        parsed = urllib.parse.urlparse(url)
    except Exception as e:
        raise ValueError(f"Invalid URL format: {url}") from e

    # Kept as a separate check so these schemes get a more specific message.
    if parsed.scheme in ('javascript', 'data'):
        raise ValueError(f"Dangerous URL scheme detected: {parsed.scheme}")

    if parsed.scheme != 'https':
        raise ValueError(f"Only HTTPS URLs are allowed, got: {parsed.scheme}")

    # Compare the normalized host rather than the raw netloc: netloc keeps
    # the original letter case and any ":port" suffix, which would reject
    # URLs equivalent to an allowed endpoint.
    permitted_hosts = {domain.lower() for domain in allowed_domains}
    host = (parsed.hostname or '').lower()
    try:
        port = parsed.port
    except ValueError:
        # Non-numeric or out-of-range port: treat as a non-default port.
        port = -1
    has_credentials = '@' in parsed.netloc
    if has_credentials or port not in (None, 443) or host not in permitted_hosts:
        raise ValueError(f"Domain not in allowed list: {parsed.netloc}")

    if parsed.fragment:
        raise ValueError("Fragment URLs are not allowed")
|