@microsoft/m365-copilot-eval 1.0.1-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +415 -0
- package/TERMS.txt +65 -0
- package/package.json +82 -0
- package/src/clients/cli/auth/__init__.py +1 -0
- package/src/clients/cli/auth/auth_handler.py +262 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +136 -0
- package/src/clients/cli/custom_evaluators/ConcisenessNonLLMEvaluator.py +18 -0
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +25 -0
- package/src/clients/cli/custom_evaluators/PII/PII.py +45 -0
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +39 -0
- package/src/clients/cli/custom_evaluators/__init__.py +1 -0
- package/src/clients/cli/demo_usage.py +83 -0
- package/src/clients/cli/generate_report.py +251 -0
- package/src/clients/cli/main.py +766 -0
- package/src/clients/cli/readme.md +301 -0
- package/src/clients/cli/requirements.txt +10 -0
- package/src/clients/cli/response_extractor.py +589 -0
- package/src/clients/cli/samples/PartnerSuccess.json +122 -0
- package/src/clients/cli/samples/example_prompts.json +14 -0
- package/src/clients/cli/samples/example_prompts_alt.json +12 -0
- package/src/clients/cli/samples/prompts_ambiguity.json +22 -0
- package/src/clients/cli/samples/prompts_rag_grounding.json +22 -0
- package/src/clients/cli/samples/prompts_security_injection.json +22 -0
- package/src/clients/cli/samples/prompts_tool_use_negatives.json +22 -0
- package/src/clients/cli/samples/psaSample.json +18 -0
- package/src/clients/cli/samples/starter.json +10 -0
- package/src/clients/node-js/bin/runevals.js +505 -0
- package/src/clients/node-js/config/default.js +25 -0
- package/src/clients/node-js/lib/cache-utils.js +119 -0
- package/src/clients/node-js/lib/expiry-check.js +164 -0
- package/src/clients/node-js/lib/index.js +25 -0
- package/src/clients/node-js/lib/python-runtime.js +253 -0
- package/src/clients/node-js/lib/venv-manager.js +242 -0
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Uses WAM (Windows Account Manager) based authentication handler.
|
|
3
|
+
|
|
4
|
+
This module provides functionality to acquire access tokens using MSAL Python
|
|
5
|
+
with Windows Account Manager (WAM) as the broker. WAM is available on Windows 10+
|
|
6
|
+
and Windows Server 2019+.
|
|
7
|
+
|
|
8
|
+
For more information, see:
|
|
9
|
+
https://learn.microsoft.com/en-us/entra/msal/python/advanced/wam
|
|
10
|
+
https://github.com/AzureAD/microsoft-authentication-extensions-for-python
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import platform
|
|
15
|
+
import logging
|
|
16
|
+
from typing import Optional
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
import jwt
|
|
19
|
+
from msal import PublicClientApplication
|
|
20
|
+
from msal_extensions import PersistedTokenCache, build_encrypted_persistence
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
class AuthHandler:
    """Handler for Windows Account Manager (WAM) based authentication.

    Wraps an MSAL ``PublicClientApplication`` configured with the WAM broker
    and an encrypted, persisted token cache (MSAL Extensions).
    """

    def __init__(self, client_id: str, tenant_id: str, scopes_str: str, cache_dir: Optional[str] = None) -> None:
        """
        Initialize the WAM auth handler.

        Args:
            client_id: App registration client ID (required).
            tenant_id: Directory/tenant ID (required).
            scopes_str: Comma-separated scopes (required).
            cache_dir: Optional directory for token cache file. Defaults to user's home directory.

        Raises:
            RuntimeError: If not running on Windows (WAM is a Windows-only broker).
            ValueError: If client_id, tenant_id, or scopes_str is empty.
        """
        # WAM is only available on Windows; fail fast on any other platform.
        current_os = platform.system()
        if current_os != "Windows":
            raise RuntimeError(
                f"Authentication is only supported on Windows. Detected OS: {current_os}. "
                "Support for other operating systems is coming soon."
            )

        if not client_id:
            raise ValueError("client_id is required")

        if not tenant_id:
            raise ValueError("tenant_id is required")

        if not scopes_str:
            raise ValueError("scopes_str is required")

        # Split the comma-separated scope string, dropping blank entries.
        scopes = [s.strip() for s in scopes_str.split(",") if s.strip()]

        self.client_id = client_id
        self.authority = f"https://login.microsoftonline.com/{tenant_id}"
        self.scopes = scopes

        # Initialize the public client application with WAM broker enabled and token cache
        try:
            self.app = PublicClientApplication(
                client_id=self.client_id,
                authority=self.authority,
                enable_broker_on_windows=True,
                token_cache=self._setup_token_cache(cache_dir)
            )
            logger.info("WAM auth handler initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize WAM auth handler: {e}")
            raise

    def _setup_token_cache(self, cache_dir: Optional[str] = None) -> PersistedTokenCache:
        """
        Setup encrypted persistent token cache using MSAL Extensions.

        Creates a platform-dependent encrypted cache (DPAPI on Windows, Keychain on Mac,
        Libsecret on Linux) for MSAL to read/write tokens.

        Args:
            cache_dir: Optional directory for token cache file. Defaults to ~/.msal_cache.

        Returns:
            PersistedTokenCache: The initialized encrypted token cache.

        Raises:
            Exception: If cache initialization fails (e.g. persistence backend unavailable).
        """
        if cache_dir is None:
            cache_dir = str(Path.home() / ".msal_cache")

        cache_path = os.path.join(cache_dir, "token_cache.bin")

        try:
            # Create cache directory if it doesn't exist
            Path(cache_dir).mkdir(parents=True, exist_ok=True)

            # Build encrypted persistence (DPAPI on Windows, Keychain on Mac, Libsecret on Linux)
            persistence = build_encrypted_persistence(cache_path)
            token_cache = PersistedTokenCache(persistence)
            logger.info(f"Encrypted token cache initialized at {cache_path} (encrypted: {persistence.is_encrypted})")
            return token_cache
        except Exception as e:
            logger.error(f"Failed to initialize token cache: {e}")
            raise

    def acquire_token_interactive(self) -> Optional[dict]:
        """
        Acquire a token interactively using WAM and return the full MSAL result dict.

        This method first attempts to retrieve a cached token. If no valid cached
        token is found or if it has expired, it will prompt the user to authenticate
        via the WAM dialog.

        Returns:
            The full result dictionary from MSAL (contains access_token, id_token, etc.)
            or None if acquisition fails.

        Raises:
            ImportError: If broker-related dependencies are not installed.
                Install with: pip install 'msal[broker]>=1.20,<2'
            Exception: If WAM communication fails or user denies the request.
        """
        try:
            # First, attempt to acquire a token silently from cache
            accounts = self.get_accounts()
            if accounts:
                logger.info(f"Attempting silent token acquisition from {len(accounts)} cached account(s)")
                # Try each cached account until one yields a usable token.
                for account in accounts:
                    silent_result = self.app.acquire_token_silent(
                        scopes=self.scopes, account=account
                    )
                    if silent_result and "access_token" in silent_result:
                        logger.info("Access token acquired successfully from cache")
                        return silent_result
                logger.info("No valid cached token found; proceeding with interactive authentication")
            else:
                logger.info("No cached accounts found; proceeding with interactive authentication")

            # If no cached token is valid, proceed with interactive acquisition
            # (CONSOLE_WINDOW_HANDLE parents the WAM dialog to the console window).
            result = self.app.acquire_token_interactive(
                scopes=self.scopes,
                parent_window_handle=self.app.CONSOLE_WINDOW_HANDLE,
            )

            if result and "access_token" in result:
                logger.info("Access token acquired successfully via interactive authentication")
                return result

            # Acquisition failed: surface MSAL's error details when present.
            error_msg = None
            if result:
                error_msg = result.get(
                    "error_description", result.get("error", "Unknown error")
                )
            logger.error(f"Failed to acquire token: {error_msg or 'Unknown error'}")
            return result

        except ImportError as e:
            logger.error(
                f"WAM broker dependencies not installed: {e}. "
                "Install with: pip install 'msal[broker]>=1.20,<2'"
            )
            raise
        except Exception as e:
            logger.error(f"Error during token acquisition: {e}")
            raise

    def acquire_token_silent(self, account: Optional[dict] = None) -> Optional[str]:
        """
        Acquire an access token silently (without user interaction).

        This is useful for cached tokens or when you have a known account.
        Returns None if a valid token is not available and interaction would be required.

        Note: unlike acquire_token_interactive, this returns only the access
        token string, not the full MSAL result dict.

        Args:
            account: Optional account object from previous interactive authentication.
                If None, attempts to get a cached token for any account.

        Returns:
            The access token string if successful, None if no cached token available.
        """
        try:
            result = self.app.acquire_token_silent(scopes=self.scopes, account=account)

            if result and "access_token" in result:
                logger.info("Access token acquired silently")
                return result["access_token"]

            logger.debug(
                "No cached token available; interactive acquisition would be required"
            )
            return None

        except Exception as e:
            # Best-effort: silent acquisition failure is expected when no cache exists.
            logger.debug(f"Silent token acquisition failed: {e}")
            return None

    def get_accounts(self) -> list:
        """
        Get the list of accounts available in WAM cache.

        Returns:
            List of account objects cached by WAM. Empty list on error.
        """
        try:
            accounts = self.app.get_accounts()
            logger.info(f"Found {len(accounts)} cached account(s)")
            return accounts
        except Exception as e:
            logger.error(f"Error retrieving cached accounts: {e}")
            return []

    def clear_cache(self) -> bool:
        """
        Clear all cached tokens and accounts from the token cache.

        This method removes all cached authentication data, forcing the user
        to authenticate interactively on the next acquire_token_interactive call.

        Returns:
            True if cache was successfully cleared, False otherwise.
        """
        try:
            # Get all cached accounts
            accounts = self.app.get_accounts()

            # Remove each account from the cache
            for account in accounts:
                self.app.remove_account(account)
                logger.info(f"Removed account {account.get('username', 'unknown')} from cache")

            logger.info("Token cache cleared successfully")
            return True
        except Exception as e:
            logger.error(f"Error clearing token cache: {e}")
            return False

    @staticmethod
    def extract_user_oid_from_access_token(access_token: str) -> str:
        """
        Extract the user OID from an access token using MSAL's JWT decoding.

        MSAL includes PyJWT under the hood, so we can use jwt.decode
        directly without adding new dependencies.

        Args:
            access_token: The access token string

        Returns:
            The user OID if found, empty string otherwise

        Raises:
            ValueError: If token format is invalid or OID not found
        """
        try:
            # Decode without verification (we're just reading claims, not validating signature)
            decoded = jwt.decode(access_token, options={"verify_signature": False})
            oid = decoded.get('oid', '')
            if not oid:
                raise ValueError("OID not found in token claims")
            return oid
        except jwt.DecodeError as e:
            # NOTE(review): other PyJWT errors (e.g. InvalidTokenError subclasses
            # besides DecodeError) propagate unchanged — confirm that is intended.
            raise ValueError(f"Failed to decode token: {e}")
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CitationsEvaluator - A custom evaluator for analyzing citations in M365 Copilot responses.
|
|
3
|
+
|
|
4
|
+
This evaluator uses regex-based pattern matching to detect citations in two formats:
|
|
5
|
+
1. New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
|
|
6
|
+
2. Old format: [^i^] where i is the citation index
|
|
7
|
+
|
|
8
|
+
Where X, Y, and i are natural numbers representing conversation turn, search result index, or citation index.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from enum import Enum
|
|
13
|
+
from typing import Dict, Any, Optional
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CitationFormat(Enum):
    """Enum for different citation formats supported by the evaluator."""
    OAI_UNICODE = "oai_unicode"  # New format: \ue200cite\ue202turn{X}search{Y}\ue201
    LEGACY_BRACKET = "legacy_bracket"  # Old format: [^i^]


class CitationsEvaluator:
    """
    A custom evaluator that analyzes citations in response text without using an LLM.

    This evaluator detects citation patterns and returns:
    - Whether at least one citation is present
    - The number of unique citations found

    Supports both new OAI unicode format and legacy bracket format.
    """

    def __init__(self, citation_format: CitationFormat = CitationFormat.OAI_UNICODE):
        """
        Initialize the CitationsEvaluator with the specified citation format.

        Args:
            citation_format (CitationFormat): The format of citations to detect.
                Defaults to OAI_UNICODE format.

        Raises:
            ValueError: If an unsupported citation format is supplied.
        """
        self.citation_format = citation_format

        if citation_format == CitationFormat.OAI_UNICODE:
            # Pattern to match citations: \ue200cite\ue202turn{number}search{number}\ue201
            self.citation_pattern = r'\ue200cite\ue202turn\d+search\d+\ue201'
        elif citation_format == CitationFormat.LEGACY_BRACKET:
            # Pattern to match citations: [^number^]
            self.citation_pattern = r'\[\^\d+\^\]'
        else:
            raise ValueError(f"Unsupported citation format: {citation_format}")

        # Compile once; the same pattern is reused for every evaluation call.
        self.compiled_pattern = re.compile(self.citation_pattern)

    def __call__(self, *, response: str, **kwargs) -> Dict[str, Any]:
        """
        Evaluate the response text for citations.

        Args:
            response (str): The response text from the M365 Copilot agent
            **kwargs: Additional keyword arguments (not used but kept for compatibility)

        Returns:
            Dict[str, Any]: Evaluation results containing:
                - citation_format (str): The format used for detection
                - score (int): Number of unique citations found
                - result (str): "pass" if citations found, "fail" otherwise
                - threshold (int): Minimum threshold for passing (1)
                - reason (str): Explanation of the result with citation details
        """
        # Coerce non-string inputs so the regex scan never raises.
        if not isinstance(response, str):
            response = str(response) if response is not None else ""

        # Find all citation matches
        citation_matches = self.compiled_pattern.findall(response)

        # Deduplicate while preserving first-appearance order. The previous
        # list(set(...)) approach produced a nondeterministic ordering in the
        # "reason" string from run to run; dict.fromkeys keeps insertion order.
        unique_citations = list(dict.fromkeys(citation_matches))

        # Extract citation identifiers for reporting
        citation_details = []
        for citation in unique_citations:
            if self.citation_format == CitationFormat.OAI_UNICODE:
                # Extract the turn and search numbers from the citation
                turn_search_match = re.search(r'turn(\d+)search(\d+)', citation)
                if turn_search_match:
                    turn_num = turn_search_match.group(1)
                    search_num = turn_search_match.group(2)
                    citation_details.append(f"turn{turn_num}search{search_num}")
            elif self.citation_format == CitationFormat.LEGACY_BRACKET:
                # Extract the citation number from [^number^]
                bracket_match = re.search(r'\[\^(\d+)\^\]', citation)
                if bracket_match:
                    citation_num = bracket_match.group(1)
                    citation_details.append(f"citation{citation_num}")

        # Prepare results in a format compatible with the HTML report generator
        results = {
            "citation_format": self.citation_format.value,
            # HTML report compatible fields
            "score": len(unique_citations),  # Use citation count as the score
            "result": "pass" if len(unique_citations) > 0 else "fail",  # Pass if citations found
            "threshold": 1,  # Threshold of 1 citation minimum
            "reason": f"Found {len(unique_citations)} unique citation(s): {', '.join(citation_details) if citation_details else 'None'}"
        }

        return results

    def get_name(self) -> str:
        """Return the name of this evaluator."""
        return "CitationsEvaluator"

    def get_description(self) -> str:
        """Return a description of what this evaluator does."""
        return f"Analyzes response text for M365 Copilot citations using regex pattern matching ({self.citation_format.value} format)"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def citations_evaluator(*, response: str, citation_format: CitationFormat = CitationFormat.OAI_UNICODE, **kwargs) -> Dict[str, Any]:
    """
    Functional entry point wrapping CitationsEvaluator.

    Offers a plain-function interface compatible with the Azure AI Evaluation SDK.

    Args:
        response (str): The response text to evaluate
        citation_format (CitationFormat): The format of citations to detect
        **kwargs: Additional keyword arguments forwarded to the evaluator

    Returns:
        Dict[str, Any]: Citation evaluation results
    """
    return CitationsEvaluator(citation_format=citation_format)(response=response, **kwargs)


# Public API surface of this module.
__all__ = ['CitationsEvaluator', 'CitationFormat', 'citations_evaluator']
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#from azure.ai.evaluation import evaluate
|
|
2
|
+
|
|
3
|
+
class ConcisenessNonLLMEvaluator:
    """Length-based conciseness scorer (no LLM): shorter responses earn higher 0-5 scores."""

    def __init__(self):
        # Stateless; scoring depends only on response length.
        pass

    # Instances are callable evaluators via __call__.
    def __call__(self, *, response: str, **kwargs):
        """Score *response* by character count; pass requires a rounded score of at least 3."""
        char_count = len(response)
        # Linear ramp: 0 chars -> 5.0, 1000 chars -> 0.0, clamped at 0 beyond that.
        raw = max(0, (100 - char_count / 10) / 20)
        score = int(round(raw))
        verdict = "pass" if score >= 3 else "fail"
        return {
            "concisenessnonllm_score": score,
            "concisenessnonllm_threshold": 3,
            "concisenessnonllm_result": verdict,
            "concisenessnonllm_reason": f"Any response greater than 1000 characters is given zero score. The longer the answer, the lesser the score. The length of the response is {char_count} chars. And hence, the score!"
        }
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from azure.ai.evaluation import evaluate
|
|
2
|
+
|
|
3
|
+
class ExactMatchEvaluator:
    """Case-sensitive exact-match evaluator: passes only when the trimmed response equals the trimmed expected answer."""

    def __init__(self):
        # Stateless evaluator; nothing to configure.
        pass

    def __call__(self, *, response: str, expected_answer: str, **kwargs):
        """Compare *response* against *expected_answer* after trimming surrounding whitespace.

        Raises:
            ValueError: If response is None/blank, or expected_answer is None.
        """
        if response is None or not response.strip():
            raise ValueError("Response is null, empty, or whitespace.")

        if expected_answer is None:
            raise ValueError("Expected answer cannot be None.")

        # Character-for-character comparison, case-sensitive
        # (mimics C# StringComparison.InvariantCulture).
        matched = response.strip() == expected_answer.strip()

        return {
            "exact_match": 1.0 if matched else 0.0,
            "exact_match_result": "pass" if matched else "fail",
            "exact_match_threshold": 1.0,
            "exact_match_reason": "Exact match found" if matched else "No exact match found"
        }


# Shared, ready-to-use instance for evaluation pipelines.
exact_match_evaluator = ExactMatchEvaluator()
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import sys
|
|
4
|
+
import re
|
|
5
|
+
from promptflow.client import load_flow
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PIIEvaluator:
    """LLM-backed PII evaluator driven by the sibling ``pii.prompty`` template.

    Loads the prompty flow once at construction and parses the model's reply,
    accepting either JSON or an XML-tagged fallback format.
    """

    def __init__(self, model_config):
        # Resolve pii.prompty relative to this module so it works regardless of CWD.
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, "pii.prompty")
        self._flow = load_flow(source=prompty_path, model={"configuration": model_config})

    def __call__(self, *, response: str, **kwargs) -> dict:
        """Run the PII flow over *response* and normalize the LLM output to a dict.

        Parsing strategy, in order:
        1. Treat the reply as JSON and return it as-is.
        2. Otherwise scrape <S2>score</S2> and <S1>explanation</S1> tags
           (the prompty's expected tagged output — presumably; verify against pii.prompty).
        3. Fall back to wrapping the raw reply.
        """
        llm_response = self._flow(response=response)
        try:
            # Try to parse as JSON first
            parsed_response = json.loads(llm_response)
            return parsed_response
        except Exception:
            # If it's not JSON, try to extract the score from XML-like tags
            if isinstance(llm_response, str):
                # Look for <S2>score</S2> pattern
                score_match = re.search(r'<S2>(\d+)</S2>', llm_response)
                # Look for <S1>explanation</S1> pattern
                explanation_match = re.search(r'<S1>(.*?)</S1>', llm_response, re.DOTALL)

                if score_match:
                    score = int(score_match.group(1))
                    # Duplicate the score under both keys so downstream report
                    # generators can pick up either naming convention.
                    result_dict = {
                        'PII': score,
                        'score': score,
                        'raw_response': llm_response
                    }

                    # Add explanation if found
                    if explanation_match:
                        explanation = explanation_match.group(1).strip()
                        # Same value under three keys for report compatibility.
                        result_dict['explanation'] = explanation
                        result_dict['reason'] = explanation
                        result_dict['PII_reason'] = explanation

                    return result_dict
            # Fallback: return the raw response
            return {'raw_response': llm_response}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from azure.ai.evaluation import evaluate
|
|
2
|
+
|
|
3
|
+
class PartialMatchEvaluator:
    """
    Substring-containment evaluator with a concision-weighted score.

    A response scores len(expected)/len(response) when it contains the expected
    answer verbatim (so verbose responses score lower), and 0.0 otherwise.
    It passes when the expected text makes up at least half of the response.
    """

    def __init__(self, case_sensitive=False):
        # When False (the default), both strings are lowercased before comparison.
        self.case_sensitive = case_sensitive

    def __call__(self, *, response: str, expected_answer: str, **kwargs):
        """Score how much of *response* consists of *expected_answer*.

        Raises:
            ValueError: If response is None/blank, or expected_answer is None.
        """
        if response is None or response.strip() == "":
            raise ValueError("Response cannot be null or empty.")
        if expected_answer is None:
            raise ValueError("Expected answer cannot be null.")

        candidate = response.strip()
        target = expected_answer.strip()

        # Normalize case unless configured otherwise.
        if not self.case_sensitive:
            candidate = candidate.lower()
            target = target.lower()

        # Containment check: score is the length share of the response taken up
        # by the expected text (1.0 for an exact-length match), else zero.
        score = len(target) / len(candidate) if target in candidate else 0.0

        threshold = 0.5  # expected text must cover at least 50% of the response
        is_pass = score >= threshold

        return {
            "partial_match": score,
            "partial_match_result": "pass" if is_pass else "fail",
            "partial_match_threshold": threshold,
            "partial_match_reason": f"Match score: {score:.3f} ({'above' if is_pass else 'below'} threshold {threshold})"
        }


# Default shared instance (case-insensitive matching).
partial_match_evaluator = PartialMatchEvaluator(case_sensitive=False)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Custom evaluators package
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Example usage script for the M365 Copilot Agent Evaluation CLI
|
|
4
|
+
This script demonstrates various ways to use the CLI tool.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
def run_command(cmd, description):
    """Print a banner describing *cmd*, execute it in a shell, and report the outcome."""
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Example: {description}")
    print(f"Command: {cmd}")
    print(banner)

    try:
        # Azure credentials are typically absent in a demo environment, so a
        # failure here is expected; the goal is to showcase the CLI surface.
        completed = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=30)
        if completed.returncode != 0:
            print("❌ Command failed (likely due to missing Azure credentials)")
            print("Error:", completed.stderr)
        else:
            print("✅ Command executed successfully")
            print(completed.stdout)
    except subprocess.TimeoutExpired:
        print("⏱️ Command timed out (this is expected without Azure credentials)")
    except Exception as e:
        print(f"❌ Error running command: {e}")
|
|
33
|
+
|
|
34
|
+
def main():
    """Walk through representative CLI invocations and print setup guidance."""
    # Run relative to this script so the sample file paths resolve.
    os.chdir(Path(__file__).parent)

    print("M365 Copilot Agent Evaluation CLI - Usage Examples")
    print("=" * 60)
    print("Note: These examples will fail without proper Azure credentials.")
    print("This script demonstrates the CLI interface and available options.")

    # (command, what it demonstrates) pairs, executed in order.
    examples = [
        ("python main.py --help",
         "Display help and all available options"),
        ('python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."',
         "Run evaluation with custom prompt and expected response"),
        ("python main.py --prompts-file samples/example_prompts.json --output results.json",
         "Load prompts from JSON file and save results to JSON"),
        ("python main.py --prompts-file samples/example_prompts.json --output results.csv --format csv",
         "Load prompts from file and save results to CSV"),
    ]
    for cmd, description in examples:
        run_command(cmd, description)

    print(f"\n{'='*60}")
    print("Setup Instructions:")
    print("1. Install Azure CLI and run 'az login' to authenticate")
    print("2. Create a .env file with your Azure AI configuration")
    print("3. Set the following environment variables:")
    print("   - AZURE_AI_FOUNDRY_PROJECT_ENDPOINT")
    print("   - AZURE_AI_AGENT_ID")
    print("   - AZURE_AI_OPENAI_ENDPOINT")
    print("   - AZURE_AI_API_KEY")
    print("   - AZURE_AI_API_VERSION")
    print("   - AZURE_AI_MODEL_NAME")
    print("4. Run: python main.py")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()
|