@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/README.md +135 -100
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +7 -0
  4. package/schema/v1/eval-document.schema.json +143 -11
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  11. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  12. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  13. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  14. package/schema/version.json +2 -2
  15. package/src/clients/cli/agent_selector.py +74 -0
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
  18. package/src/clients/cli/api_clients/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/base_agent_client.py +77 -0
  20. package/src/clients/cli/cli_args.py +136 -0
  21. package/src/clients/cli/cli_logging/cli_logger.py +33 -0
  22. package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
  23. package/src/clients/cli/cli_logging/logging_utils.py +0 -1
  24. package/src/clients/cli/common.py +64 -0
  25. package/src/clients/cli/env_validator.py +73 -0
  26. package/src/clients/cli/evaluation_runner.py +653 -0
  27. package/src/clients/cli/evaluator_resolver.py +9 -6
  28. package/src/clients/cli/generate_report.py +272 -129
  29. package/src/clients/cli/main.py +157 -1174
  30. package/src/clients/cli/parallel_executor.py +57 -0
  31. package/src/clients/cli/prompt_loader.py +148 -0
  32. package/src/clients/cli/readme.md +9 -53
  33. package/src/clients/cli/requirements.txt +1 -1
  34. package/src/clients/cli/response_extractor.py +4 -603
  35. package/src/clients/cli/result_writer.py +488 -0
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +82 -20
  40. package/src/clients/node-js/config/default.js +12 -11
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +14 -20
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
@@ -0,0 +1,475 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ import json
5
+ import locale
6
+ import logging
7
+ import re
8
+ import urllib.error
9
+ import urllib.request
10
+ import uuid
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
+ from api_clients.base_agent_client import BaseAgentClient
14
+ from cli_logging.console_diagnostics import emit_structured_log
15
+ from cli_logging.logging_utils import Operation
16
+
17
+ # Feature flag required by the experimental A2A surface.
18
+ _A2A_FEATURE_FLAG = "feature.EnableA2AServer"
19
+
20
+ _REQUEST_TIMEOUT_SECS = 120
21
+
22
+ # OAI citation marker patterns — compiled once at module level.
23
+ # Marker format: \ue200cite(\ue202turn{X}search{Y})+\ue201
24
+ _CITATION_REF_PAT = re.compile(r"\ue202turn\d+search(\d+)")
25
+ _CITATION_BLOCK_PAT = re.compile(r"\ue200cite(?:\ue202turn\d+search\d+)+\ue201")
26
+
27
+
28
class A2AClient(BaseAgentClient):
    """JSON-RPC 2.0 client for Work IQ agents speaking the A2A (Agent-to-Agent) protocol."""

    def __init__(
        self,
        *,
        a2a_endpoint: str,
        access_token: str,
        logger: Optional[logging.Logger] = None,
        diagnostic_records: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """Initialize the client with endpoint and credentials.

        Args:
            a2a_endpoint: Base URL of the A2A endpoint. Any trailing slash
                is stripped so URL joins below stay clean.
            access_token: Bearer token used on every A2A request.
            logger: Logger to use; falls back to this module's logger.
            diagnostic_records: Optional list that accumulates structured
                log entries for later reporting.
        """
        # Agent URL discovered via resolve_agent(); None until resolved.
        self._resolved_agent_url: Optional[str] = None
        self._endpoint = a2a_endpoint.rstrip("/")
        self._access_token = access_token
        self._diagnostic_records = diagnostic_records
        self._logger = logger if logger is not None else logging.getLogger(__name__)
51
+
52
+ # ------------------------------------------------------------------ #
53
+ # BaseAgentClient implementation #
54
+ # ------------------------------------------------------------------ #
55
+
56
def resolve_agent(self, agent_id: str) -> None:
    """Resolve the agent URL from its agent card once and cache it.

    Later send_prompt() calls reuse the cached URL instead of re-fetching
    the card for every prompt.
    """
    self._resolved_agent_url = self._resolve_agent_url(agent_id)
59
+
60
def fetch_available_agents(self) -> List[Dict[str, Any]]:
    """Fetch agents from the A2A discovery endpoint.

    Calls GET {endpoint}/.agents. Each A2A agent card is normalized to
    include 'gptId', 'name', and 'provider' so it is compatible with
    the shared select_agent_interactively selector.

    Returns:
        A list of normalized agent dicts. An empty list if the endpoint
        is unreachable, returns an error, or returns a non-list payload.
    """
    try:
        agents_url = f"{self._endpoint}/.agents"
        headers = self._build_request_headers()
        emit_structured_log(
            "debug",
            f"[A2A] Fetching available agents from: {agents_url}",
            Operation.FETCH_AGENTS,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
        )
        req = urllib.request.Request(agents_url, headers=headers, method="GET")
        with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
            agents = json.loads(resp.read().decode("utf-8"))
        emit_structured_log(
            "debug",
            f"[A2A] Available agents response: {json.dumps(agents)}",
            Operation.FETCH_AGENTS,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
        )
        # Fix: the endpoint is expected to return a JSON array. Previously a
        # dict (or any other JSON type) fell into the comprehension below and
        # iterated the wrong thing (e.g. dict keys), surfacing as an opaque
        # exception in the broad handler. Treat non-list payloads as "no agents".
        if not isinstance(agents, list):
            emit_structured_log(
                "warning",
                "[A2A] Unexpected agents payload type; expected a JSON array.",
                Operation.FETCH_AGENTS,
                logger=self._logger,
                diagnostic_records=self._diagnostic_records,
            )
            return []
        return [self._normalize_agent_card(a) for a in agents]
    except urllib.error.HTTPError as e:
        emit_structured_log(
            "warning",
            f"[A2A] Unable to fetch agents list (HTTP {e.code}).",
            Operation.FETCH_AGENTS,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
        )
        return []
    except Exception as e:
        # Deliberate best-effort: discovery failure should never abort the
        # run; callers handle an empty list.
        emit_structured_log(
            "warning",
            f"[A2A] Error fetching agents: {e}",
            Operation.FETCH_AGENTS,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
        )
        return []
109
+
110
+ @staticmethod
111
+ def _normalize_agent_card(agent: Dict[str, Any]) -> Dict[str, Any]:
112
+ """Normalize an A2A agent card to the shape expected by the selector.
113
+ """
114
+ return {
115
+ "gptId": agent.get("agentId"),
116
+ "name": agent.get("name"),
117
+ "provider": agent.get("provider")
118
+ }
119
+
120
def send_prompt(
    self,
    prompt: str,
    agent_id: str | None = None,
    conversation_context: Optional[Dict[str, Any]] = None,
) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
    """Send one prompt to the A2A endpoint; return (response, context).

    Args:
        prompt: The prompt string to send.
        agent_id: Target agent ID. Required for A2A — there is no
            fallback discovery here.
        conversation_context: Context dict from a previous turn (carries
            'context_id'), or None for the first turn / single-turn use.

    Returns:
        Tuple of (enhanced_response_dict, conversation_context) where the
        context should be fed to the next turn of a multi-turn run.

    Raises:
        ValueError: If agent_id is empty/missing.
    """
    agent_id = (agent_id or "").strip()
    if not agent_id:
        raise ValueError("agent_id is required for A2A requests.")

    request_headers = self._build_request_headers(include_content_type=True)
    # Prefer the URL cached by resolve_agent(); otherwise resolve now.
    target_url = self._resolved_agent_url or self._resolve_agent_url(agent_id)
    prior_context_id = (
        conversation_context.get("context_id") if conversation_context else None
    )

    emit_structured_log(
        "debug",
        "[A2A] Sending prompt to agent.",
        Operation.SEND_PROMPT,
        logger=self._logger,
        diagnostic_records=self._diagnostic_records,
    )

    body = self._build_chat_payload(prompt, prior_context_id)

    emit_structured_log(
        "debug",
        f"[A2A] Sending to {target_url}: {body.decode('utf-8')[:500]}",
        Operation.SEND_PROMPT,
        logger=self._logger,
        diagnostic_records=self._diagnostic_records,
    )

    result_dict, raw_result = self._send_and_parse_message(target_url, body, request_headers)

    # Carry the conversation forward: prefer the contextId the service just
    # returned, fall back to whatever we already had.
    next_context_id = prior_context_id
    if raw_result:
        next_context_id = raw_result.get("contextId") or prior_context_id

    return result_dict, ({"context_id": next_context_id} if next_context_id else None)
173
+
174
+ # ------------------------------------------------------------------ #
175
+ # Private helpers #
176
+ # ------------------------------------------------------------------ #
177
+
178
def _build_request_headers(self, *, include_content_type: bool = False) -> Dict[str, str]:
    """Build the common A2A headers: bearer auth plus the feature flag.

    Content-Type is only added for requests that carry a JSON body.
    """
    base_headers: Dict[str, str] = {
        "Authorization": f"Bearer {self._access_token}",
        "X-variants": _A2A_FEATURE_FLAG,
    }
    if not include_content_type:
        return base_headers
    return {**base_headers, "Content-Type": "application/json"}
186
+
187
def _build_chat_payload(self, prompt: str, context_id: str | None = None) -> bytes:
    """Serialize a JSON-RPC 2.0 'message/send' envelope for one prompt.

    A fresh messageId and request id are generated per call; contextId is
    attached only when continuing an existing conversation.
    """
    message: Dict[str, Any] = {
        "kind": "message",
        "role": "user",
        "parts": [{"kind": "text", "text": prompt}],
        "messageId": str(uuid.uuid4()),
        "metadata": {"location": self._get_a2a_location()},
    }
    if context_id:
        message["contextId"] = context_id
    envelope = {
        "jsonrpc": "2.0",
        "method": "message/send",
        "params": {"message": message},
        "id": str(uuid.uuid4()),
    }
    return json.dumps(envelope).encode("utf-8")
205
+
206
@staticmethod
@functools.lru_cache(maxsize=1)
def _get_a2a_location() -> Dict[str, Any]:
    """Best-effort location metadata derived from the process locale.

    The country code is taken as the suffix of an 'xx_YY' locale name;
    it is empty when the locale is unset or carries no underscore.
    Cached for the lifetime of the process (locale will not change mid-run).
    """
    locale_name = locale.getlocale()[0] or ""
    region = locale_name.split("_")[-1] if "_" in locale_name else ""
    return {
        "countryOrRegion": region,
        "countryOrRegionConfidence": 1.0,
        "timeZone": BaseAgentClient._get_iana_timezone_name(),
    }
216
+
217
def _resolve_agent_url(self, agent_id: str) -> str:
    """Resolve the agent URL from its agent card, falling back to the base URL.

    Fetches {endpoint}/{agent_id}/.well-known/agent-card.json and uses the
    card's 'url' field when present. Any fetch or parse failure is logged
    at debug level and the base URL is used instead — card resolution is
    strictly best-effort.
    """
    headers = self._build_request_headers(include_content_type=True)
    base_agent_url = f"{self._endpoint}/{agent_id}"
    card_url = f"{base_agent_url}/.well-known/agent-card.json"
    agent_url = base_agent_url
    emit_structured_log(
        "debug",
        f"[A2A] Fetching agent card from: {card_url}",
        Operation.FETCH_AGENTS,
        logger=self._logger,
        diagnostic_records=self._diagnostic_records,
    )
    try:
        card_req = urllib.request.Request(card_url, headers=headers)
        with urllib.request.urlopen(card_req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
            raw_card = resp.read().decode("utf-8")
        if raw_card.strip():
            card = json.loads(raw_card)
            agent_url = card.get("url") or base_agent_url
    # Fix: catch OSError rather than only HTTPError/URLError. A read timeout
    # after the connection is established raises socket.timeout (TimeoutError,
    # an OSError) directly — previously that crashed the run instead of
    # falling back to the base URL. OSError also subsumes HTTPError/URLError.
    except (OSError, json.JSONDecodeError, UnicodeDecodeError) as e:
        emit_structured_log(
            "debug",
            f"[A2A] Agent card fetch failed ({e}); using base URL: {base_agent_url}",
            Operation.FETCH_AGENTS,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
        )
    emit_structured_log(
        "debug",
        f"[A2A] Resolved agent URL: {agent_url}",
        Operation.SEND_PROMPT,
        logger=self._logger,
        diagnostic_records=self._diagnostic_records,
    )
    return agent_url
255
+
256
def _send_and_parse_message(
    self,
    agent_url: str,
    payload: bytes,
    headers: Dict[str, str],
) -> tuple[Dict[str, Any], Dict[str, Any]]:
    """POST a JSON-RPC message to the agent and parse its reply.

    Returns:
        (result_dict, raw_result): result_dict is the normalized response
        (raw_response_text, display_response_text, a2a_attributions,
        metadata.conversation_id); raw_result is the parsed JSON-RPC
        'result' object as received.

    Raises:
        RuntimeError: On HTTP errors, connection errors, invalid JSON,
            JSON-RPC error objects, or non-terminal/failed A2A task states.
    """

    def _joined_text(parts: List[Dict[str, Any]]) -> str:
        # Newline-join the text of every part whose kind is 'text'.
        return "\n".join(
            part.get("text", "") for part in parts if part.get("kind") == "text"
        )

    request = urllib.request.Request(agent_url, data=payload, headers=headers, method="POST")
    try:
        with urllib.request.urlopen(request, timeout=_REQUEST_TIMEOUT_SECS) as resp:
            raw = resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        body = ""
        try:
            body = e.read().decode("utf-8", errors="replace")
        except Exception:
            pass  # best effort: the error body is purely diagnostic
        raise RuntimeError(
            f"A2A request failed (HTTP {e.code} {e.reason})."
            + (f" Body: {body[:500]}" if body else "")
        ) from e
    except urllib.error.URLError as e:
        raise RuntimeError(
            f"A2A connection error: {getattr(e, 'reason', str(e))}"
        ) from e

    emit_structured_log(
        "debug",
        f"[A2A] Raw response: {raw[:500]}",
        Operation.SEND_PROMPT,
        logger=self._logger,
        diagnostic_records=self._diagnostic_records,
    )

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        raise RuntimeError(f"A2A response is not valid JSON: {e}") from e

    # JSON-RPC envelope checks: an 'error' object or a missing 'result'
    # both make the response unusable.
    if "error" in data:
        err = data["error"]
        raise RuntimeError(
            f"A2A JSON-RPC error {err.get('code', 'unknown')}: {err.get('message', 'no message')}"
        )
    if "result" not in data:
        raise RuntimeError(
            f"A2A response missing 'result' key. Keys present: {list(data.keys())}"
        )

    result = data["result"]
    kind = result.get("kind")
    text = ""
    attributions: List[Dict[str, Any]] = []

    if kind == "message":
        # Direct message reply: parts + metadata live on the result itself.
        text = _joined_text(result.get("parts", []))
        attributions = result.get("metadata", {}).get("attributions", [])
    elif kind == "task":
        status = result.get("status", {})
        state = status.get("state")
        if state == "completed":
            # Text comes from the status message plus any artifact parts.
            msg = status.get("message") or {}
            combined_parts = list(msg.get("parts", []))
            for artifact in result.get("artifacts", []):
                combined_parts.extend(artifact.get("parts", []))
            text = _joined_text(combined_parts)
            attributions = msg.get("metadata", {}).get("attributions", [])
        elif state in ("failed", "canceled", "rejected"):
            detail = _joined_text((status.get("message") or {}).get("parts", [])).strip()
            suffix = f" Detail: {detail}" if detail else ""
            raise RuntimeError(
                f"A2A task {state}. Task id: {result.get('id')}{suffix}"
            )
        elif state in ("input_required", "auth_required"):
            requirement = {
                "input_required": "user input",
                "auth_required": "authentication",
            }.get(state, state.replace("_", " "))
            raise RuntimeError(
                f"A2A task requires {requirement} and cannot proceed automatically."
                f" Task id: {result.get('id')}"
            )
        elif state in ("submitted", "working"):
            raise RuntimeError(
                f"A2A task is still {state}; synchronous send returned before completion."
                f" Task id: {result.get('id')}"
            )
        else:
            raise RuntimeError(
                f"A2A task in unexpected state: {state!r}. Task id: {result.get('id')}"
            )
    else:
        raise RuntimeError(f"Unexpected A2A result kind: {kind!r}")

    if attributions:
        emit_structured_log(
            "debug",
            f"[A2A] Attributions ({len(attributions)}): "
            + ", ".join(
                a.get("providerDisplayName", a.get("attributionType", ""))
                for a in attributions
            ),
            Operation.SEND_PROMPT,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
        )

    display_text = self._replace_citation_markers(
        text, attributions, self._logger, self._diagnostic_records
    )

    normalized = {
        "raw_response_text": text,
        "display_response_text": display_text,
        "a2a_attributions": attributions,
        "metadata": {
            "conversation_id": result.get("contextId"),
        },
    }
    return normalized, result
397
+
398
@staticmethod
def _replace_citation_markers(
    text: str,
    attributions: List[Dict[str, Any]],
    logger: Optional[logging.Logger] = None,
    diagnostic_records: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """Rewrite OAI Unicode citation markers as markdown links.

    Marker format: \\ue200cite(\\ue202turn{X}search{Y})+\\ue201 — compound
    markers (several turn/search refs inside one pair of bookends) are
    supported.

    The search{Y} number is not an array index: it is a grounding result
    number. Unique search numbers are mapped, in order of first appearance
    in the text, onto citation attributions [0, 1, ...].

    Args:
        text: Response text that may contain OAI citation markers.
        attributions: Attribution objects from the A2A response metadata.
        logger: Logger for debug/warning output; defaults to this module's.
        diagnostic_records: List accumulating structured log entries.

    Returns:
        The text with markers replaced by markdown links, or unchanged when
        it is empty or there are no citation-type attributions.
    """
    log = logger or logging.getLogger(__name__)
    citations = [a for a in attributions if a.get("attributionType") == "citation"]

    if not text:
        # Only warn for the genuinely-empty-string case (not None) when
        # attributions exist and would have been dropped.
        if text == "" and attributions:
            emit_structured_log(
                "warning",
                "[A2A] Response text is empty; skipping citation replacement.",
                Operation.SEND_PROMPT,
                logger=log,
                diagnostic_records=diagnostic_records,
            )
        return text
    if not citations:
        return text

    # Unique search numbers, in first-appearance order, mapped to indices
    # into the citations list.
    order: Dict[str, int] = {}
    for ref in _CITATION_REF_PAT.finditer(text):
        order.setdefault(ref.group(1), len(order))

    def _render_block(block: re.Match) -> str:
        links: List[str] = []
        for num in _CITATION_REF_PAT.findall(block.group(0)):
            idx = order.get(num)
            if idx is None or idx >= len(citations):
                emit_structured_log(
                    "warning",
                    f"[A2A] Citation search#{num} has no matching attribution; skipping link.",
                    Operation.SEND_PROMPT,
                    logger=log,
                    diagnostic_records=diagnostic_records,
                )
                continue
            attr = citations[idx]
            url = attr.get("seeMoreWebUrl") or ""
            if not url:
                emit_structured_log(
                    "warning",
                    f"[A2A] Citation search#{num} has no URL; skipping link.",
                    Operation.SEND_PROMPT,
                    logger=log,
                    diagnostic_records=diagnostic_records,
                )
                continue
            label = attr.get("providerDisplayName") or url or num
            links.append(f"[{label}]({url})")
        return " ".join(links)

    return _CITATION_BLOCK_PAT.sub(_render_block, text)
@@ -0,0 +1,3 @@
1
+ from .base_agent_client import BaseAgentClient
2
+
3
+ __all__ = ["BaseAgentClient"]
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ from abc import ABC, abstractmethod
5
+ from datetime import datetime
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ import tzlocal
9
+
10
+
11
class BaseAgentClient(ABC):
    """Abstract base class shared by all agent API client implementations."""

    @abstractmethod
    def fetch_available_agents(self) -> List[Dict[str, Any]]:
        """Return the agents visible to the configured user.

        Implementations that cannot enumerate agents should return an
        empty list rather than raise.
        """

    @abstractmethod
    def send_prompt(
        self,
        prompt: str,
        agent_id: str | None = None,
        conversation_context: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
        """Send one prompt and return (response_dict, conversation_context).

        Single-turn callers pass conversation_context=None and discard the
        returned context; multi-turn callers thread the returned context
        into the next call.

        Args:
            prompt: The prompt string to send.
            agent_id: Optional agent ID to target.
            conversation_context: Opaque context dict from the previous
                turn, or None on the first turn.

        Returns:
            Tuple of (enhanced_response_dict, conversation_context). The
            context structure is implementation-specific:
            - A2A: {"context_id": str}
            The context is None when no conversation state was established.
        """

    def resolve_agent(self, agent_id: str) -> None:
        """Pre-resolve the agent endpoint before the pipeline starts.

        No-op by default; subclasses may override to cache agent discovery.
        """

    @staticmethod
    @functools.lru_cache(maxsize=1)
    def _get_iana_timezone_name() -> str:
        # Prefer the IANA zone name; fall back to stringifying the zone
        # object when tzlocal cannot produce a name. Cached per process.
        try:
            return tzlocal.get_localzone_name()
        except Exception:
            return str(tzlocal.get_localzone())

    @staticmethod
    @functools.lru_cache(maxsize=1)
    def _get_location_info() -> Dict[str, Any]:
        # Whole-hour UTC offset via floor division — NOTE(review): for
        # negative half-hour zones (e.g. -3:30) this floors to -4 rather
        # than truncating to -3; presumably intended, confirm with the API.
        local_now = datetime.now().astimezone()
        offset = local_now.utcoffset()
        hours = int(offset.total_seconds() // 3600) if offset is not None else 0
        return {
            "timeZoneOffset": hours,
            "timeZone": BaseAgentClient._get_iana_timezone_name(),
        }
77
+
@@ -0,0 +1,136 @@
1
+ """CLI argument parsing and version-check bypass logic."""
2
+
3
+ import argparse
4
+ import os
5
+
6
+ from cli_logging.cli_logger import emit_structured_log
7
+ from cli_logging.logging_utils import Operation
8
+ from common import MAX_CONCURRENCY, RunConfig
9
+ from agent_selector import normalize_agent_id
10
+
11
+
12
# Config attributes whose presence means we skip remote min-version
# enforcement. --help is deliberately absent: argparse exits before any
# runtime check runs.
VERSION_CHECK_BYPASS_FLAGS = (
    "signout",
)


def should_bypass_min_version_check(config: RunConfig) -> bool:
    """Return True if the current invocation should skip min-version checks."""
    for flag in VERSION_CHECK_BYPASS_FLAGS:
        if getattr(config, flag, False):
            return True
    return False
22
+
23
+
24
def parse_arguments():
    """Parse command line arguments.

    Returns:
        The parsed argparse.Namespace with m365_agent_id normalized and
        concurrency clamped to [1, MAX_CONCURRENCY].

    Exits (via parser.error) when --concurrency < 1 or when --expected is
    used without --prompts / with a mismatched count.
    """
    parser = argparse.ArgumentParser(
        description="M365 Copilot Agent Evaluation CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run with default prompts
  python main.py

  # Run with custom prompts
  python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph is a gateway..."

  # Run with prompts from file
  python main.py --prompts-file prompts.json

  # Interactive mode
  python main.py --interactive

  # Save results to JSON
  python main.py --output results.json

  # Save results to CSV
  python main.py --output results.csv

  # Save results to HTML and open in browser
  python main.py --output report.html

  # Debug-level diagnostics
  python main.py --log-level debug

  # Sign out and clear cached authentication tokens
  python main.py --signout
        """
    )

    # Input options (mutually exclusive)
    input_group = parser.add_mutually_exclusive_group()
    input_group.add_argument(
        '--prompts',
        nargs='+',
        help='List of prompts to evaluate'
    )
    input_group.add_argument(
        '--prompts-file',
        type=str,
        help='JSON file containing prompts and expected responses'
    )
    input_group.add_argument(
        '--interactive',
        action='store_true',
        help='Interactive mode to enter prompts'
    )

    # Expected responses (only used with --prompts)
    parser.add_argument(
        '--expected',
        nargs='+',
        help='List of expected responses (must match number of prompts)'
    )

    # Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
    parser.add_argument(
        '--m365-agent-id', '--agent-id',
        type=str,
        default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
        help='Agent ID (default from M365_AGENT_ID environment variable)'
    )

    # Output options
    parser.add_argument(
        '--output',
        type=str,
        help='Output file path. Format is determined by file extension: .json, .csv, .html. If not provided, results are printed to console.'
    )

    # Behavior options
    parser.add_argument(
        '--log-level',
        nargs='?',
        const='info',
        action='append',
        help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
    )

    parser.add_argument(
        '--signout',
        action='store_true',
        help='Sign out and clear cached authentication tokens'
    )

    parser.add_argument(
        '--concurrency',
        type=int,
        default=MAX_CONCURRENCY,
        help=f'Number of parallel workers for prompt processing (1-{MAX_CONCURRENCY}, default: {MAX_CONCURRENCY})'
    )

    args = parser.parse_args()

    args.m365_agent_id = normalize_agent_id(args.m365_agent_id)

    # Fix: enforce the documented --expected contract up front instead of
    # letting a mismatch surface later as a confusing runtime error.
    if args.expected is not None:
        if not args.prompts:
            parser.error('--expected can only be used together with --prompts.')
        if len(args.expected) != len(args.prompts):
            parser.error('--expected must have the same number of entries as --prompts.')

    if args.concurrency < 1:
        parser.error('--concurrency must be an integer >= 1.')
    if args.concurrency > MAX_CONCURRENCY:
        # Clamp (with a warning) rather than abort: over-asking for workers
        # is recoverable, unlike a nonsensical value below 1.
        emit_structured_log(
            "warning",
            f"--concurrency {args.concurrency} exceeds max {MAX_CONCURRENCY}; clamping to {MAX_CONCURRENCY}.",
            operation=Operation.SETUP,
        )
        args.concurrency = MAX_CONCURRENCY

    return args