@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.4.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +140 -101
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +8 -0
  4. package/schema/v1/eval-document.schema.json +256 -8
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/comprehensive.json +27 -2
  11. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  12. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  13. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  14. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  15. package/schema/version.json +2 -2
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
  18. package/src/clients/cli/api_clients/REST/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
  20. package/src/clients/cli/api_clients/__init__.py +3 -0
  21. package/src/clients/cli/api_clients/base_agent_client.py +78 -0
  22. package/src/clients/cli/cli_logging/__init__.py +0 -0
  23. package/src/clients/cli/cli_logging/console_diagnostics.py +107 -0
  24. package/src/clients/cli/cli_logging/logging_utils.py +144 -0
  25. package/src/clients/cli/common.py +62 -0
  26. package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
  27. package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
  28. package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
  29. package/src/clients/cli/evaluator_resolver.py +150 -0
  30. package/src/clients/cli/generate_report.py +347 -184
  31. package/src/clients/cli/main.py +1288 -481
  32. package/src/clients/cli/parallel_executor.py +57 -0
  33. package/src/clients/cli/readme.md +14 -7
  34. package/src/clients/cli/requirements.txt +1 -1
  35. package/src/clients/cli/response_extractor.py +30 -14
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +134 -41
  40. package/src/clients/node-js/config/default.js +5 -1
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +11 -16
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
@@ -0,0 +1,57 @@
1
+ """Parallel prompt execution utilities.
2
+
3
+ This module provides a minimal reusable executor that preserves input order.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from dataclasses import dataclass
10
+ from typing import Any, Callable, Generic, Iterable, List, Optional, TypeVar
11
+
12
+
13
+ T = TypeVar("T")
14
+ R = TypeVar("R")
15
+
16
+
17
+ @dataclass
18
+ class WorkerResult(Generic[R]):
19
+ """Result model used to preserve input ordering and capture failures."""
20
+
21
+ index: int
22
+ value: Optional[R] = None
23
+ error: Optional[Exception] = None
24
+
25
+
26
+ def execute_in_parallel(
27
+ items: Iterable[T],
28
+ worker: Callable[[T, int], R],
29
+ max_workers: int,
30
+ ) -> List[WorkerResult[R]]:
31
+ """Execute worker function in parallel while preserving input order.
32
+
33
+ `worker` receives `(item, index)` and returns a value.
34
+ """
35
+ indexed_items = list(enumerate(items))
36
+ if not indexed_items:
37
+ return []
38
+
39
+ normalized_workers = max(1, min(max_workers, len(indexed_items)))
40
+ results: List[WorkerResult[R]] = [WorkerResult(index=i) for i, _ in indexed_items]
41
+
42
+ with ThreadPoolExecutor(max_workers=normalized_workers) as executor:
43
+ future_map = {
44
+ executor.submit(worker, item, index): index
45
+ for index, item in indexed_items
46
+ }
47
+
48
+ for future in as_completed(future_map):
49
+ index = future_map[future]
50
+ try:
51
+ results[index] = WorkerResult(index=index, value=future.result())
52
+ except (KeyboardInterrupt, SystemExit):
53
+ raise
54
+ except Exception as exc:
55
+ results[index] = WorkerResult(index=index, error=exc)
56
+
57
+ return results
@@ -97,21 +97,28 @@ python main.py --interactive
97
97
 
98
98
  #### Additional Options
99
99
  ```bash
100
- # Verbose output (shows detailed processing steps)
101
- python main.py --verbose
100
+ # Logging verbosity (canonical control surface)
101
+ python main.py --log-level debug
102
+ python main.py --log-level info
103
+ python main.py --log-level warning
104
+ python main.py --log-level error
105
+
106
+ # Bare flag resolves to info
107
+ python main.py --log-level
102
108
 
103
- # Quiet mode (minimal output)
109
+ # Legacy flags (no longer supported; use --log-level instead)
110
+ # The following will fail with "unrecognized arguments" errors:
111
+ python main.py --verbose
104
112
  python main.py --quiet
105
113
 
114
+ # Share diagnostics with support (console-based, no archive artifacts)
115
+ python main.py --log-level debug --prompts-file samples/example_prompts.json
116
+
106
117
  # Get help and see all options
107
118
  python main.py --help
108
119
 
109
120
  # Specify / override the Agent ID (takes precedence over M365_AGENT_ID env var)
110
121
  python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
111
-
112
- # Citation format options
113
- python main.py --citation-format oai_unicode # Default: New OAI format
114
- python main.py --citation-format legacy_bracket # Old [^i^] format
115
122
  ```
116
123
 
117
124
  #### File Format Examples
@@ -6,7 +6,7 @@ msal[broker]>=1.34,<2
6
6
  msal-extensions>=1.3.1
7
7
  packaging>=20.0
8
8
  PyJWT>=2.11.0
9
- python-dotenv==1.1.1
9
+ python-dotenv==1.2.2
10
10
  markdown==3.8.2
11
11
  promptflow>=1.18.1
12
12
  questionary>=2.1.1
@@ -27,14 +27,20 @@ Date: September 21, 2025
27
27
 
28
28
  import json
29
29
  import logging
30
- from typing import Dict, List, Any, Optional
30
+ from typing import Dict, List, Any, Optional, Tuple
31
31
  from datetime import datetime
32
32
  from enum import Enum
33
+ from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel
33
34
 
34
35
  # Configure logging
35
- logging.basicConfig(level=logging.INFO)
36
+ if not logging.getLogger().handlers:
37
+ logging.basicConfig(level=logging.INFO)
36
38
  logger = logging.getLogger(__name__)
37
39
 
40
def _log_level_to_python_level(log_level: str) -> int:
    """Translate a textual log level into a ``logging`` level number.

    A falsy ``log_level`` is treated as ``"info"``. The name is trimmed and
    lower-cased before being looked up in ``LOG_LEVEL_MAP``; names missing
    from the map fall back to ``logging.INFO``.
    """
    level_name = log_level if log_level else "info"
    lookup_key = level_name.strip().lower()
    return LOG_LEVEL_MAP.get(lookup_key, logging.INFO)
43
+
38
44
  class MessageRole(Enum):
39
45
  """Enumeration for message roles."""
40
46
  USER = "user"
@@ -71,8 +77,10 @@ class EnhancedResponseExtractor:
71
77
  "generate_response"
72
78
  }
73
79
 
74
- def __init__(self):
80
+ def __init__(self, log_level: str = "info"):
75
81
  self.tool_call_counter = 0
82
+ self.log_level = (log_level or "info").strip().lower()
83
+ logger.setLevel(_log_level_to_python_level(self.log_level))
76
84
 
77
85
  def _generate_tool_call_id(self, tool_name: str) -> str:
78
86
  """Generate a unique tool call ID."""
@@ -461,6 +469,7 @@ class EnhancedResponseExtractor:
461
469
  "metadata": {
462
470
  "conversation_id": response_data.get("conversationId"),
463
471
  "request_id": response_data.get("requestId"),
472
+ "message_id": None,
464
473
  "telemetry_available": False
465
474
  }
466
475
  }
@@ -470,6 +479,11 @@ class EnhancedResponseExtractor:
470
479
  if isinstance(response_data, dict):
471
480
  # Messages are directly in the response_data object
472
481
  messages = response_data.get("messages", [])
482
+
483
+ # Extract message_id from the last bot message in this response
484
+ bot_messages = [m for m in messages if m.get("author") != "user"]
485
+ if bot_messages and bot_messages[-1].get("messageId"):
486
+ enhanced_response["metadata"]["message_id"] = bot_messages[-1]["messageId"]
473
487
 
474
488
  # Extract telemetry tools if available
475
489
  telemetry_tools = []
@@ -526,6 +540,7 @@ class EnhancedResponseExtractor:
526
540
  "metadata": {
527
541
  "conversation_id": None,
528
542
  "request_id": None,
543
+ "message_id": None,
529
544
  "telemetry_available": False
530
545
  }
531
546
  }
@@ -552,28 +567,29 @@ class EnhancedResponseExtractor:
552
567
  "metadata": {
553
568
  "conversation_id": None,
554
569
  "request_id": None,
570
+ "message_id": None,
555
571
  "telemetry_available": False,
556
572
  "error": str(e)
557
573
  }
558
574
  }
559
575
 
560
- def extract_enhanced_responses(responses: Dict[str, str]) -> Dict[str, Dict[str, Any]]:
576
+ def extract_enhanced_response(raw_response: str, log_level: str = "info") -> Dict[str, Any]:
561
577
  """
562
- Extract enhanced response information for multiple responses.
578
+ Extract enhanced response information from a raw response string.
563
579
 
564
580
  Args:
565
- responses: Dictionary mapping prompts to raw response strings
581
+ raw_response: Raw response string from the agent
582
+ log_level: Logging level for the extraction process (default: "info")
566
583
 
567
584
  Returns:
568
- Dictionary mapping prompts to enhanced response data
585
+ A dictionary containing the enhanced response information, including:
586
+ - "response": Reconstructed message flow with tool calls and results
587
+ - "tool_definitions": List of tool definitions extracted from telemetry
588
+ - "raw_response_text": Original response text for backward compatibility
589
+ - "metadata": Additional metadata such as conversation ID, request ID, etc.
569
590
  """
570
- extractor = EnhancedResponseExtractor()
571
- enhanced_responses = {}
572
-
573
- for prompt, raw_response in responses.items():
574
- enhanced_responses[prompt] = extractor.extract_enhanced_response(raw_response)
575
-
576
- return enhanced_responses
591
+ extractor = EnhancedResponseExtractor(log_level=log_level)
592
+ return extractor.extract_enhanced_response(raw_response)
577
593
 
578
594
  def get_response_text_for_evaluation(enhanced_response: Dict[str, Any]) -> str:
579
595
  """
@@ -0,0 +1,52 @@
1
+ """Retry utilities for transient HTTP failures in evaluation flows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime, timezone
6
+ from email.utils import parsedate_to_datetime
7
+ from typing import Optional
8
+
9
# HTTP statuses treated as transient: 429 Too Many Requests,
# 503 Service Unavailable and 504 Gateway Timeout.
RETRYABLE_HTTP_STATUS_CODES = {429, 503, 504}
# Ceiling, in seconds, for the exponential backoff delay.
MAX_BACKOFF_SECONDS = 60


def is_retryable_status(status_code: Optional[int]) -> bool:
    """Tell whether an HTTP status is one of the retryable transient codes.

    ``None`` (no status available) is never retryable. Any other value is
    coerced with ``int()`` and checked against
    ``RETRYABLE_HTTP_STATUS_CODES``.
    """
    return status_code is not None and int(status_code) in RETRYABLE_HTTP_STATUS_CODES
18
+
19
+
20
def get_backoff_seconds(attempt: int) -> int:
    """Compute the exponential backoff delay for a 1-based retry attempt.

    The delay is ``2 ** attempt`` seconds (2, 4, 8 for attempts 1, 2, 3) and
    never exceeds ``MAX_BACKOFF_SECONDS``.

    Raises:
        ValueError: If ``attempt`` is smaller than 1.
    """
    if attempt < 1:
        raise ValueError("attempt must be >= 1")
    uncapped_delay = 2 ** attempt
    if MAX_BACKOFF_SECONDS < uncapped_delay:
        return MAX_BACKOFF_SECONDS
    return uncapped_delay
28
+
29
+
30
def get_retry_after_seconds(retry_after_header: Optional[str]) -> Optional[int]:
    """Parse a Retry-After header into a non-negative delay in seconds.

    Both header forms are accepted: an integer number of delay-seconds, or an
    HTTP-date, which is converted to the number of whole seconds from now.

    Args:
        retry_after_header: Raw header value, or ``None`` when absent.

    Returns:
        The delay in seconds, clamped to be at least 0 (negative numbers and
        dates in the past yield 0). ``None`` when the header is missing,
        blank, or in neither supported form.
    """
    if retry_after_header is None:
        return None

    value = retry_after_header.strip()
    if not value:
        return None

    # Form 1: delay-seconds.
    try:
        return max(0, int(value))
    except ValueError:
        pass

    # Form 2: HTTP-date. Depending on the Python version, the parser signals
    # an unparseable date with either ValueError or TypeError.
    try:
        retry_date = parsedate_to_datetime(value)
    except (ValueError, TypeError):
        return None

    # The parser returns a naive datetime for a "-0000" zone or a date with no
    # zone at all. HTTP dates are UTC, so interpret those as UTC. Subtracting
    # an aware "now" from a naive value would raise TypeError, which would
    # discard an otherwise valid header.
    if retry_date.tzinfo is None:
        retry_date = retry_date.replace(tzinfo=timezone.utc)

    now = datetime.now(timezone.utc)
    return max(0, int((retry_date - now).total_seconds()))
@@ -0,0 +1,35 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "default_evaluators": {
4
+ "Relevance": {},
5
+ "Coherence": {}
6
+ },
7
+ "items": [
8
+ {
9
+ "prompt": "What is Microsoft Graph?",
10
+ "expected_response": "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
11
+ },
12
+ {
13
+ "name": "Travel planning conversation",
14
+ "description": "Multi-turn thread testing context retention across turns",
15
+ "turns": [
16
+ {
17
+ "prompt": "I'm planning a trip to Seattle next week.",
18
+ "expected_response": "I can help you plan your Seattle trip."
19
+ },
20
+ {
21
+ "prompt": "What's the weather going to be like?",
22
+ "expected_response": "Seattle weather is typically mild with possible rain."
23
+ },
24
+ {
25
+ "prompt": "Should I bring a rain jacket?",
26
+ "expected_response": "Yes, it's always a good idea to bring rain gear to Seattle.",
27
+ "evaluators": {
28
+ "Groundedness": { "threshold": 4 }
29
+ },
30
+ "evaluators_mode": "extend"
31
+ }
32
+ ]
33
+ }
34
+ ]
35
+ }
@@ -0,0 +1,82 @@
1
+ """Per-API throttle gate support for transient HTTP 429 handling."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import threading
6
+ import time
7
+ from dataclasses import dataclass
8
+ from typing import Optional
9
+
10
+
11
@dataclass(frozen=True)
class GateState:
    """Read-only snapshot of a throttle gate, for diagnostics and tests.

    The dataclass is frozen: a snapshot describes the gate at one instant,
    and assigning to a field would not affect the gate it was taken from.
    Attribute assignment raises ``dataclasses.FrozenInstanceError``.
    """

    # Name of the API the gate protects.
    api_name: str
    # Epoch timestamp (seconds) until which the gate is blocked.
    blocked_until_epoch: float
    # Whether the block window was still open when the snapshot was taken.
    is_blocked: bool
    # Retry-after value that last extended the window, or None if unset.
    last_retry_after_seconds: Optional[int]
19
+
20
+
21
class ThrottleGate:
    """Per-API gate that makes callers wait while a retry-after window is open.

    The block deadline is kept as an epoch timestamp and every read or write
    of it happens under an internal lock.
    """

    # Upper bound on the cumulative time wait_if_blocked() may sleep.
    MAX_GATE_WAIT_SECONDS = 300.0

    def __init__(self, api_name: str) -> None:
        self.api_name = api_name
        self._lock = threading.Lock()
        self._blocked_until_epoch = 0.0
        self._last_retry_after_seconds: Optional[int] = None

    def apply_retry_after(self, retry_after_seconds: int) -> float:
        """Open or extend the block window by ``retry_after_seconds`` from now.

        Negative durations count as 0. The window only ever grows: a deadline
        earlier than the one already in force is ignored.

        Returns the blocked-until epoch in effect after the call.
        """
        seconds = max(0, int(retry_after_seconds))
        proposed_deadline = time.time() + seconds

        with self._lock:
            if self._blocked_until_epoch < proposed_deadline:
                self._blocked_until_epoch = proposed_deadline
                self._last_retry_after_seconds = seconds
            return self._blocked_until_epoch

    def wait_if_blocked(self) -> float:
        """Block the caller until the gate is open; return the seconds slept.

        The deadline is re-read after every sleep, so a window extended by a
        concurrent ``apply_retry_after`` call is honoured as well.
        Raises ``TimeoutError`` once the accumulated wait would go beyond
        ``MAX_GATE_WAIT_SECONDS``.
        """
        total_slept = 0.0
        while True:
            with self._lock:
                remaining = self._blocked_until_epoch - time.time()
            if remaining <= 0:
                # Window already elapsed (or never set): nothing left to wait for.
                return total_slept
            if total_slept + remaining > self.MAX_GATE_WAIT_SECONDS:
                raise TimeoutError(
                    f"ThrottleGate '{self.api_name}' exceeded maximum wait of "
                    f"{self.MAX_GATE_WAIT_SECONDS}s (slept {total_slept:.1f}s so far)."
                )
            time.sleep(remaining)
            total_slept += remaining

    def clear(self) -> None:
        """Drop any active block window and forget the last retry-after value."""
        with self._lock:
            self._last_retry_after_seconds = None
            self._blocked_until_epoch = 0.0

    def state(self) -> GateState:
        """Return a point-in-time ``GateState`` snapshot of the gate."""
        with self._lock:
            current_time = time.time()
            blocked_until = self._blocked_until_epoch
            return GateState(
                api_name=self.api_name,
                blocked_until_epoch=blocked_until,
                is_blocked=blocked_until > current_time,
                last_retry_after_seconds=self._last_retry_after_seconds,
            )