npm - @microsoft/m365-copilot-eval - Versions diffs - 1.2.1-preview.1 → 1.4.0-preview.1 - Mend

@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.4.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

package/README.md +140 -101
package/package.json +7 -4
package/schema/CHANGELOG.md +8 -0
package/schema/v1/eval-document.schema.json +256 -8
package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
package/schema/v1/examples/valid/comprehensive.json +27 -2
package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
package/schema/v1/examples/valid/multi-turn-output.json +59 -0
package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
package/schema/version.json +2 -2
package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
package/src/clients/cli/api_clients/REST/__init__.py +3 -0
package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
package/src/clients/cli/api_clients/__init__.py +3 -0
package/src/clients/cli/api_clients/base_agent_client.py +78 -0
package/src/clients/cli/cli_logging/__init__.py +0 -0
package/src/clients/cli/cli_logging/console_diagnostics.py +107 -0
package/src/clients/cli/cli_logging/logging_utils.py +144 -0
package/src/clients/cli/common.py +62 -0
package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
package/src/clients/cli/evaluator_resolver.py +150 -0
package/src/clients/cli/generate_report.py +347 -184
package/src/clients/cli/main.py +1288 -481
package/src/clients/cli/parallel_executor.py +57 -0
package/src/clients/cli/readme.md +14 -7
package/src/clients/cli/requirements.txt +1 -1
package/src/clients/cli/response_extractor.py +30 -14
package/src/clients/cli/retry_policy.py +52 -0
package/src/clients/cli/samples/multiturn_example.json +35 -0
package/src/clients/cli/throttle_gate.py +82 -0
package/src/clients/node-js/bin/runevals.js +134 -41
package/src/clients/node-js/config/default.js +5 -1
package/src/clients/node-js/lib/agent-id.js +12 -0
package/src/clients/node-js/lib/env-loader.js +11 -16
package/src/clients/node-js/lib/eula-manager.js +78 -0
package/src/clients/node-js/lib/progress.js +13 -11

package/src/clients/cli/parallel_executor.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""Parallel prompt execution utilities.
+This module provides a minimal reusable executor that preserves input order.
+"""
+from __future__ import annotations
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from typing import Any, Callable, Generic, Iterable, List, Optional, TypeVar
T = TypeVar("T")  # type of each input item handed to the worker
R = TypeVar("R")  # type of the value a worker returns


@dataclass
class WorkerResult(Generic[R]):
    """Outcome of one worker invocation, tagged with its input position.

    For an item whose worker finished, ``value`` carries the return value and
    ``error`` is ``None``; for an item whose worker raised an ``Exception``,
    ``error`` carries that exception and ``value`` is ``None``.
    """

    # Zero-based position of the item in the original input iterable.
    index: int
    # Return value of the worker; stays None when the worker raised.
    value: Optional[R] = None
    # Exception raised by the worker; stays None on success.
    error: Optional[Exception] = None


def execute_in_parallel(
    items: Iterable[T],
    worker: Callable[[T, int], R],
    max_workers: int,
) -> List[WorkerResult[R]]:
    """Run ``worker`` over ``items`` on a thread pool, preserving input order.

    Args:
        items: Inputs to process. The iterable is fully materialized up front.
        worker: Callable invoked as ``worker(item, index)`` where ``index`` is
            the item's zero-based position in ``items``.
        max_workers: Requested pool size. It is clamped to the range
            ``1 .. len(items)`` so the pool is never empty or oversized.

    Returns:
        One ``WorkerResult`` per input item, positioned at the item's input
        index regardless of completion order. An ``Exception`` raised by a
        worker is captured in ``WorkerResult.error`` instead of propagating.
        An empty input yields an empty list.

    Raises:
        BaseException: Non-``Exception`` failures (``KeyboardInterrupt``,
            ``SystemExit``) propagate to the caller. Before re-raising, every
            future that has not started yet is cancelled so the pool shutdown
            only waits for tasks that are already running.
    """
    indexed_items = list(enumerate(items))
    if not indexed_items:
        return []

    normalized_workers = max(1, min(max_workers, len(indexed_items)))
    # Pre-fill every slot so the list can be written by index as futures finish.
    results: List[WorkerResult[R]] = [WorkerResult(index=i) for i, _ in indexed_items]

    with ThreadPoolExecutor(max_workers=normalized_workers) as executor:
        future_map = {}
        try:
            for index, item in indexed_items:
                future_map[executor.submit(worker, item, index)] = index
            for future in as_completed(future_map):
                index = future_map[future]
                try:
                    value = future.result()
                except Exception as exc:
                    results[index] = WorkerResult(index=index, error=exc)
                else:
                    results[index] = WorkerResult(index=index, value=value)
        except BaseException:
            # Leaving the ``with`` block waits for all outstanding work. Drop
            # everything still queued so an interrupt is not held up by tasks
            # that have not even started. Running tasks cannot be cancelled.
            for pending in future_map:
                pending.cancel()
            raise
    return results

package/src/clients/cli/readme.md CHANGED Viewed

@@ -97,21 +97,28 @@ python main.py --interactive
 #### Additional Options
 ```bash
-# Verbose output (shows detailed processing steps)
-python main.py --verbose
+# Logging verbosity (canonical control surface)
+python main.py --log-level debug
+python main.py --log-level info
+python main.py --log-level warning
+python main.py --log-level error
+# Bare flag resolves to info
+python main.py --log-level
-# Quiet mode (minimal output)
+# Legacy flags (no longer supported; use --log-level instead)
+# The following will fail with "unrecognized arguments" errors:
+python main.py --verbose
 python main.py --quiet
+# Share diagnostics with support (console-based, no archive artifacts)
+python main.py --log-level debug --prompts-file samples/example_prompts.json
 # Get help and see all options
 python main.py --help
 # Specify / override the Agent ID (takes precedence over M365_AGENT_ID env var)
 python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
-# Citation format options
-python main.py --citation-format oai_unicode      # Default: New OAI format
-python main.py --citation-format legacy_bracket   # Old [^i^] format
 ```
 #### File Format Examples

package/src/clients/cli/requirements.txt CHANGED Viewed

@@ -6,7 +6,7 @@ msal[broker]>=1.34,<2
 msal-extensions>=1.3.1
 packaging>=20.0
 PyJWT>=2.11.0
-python-dotenv==1.1.1
+python-dotenv==1.2.2
 markdown==3.8.2
 promptflow>=1.18.1
 questionary>=2.1.1

package/src/clients/cli/response_extractor.py CHANGED Viewed

@@ -27,14 +27,20 @@ Date: September 21, 2025
 import json
 import logging
-from typing import Dict, List, Any, Optional
+from typing import Dict, List, Any, Optional, Tuple
 from datetime import datetime
 from enum import Enum
+from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel
 # Configure logging
-logging.basicConfig(level=logging.INFO)
+if not logging.getLogger().handlers:
+    logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
def _log_level_to_python_level(log_level: str) -> int:
    """Translate a textual log level into a numeric ``logging`` level.

    A falsy ``log_level`` is treated as ``"info"``. Surrounding whitespace and
    letter case are ignored. Names missing from ``LOG_LEVEL_MAP`` resolve to
    ``logging.INFO``.
    """
    level_name = log_level if log_level else "info"
    lookup_key = level_name.strip().lower()
    return LOG_LEVEL_MAP.get(lookup_key, logging.INFO)
 class MessageRole(Enum):
     """Enumeration for message roles."""
     USER = "user"
@@ -71,8 +77,10 @@ class EnhancedResponseExtractor:
         "generate_response"
     }
-    def __init__(self):
+    def __init__(self, log_level: str = "info"):
         self.tool_call_counter = 0
+        self.log_level = (log_level or "info").strip().lower()
+        logger.setLevel(_log_level_to_python_level(self.log_level))
     def _generate_tool_call_id(self, tool_name: str) -> str:
         """Generate a unique tool call ID."""
@@ -461,6 +469,7 @@ class EnhancedResponseExtractor:
                 "metadata": {
                     "conversation_id": response_data.get("conversationId"),
                     "request_id": response_data.get("requestId"),
+                    "message_id": None,
                     "telemetry_available": False
                 }
             }
@@ -470,6 +479,11 @@ class EnhancedResponseExtractor:
             if isinstance(response_data, dict):
                 # Messages are directly in the response_data object
                 messages = response_data.get("messages", [])
+            # Extract message_id from the last bot message in this response
+            bot_messages = [m for m in messages if m.get("author") != "user"]
+            if bot_messages and bot_messages[-1].get("messageId"):
+                enhanced_response["metadata"]["message_id"] = bot_messages[-1]["messageId"]
             # Extract telemetry tools if available
             telemetry_tools = []
@@ -526,6 +540,7 @@ class EnhancedResponseExtractor:
                 "metadata": {
                     "conversation_id": None,
                     "request_id": None,
+                    "message_id": None,
                     "telemetry_available": False
                 }
             }
@@ -552,28 +567,29 @@ class EnhancedResponseExtractor:
                 "metadata": {
                     "conversation_id": None,
                     "request_id": None,
+                    "message_id": None,
                     "telemetry_available": False,
                     "error": str(e)
                 }
             }
-def extract_enhanced_responses(responses: Dict[str, str]) -> Dict[str, Dict[str, Any]]:
+def extract_enhanced_response(raw_response: str, log_level: str = "info") -> Dict[str, Any]:
     """
-    Extract enhanced response information for multiple responses.
+    Extract enhanced response information from a raw response string.
     Args:
-        responses: Dictionary mapping prompts to raw response strings
+        raw_response: Raw response string from the agent
+        log_level: Logging level for the extraction process (default: "info")
     Returns:
-        Dictionary mapping prompts to enhanced response data
+        A dictionary containing the enhanced response information, including:
+        - "response": Reconstructed message flow with tool calls and results
+        - "tool_definitions": List of tool definitions extracted from telemetry
+        - "raw_response_text": Original response text for backward compatibility
+        - "metadata": Additional metadata such as conversation ID, request ID, etc.
     """
-    extractor = EnhancedResponseExtractor()
-    enhanced_responses = {}
-    for prompt, raw_response in responses.items():
-        enhanced_responses[prompt] = extractor.extract_enhanced_response(raw_response)
-    return enhanced_responses
+    extractor = EnhancedResponseExtractor(log_level=log_level)
+    return extractor.extract_enhanced_response(raw_response)
 def get_response_text_for_evaluation(enhanced_response: Dict[str, Any]) -> str:
     """

package/src/clients/cli/retry_policy.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""Retry utilities for transient HTTP failures in evaluation flows."""
+from __future__ import annotations
+from datetime import datetime, timezone
+from email.utils import parsedate_to_datetime
+from typing import Optional
# HTTP status codes treated as transient, i.e. worth retrying.
RETRYABLE_HTTP_STATUS_CODES = {429, 503, 504}
# Ceiling, in seconds, applied to the exponential backoff delay.
MAX_BACKOFF_SECONDS = 60


def is_retryable_status(status_code: Optional[int]) -> bool:
    """Tell whether ``status_code`` is one of the retryable HTTP statuses.

    ``None`` (no status available) is never retryable. Any other value is
    coerced with ``int()`` before the membership check against
    ``RETRYABLE_HTTP_STATUS_CODES``.
    """
    return status_code is not None and int(status_code) in RETRYABLE_HTTP_STATUS_CODES
def get_backoff_seconds(attempt: int) -> int:
    """Compute the exponential backoff delay for a retry attempt.

    The delay is ``2 ** attempt`` seconds (2, 4, 8 for attempts 1, 2, 3) and
    is never larger than ``MAX_BACKOFF_SECONDS``.

    Raises:
        ValueError: If ``attempt`` is smaller than 1.
    """
    if attempt < 1:
        raise ValueError("attempt must be >= 1")
    uncapped_delay = 2 ** attempt
    if uncapped_delay > MAX_BACKOFF_SECONDS:
        return MAX_BACKOFF_SECONDS
    return uncapped_delay
def get_retry_after_seconds(retry_after_header: Optional[str]) -> Optional[int]:
    """Parse a ``Retry-After`` header into a non-negative number of seconds.

    Both forms of the header are supported: an integer number of
    delay-seconds, and an HTTP-date.

    Args:
        retry_after_header: Raw header value, or ``None`` when the header is
            absent.

    Returns:
        The number of whole seconds to wait (never negative), or ``None`` when
        the header is missing, blank, or cannot be parsed. For an HTTP-date
        the remaining time is rounded up, so waiting the returned number of
        seconds never ends before the requested instant. Dates in the past
        yield ``0``.
    """
    if retry_after_header is None:
        return None
    value = retry_after_header.strip()
    if not value:
        return None

    # delay-seconds form: a plain integer.
    try:
        return max(0, int(value))
    except ValueError:
        pass

    # HTTP-date form.
    try:
        retry_date = parsedate_to_datetime(value)
    except (ValueError, TypeError):
        # Depending on the Python version, an unparsable date surfaces as
        # either exception type.
        return None

    if retry_date.tzinfo is None:
        # A "-0000" offset produces a naive datetime that denotes UTC. Without
        # this step the subtraction below mixes naive and aware values, raises
        # TypeError, and a valid header is thrown away.
        retry_date = retry_date.replace(tzinfo=timezone.utc)

    remaining = (retry_date - datetime.now(timezone.utc)).total_seconds()
    if remaining <= 0:
        return 0
    whole_seconds = int(remaining)
    # Round up: truncating would allow a retry just before the requested date.
    if whole_seconds < remaining:
        whole_seconds += 1
    return whole_seconds

package/src/clients/cli/samples/multiturn_example.json ADDED Viewed

@@ -0,0 +1,35 @@
+{
+  "schemaVersion": "1.2.0",
+  "default_evaluators": {
+    "Relevance": {},
+    "Coherence": {}
+  },
+  "items": [
+    {
+      "prompt": "What is Microsoft Graph?",
+      "expected_response": "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
+    },
+    {
+      "name": "Travel planning conversation",
+      "description": "Multi-turn thread testing context retention across turns",
+      "turns": [
+        {
+          "prompt": "I'm planning a trip to Seattle next week.",
+          "expected_response": "I can help you plan your Seattle trip."
+        },
+        {
+          "prompt": "What's the weather going to be like?",
+          "expected_response": "Seattle weather is typically mild with possible rain."
+        },
+        {
+          "prompt": "Should I bring a rain jacket?",
+          "expected_response": "Yes, it's always a good idea to bring rain gear to Seattle.",
+          "evaluators": {
+            "Groundedness": { "threshold": 4 }
+          },
+          "evaluators_mode": "extend"
+        }
+      ]
+    }
+  ]
+}

package/src/clients/cli/throttle_gate.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Per-API throttle gate support for transient HTTP 429 handling."""
+from __future__ import annotations
+import threading
+import time
+from dataclasses import dataclass
+from typing import Optional
@dataclass(frozen=True)
class GateState:
    """Read-only, point-in-time snapshot of a throttle gate.

    Intended for diagnostics and tests. Instances are frozen so a snapshot
    cannot be altered after it has been taken.
    """

    # Name of the API the gate guards.
    api_name: str
    # Epoch timestamp (seconds) until which the gate is blocked.
    blocked_until_epoch: float
    # Whether the block window was still active when the snapshot was taken.
    is_blocked: bool
    # Most recently applied retry-after duration, or None if there is none.
    last_retry_after_seconds: Optional[int]
class ThrottleGate:
    """Thread-safe gate for one API that holds callers back during a block window."""

    # Upper bound on the cumulative time one wait_if_blocked() call may sleep.
    MAX_GATE_WAIT_SECONDS = 300.0

    def __init__(self, api_name: str) -> None:
        self.api_name = api_name
        self._lock = threading.Lock()
        self._blocked_until_epoch = 0.0
        self._last_retry_after_seconds: Optional[int] = None

    def apply_retry_after(self, retry_after_seconds: int) -> float:
        """Register a retry-after duration, never shortening the active window.

        Negative durations are treated as zero. The block window only moves
        forward: a shorter request leaves an existing later deadline in place.

        Returns:
            The blocked-until epoch that is in effect after this call.
        """
        seconds = max(0, int(retry_after_seconds))
        proposed_deadline = time.time() + seconds
        with self._lock:
            self._blocked_until_epoch = max(self._blocked_until_epoch, proposed_deadline)
            self._last_retry_after_seconds = seconds
            return self._blocked_until_epoch

    def _remaining_block_seconds(self) -> float:
        """Return how many seconds of the block window are left (0.0 if none)."""
        with self._lock:
            return max(0.0, self._blocked_until_epoch - time.time())

    def wait_if_blocked(self) -> float:
        """Block the caller until the gate is open.

        The remaining window is read again after every sleep, so an extension
        made by a concurrent ``apply_retry_after`` call is honored as well.

        Returns:
            The total number of seconds spent sleeping.

        Raises:
            TimeoutError: If sleeping for the remaining window would push the
                total wait beyond ``MAX_GATE_WAIT_SECONDS``.
        """
        slept = 0.0
        remaining = self._remaining_block_seconds()
        while remaining > 0:
            if slept + remaining > self.MAX_GATE_WAIT_SECONDS:
                raise TimeoutError(
                    f"ThrottleGate '{self.api_name}' exceeded maximum wait of "
                    f"{self.MAX_GATE_WAIT_SECONDS}s (slept {slept:.1f}s so far)."
                )
            time.sleep(remaining)
            slept += remaining
            remaining = self._remaining_block_seconds()
        return slept

    def clear(self) -> None:
        """Open the gate and forget the last retry-after duration."""
        with self._lock:
            self._last_retry_after_seconds = None
            self._blocked_until_epoch = 0.0

    def state(self) -> GateState:
        """Return a snapshot of the gate's current state."""
        with self._lock:
            checked_at = time.time()
            deadline = self._blocked_until_epoch
            return GateState(
                api_name=self.api_name,
                blocked_until_epoch=deadline,
                is_blocked=deadline > checked_at,
                last_retry_after_seconds=self._last_retry_after_seconds,
            )