npm - @microsoft/m365-copilot-eval - Versions diffs - 1.3.0-preview.1 → 1.5.0-preview.1 - Mend

@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)

package/README.md +135 -100
package/package.json +7 -4
package/schema/CHANGELOG.md +7 -0
package/schema/v1/eval-document.schema.json +143 -11
package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
package/schema/v1/examples/valid/multi-turn-output.json +59 -0
package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
package/schema/version.json +2 -2
package/src/clients/cli/agent_selector.py +74 -0
package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
package/src/clients/cli/api_clients/__init__.py +3 -0
package/src/clients/cli/api_clients/base_agent_client.py +77 -0
package/src/clients/cli/cli_args.py +136 -0
package/src/clients/cli/cli_logging/cli_logger.py +33 -0
package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
package/src/clients/cli/cli_logging/logging_utils.py +0 -1
package/src/clients/cli/common.py +64 -0
package/src/clients/cli/env_validator.py +73 -0
package/src/clients/cli/evaluation_runner.py +653 -0
package/src/clients/cli/evaluator_resolver.py +9 -6
package/src/clients/cli/generate_report.py +272 -129
package/src/clients/cli/main.py +157 -1174
package/src/clients/cli/parallel_executor.py +57 -0
package/src/clients/cli/prompt_loader.py +148 -0
package/src/clients/cli/readme.md +9 -53
package/src/clients/cli/requirements.txt +1 -1
package/src/clients/cli/response_extractor.py +4 -603
package/src/clients/cli/result_writer.py +488 -0
package/src/clients/cli/retry_policy.py +52 -0
package/src/clients/cli/samples/multiturn_example.json +35 -0
package/src/clients/cli/throttle_gate.py +82 -0
package/src/clients/node-js/bin/runevals.js +82 -20
package/src/clients/node-js/config/default.js +12 -11
package/src/clients/node-js/lib/agent-id.js +12 -0
package/src/clients/node-js/lib/env-loader.js +14 -20
package/src/clients/node-js/lib/eula-manager.js +78 -0
package/src/clients/node-js/lib/progress.js +13 -11

package/src/clients/cli/response_extractor.py CHANGED Viewed

@@ -1,607 +1,8 @@
-"""
-Enhanced Response Extraction Module
+"""Response text extraction for evaluation."""
-This module provides functionality for extracting detailed response information including
-tool calls, tool results, and message flow reconstruction from agent responses.
+from typing import Any, Dict
-Key Components:
-- Message flow reconstruction
-- Tool invocation parsing
-- Enhanced response data structure
-- Tool definition extraction (placeholder for future implementation)
-Author: GitHub Copilot
-Date: September 21, 2025
-"""
-import json
-import logging
-from typing import Dict, List, Any, Optional, Tuple
-from datetime import datetime
-from enum import Enum
-from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel
-# Configure logging
-if not logging.getLogger().handlers:
-    logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-def _log_level_to_python_level(log_level: str) -> int:
-    normalized = (log_level or "info").strip().lower()
-    return LOG_LEVEL_MAP.get(normalized, logging.INFO)
-class MessageRole(Enum):
-    """Enumeration for message roles."""
-    USER = "user"
-    ASSISTANT = "assistant"
-    TOOL = "tool"
-class ContentType(Enum):
-    """Enumeration for content types."""
-    TEXT = "text"
-    TOOL_CALL = "tool_call"
-    TOOL_RESULT = "tool_result"
-class ToolStatus(Enum):
-    """Enumeration for tool invocation status."""
-    SUCCESS = "Success"
-    FAILURE = "Failure"
-class MessageType(Enum):
-    """Enumeration for different message types in conversations."""
-    USER = "user"
-    BOT = "bot"
-    INTERNAL = "Internal"
-    INTERNAL_SEARCH = "InternalSearchResult"
-class EnhancedResponseExtractor:
-    """Enhanced extractor for detailed response information."""
-    # List of internal tool names that should be filtered out
-    INTERNAL_TOOLS = {
-        "hydrate_tool_response",
-        "meta_prioritize",
-        "reason",
-        "generate_express_response",
-        "generate_response"
-    }
-    def __init__(self, log_level: str = "info"):
-        self.tool_call_counter = 0
-        self.log_level = (log_level or "info").strip().lower()
-        logger.setLevel(_log_level_to_python_level(self.log_level))
-    def _generate_tool_call_id(self, tool_name: str) -> str:
-        """Generate a unique tool call ID."""
-        self.tool_call_counter += 1
-        timestamp = datetime.now().strftime("%Y%m%d")
-        return f"tool_call_{timestamp}_{self.tool_call_counter:03d}_{tool_name}"
-    def _is_internal_tool(self, tool_name: str) -> bool:
-        """
-        Check if a tool is an internal tool that should be filtered out.
-        Args:
-            tool_name: Name of the tool
-        Returns:
-            bool: True if internal tool, False otherwise
-        """
-        return tool_name in self.INTERNAL_TOOLS
-    def _is_tool_message(self, message: Dict[str, Any]) -> bool:
-        """
-        Check if a message represents a tool invocation (excluding internal tools).
-        Args:
-            message: Message dictionary
-        Returns:
-            bool: True if tool message and not internal tool, False otherwise
-        """
-        # First check if it's a tool message at all
-        is_tool = (
-            (message.get("messageType") == MessageType.INTERNAL.value and
-             message.get("contentOrigin") == "OpenAPI-spec") or
-            message.get("messageType") == MessageType.INTERNAL_SEARCH.value or
-            (message.get("messageType") == MessageType.INTERNAL.value and
-             message.get("invocation") is not None)
-        )
-        if not is_tool:
-            return False
-        # Check if it's an internal tool that should be filtered out
-        invocation_str = message.get("invocation", "")
-        if invocation_str:
-            tool_info = self._parse_tool_invocation(invocation_str)
-            tool_name = tool_info.get("name", "")
-            if self._is_internal_tool(tool_name):
-                return False
-        return True
-    def _parse_tool_invocation(self, invocation_str: str) -> Dict[str, Any]:
-        """
-        Parse tool invocation string to extract tool name and arguments.
-        Args:
-            invocation_str: Tool invocation string
-        Returns:
-            Dict containing tool name and arguments
-        """
-        try:
-            # Handle Flux v3 format (JSON array)
-            if invocation_str.startswith('['):
-                invocation_data = json.loads(invocation_str)
-                if isinstance(invocation_data, list) and len(invocation_data) > 0:
-                    func_data = invocation_data[0].get("function", {})
-                    tool_name = func_data.get("name", "unknown_tool")
-                    arguments = json.loads(func_data.get("arguments", "{}"))
-                    return {"name": tool_name, "arguments": arguments}
-            # Handle standard format: tool_name(arg1="value1", arg2="value2")
-            if "(" in invocation_str and ")" in invocation_str:
-                tool_name = invocation_str.split("(")[0].strip()
-                args_str = invocation_str[invocation_str.find("(")+1:invocation_str.rfind(")")]
-                # Parse arguments
-                arguments = {}
-                if args_str.strip():
-                    # Simple parsing for key="value" format
-                    import re
-                    matches = re.findall(r'(\w+)=(["\'])(.*?)\2', args_str)
-                    for match in matches:
-                        key, _, value = match
-                        arguments[key] = value
-                return {"name": tool_name, "arguments": arguments}
-            # Fallback: treat as tool name without arguments
-            return {"name": invocation_str.strip(), "arguments": {}}
-        except Exception as e:
-            logger.warning(f"Failed to parse tool invocation '{invocation_str}': {e}")
-            return {"name": "unknown_tool", "arguments": {}}
-    def _extract_tool_results(self, message: Dict[str, Any]) -> Any:
-        """
-        Extract tool results from a message.
-        Args:
-            message: Message dictionary
-        Returns:
-            Tool results or None if extraction fails
-        """
-        try:
-            text = message.get("text", "")
-            if text:
-                result_data = json.loads(text)
-                # Check for search metadata errors
-                if isinstance(result_data, dict):
-                    search_metadata = result_data.get("searchMetadata", {})
-                    if "error" in search_metadata.get("status", ""):
-                        return {"error": search_metadata.get("status", "Unknown error")}
-                    # Return results or the whole object
-                    return result_data.get("results", result_data)
-                return result_data
-        except json.JSONDecodeError:
-            # Return raw text if not JSON
-            return message.get("text", "")
-        except Exception as e:
-            logger.warning(f"Failed to extract tool results: {e}")
-            return None
-        return None
-    def _extract_telemetry_tools(self, telemetry: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """
-        Extract tool invocations from telemetry data.
-        Args:
-            telemetry: Telemetry data dictionary
-        Returns:
-            List of tool invocation details
-        """
-        tools_invoked = []
-        tools_queue = []
-        for metric in telemetry.get("metrics", []):
-            service_name = metric.get("serviceName")
-            # Track tool invocations from FluxToolInvoker
-            if service_name == "FluxToolInvoker" and metric.get("status") == ToolStatus.SUCCESS.value:
-                try:
-                    output = json.loads(metric["output"])
-                    for item in output:
-                        invocation = item.get("invocation", "")
-                        # Parse tool name to check if it's internal
-                        tool_info = self._parse_tool_invocation(invocation)
-                        tool_name = tool_info.get("name", "")
-                        # Skip internal tools
-                        if not self._is_internal_tool(tool_name):
-                            tools_queue.append(invocation)
-                except (json.JSONDecodeError, KeyError):
-                    continue
-            # Track tool results from ExtensionRunner
-            elif service_name in ["ExtensionRunner:ext:OpenAPI-spec", "ExtensionRunner:ext:enterprise-search"]:
-                if tools_queue:
-                    invocation_str = tools_queue.pop(0)
-                    tool_info = self._parse_tool_invocation(invocation_str)
-                    tool_data = {
-                        "invocation": invocation_str,
-                        "tool_name": tool_info["name"],
-                        "arguments": tool_info["arguments"],
-                        "status": metric.get("status", ToolStatus.FAILURE.value),
-                        "results": None
-                    }
-                    if metric.get("status") == ToolStatus.SUCCESS.value:
-                        try:
-                            api_response = json.loads(metric.get("output", "")).get("responses", [])
-                            if api_response:
-                                response_data = json.loads(api_response[0].get("text", ""))
-                                # Check for errors
-                                search_metadata = response_data.get("searchMetadata", {})
-                                if "error" in search_metadata.get("status", ""):
-                                    tool_data["status"] = ToolStatus.FAILURE.value
-                                    tool_data["results"] = {"error": search_metadata.get("status", "")}
-                                else:
-                                    tool_data["results"] = response_data.get("results", response_data)
-                        except Exception as e:
-                            logger.warning(f"Failed to parse tool results from telemetry: {e}")
-                            tool_data["status"] = ToolStatus.FAILURE.value
-                    tools_invoked.append(tool_data)
-        return tools_invoked
-    def _extract_tool_definitions(self, telemetry: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """
-        Extract tool definitions from telemetry data.
-        Args:
-            telemetry: Telemetry data dictionary
-        Returns:
-            List of tool definition dictionaries
-        """
-        tool_definitions = []
-        for metric in telemetry.get("metrics", []):
-            service_name = metric.get("serviceName")
-            # Look for DeepLeoImprovedNetworking service with function invocation
-            if (service_name == "DeepLeoImprovedNetworking" and
-                metric.get("output", "").startswith("CallTags: fluxv3:invokingfunction,")):
-                try:
-                    # Parse the input field which contains the tool definitions
-                    input_str = metric.get("input", "")
-                    if input_str:
-                        input_data = json.loads(input_str)
-                        tools = input_data.get("tools", [])
-                        if tools and isinstance(tools, list):
-                            # Add tools to our definitions list (avoid duplicates and filter out internal tools)
-                            for tool in tools:
-                                # Check if this tool is internal
-                                tool_name = tool.get("function", {}).get("name", "")
-                                if not self._is_internal_tool(tool_name) and tool not in tool_definitions:
-                                    tool_definitions.append(tool.get("function", {}))
-                            logger.info(f"Extracted {len(tools)} tool definitions from telemetry")
-                except json.JSONDecodeError as e:
-                    logger.warning(f"Failed to parse tool definitions from telemetry input: {e}")
-                except Exception as e:
-                    logger.warning(f"Error extracting tool definitions: {e}")
-        return tool_definitions
-    def _reconstruct_message_flow(self, messages: List[Dict[str, Any]], telemetry_tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """
-        Reconstruct the message flow including tool calls and results.
-        Args:
-            messages: Original messages from response
-            telemetry_tools: Tool invocations from telemetry
-        Returns:
-            List of reconstructed messages
-        """
-        reconstructed = []
-        telemetry_tool_index = 0
-        for message in messages:
-            timestamp = message.get("createdAt", datetime.now().isoformat() + "Z")
-            run_id = message.get("conversationId", "0")
-            author = message.get("author", "")
-            # Skip user messages entirely - we only want the agent's response flow
-            if author == MessageType.USER.value:
-                continue
-            # Handle tool messages
-            elif self._is_tool_message(message):
-                invocation_str = message.get("invocation", "")
-                tool_info = self._parse_tool_invocation(invocation_str)
-                tool_call_id = self._generate_tool_call_id(tool_info["name"])
-                # Add tool call message
-                reconstructed.append({
-                    "createdAt": timestamp,
-                    "run_id": run_id,
-                    "role": MessageRole.ASSISTANT.value,
-                    "content": [
-                        {
-                            "type": ContentType.TOOL_CALL.value,
-                            "tool_call_id": tool_call_id,
-                            "name": tool_info["name"],
-                            "arguments": tool_info["arguments"]
-                        }
-                    ]
-                })
-                # Add tool result message
-                results = self._extract_tool_results(message)
-                reconstructed.append({
-                    "createdAt": timestamp,
-                    "run_id": run_id,
-                    "role": MessageRole.TOOL.value,
-                    "tool_call_id": tool_call_id,
-                    "content": [
-                        {
-                            "type": ContentType.TOOL_RESULT.value,
-                            ContentType.TOOL_RESULT.value: results
-                        }
-                    ]
-                })
-            # Handle bot response messages
-            elif author == MessageType.BOT.value and "messageType" not in message:
-                # Check if we have unused telemetry tools to add before the final response
-                while telemetry_tool_index < len(telemetry_tools):
-                    tool = telemetry_tools[telemetry_tool_index]
-                    tool_call_id = self._generate_tool_call_id(tool["tool_name"])
-                    # Add tool call
-                    reconstructed.append({
-                        "createdAt": timestamp,
-                        "run_id": run_id,
-                        "role": MessageRole.ASSISTANT.value,
-                        "content": [
-                            {
-                                "type": ContentType.TOOL_CALL.value,
-                                "tool_call_id": tool_call_id,
-                                "name": tool["tool_name"],
-                                "arguments": tool["arguments"]
-                            }
-                        ]
-                    })
-                    # Add tool result
-                    reconstructed.append({
-                        "createdAt": timestamp,
-                        "run_id": run_id,
-                        "role": MessageRole.TOOL.value,
-                        "tool_call_id": tool_call_id,
-                        "content": [
-                            {
-                                "type": ContentType.TOOL_RESULT.value,
-                                ContentType.TOOL_RESULT.value: tool["results"]
-                            }
-                        ]
-                    })
-                    telemetry_tool_index += 1
-                # Add final assistant response
-                reconstructed.append({
-                    "createdAt": timestamp,
-                    "run_id": run_id,
-                    "role": MessageRole.ASSISTANT.value,
-                    "content": [
-                        {
-                            "type": ContentType.TEXT.value,
-                            "text": message.get("text", "")
-                        }
-                    ]
-                })
-        return reconstructed
-    def extract_enhanced_response(self, raw_response: str) -> Dict[str, Any]:
-        """
-        Extract enhanced response information from raw agent response.
-        Args:
-            raw_response: Raw response string from agent
-        Returns:
-            Dict containing enhanced response data
-        """
-        try:
-            # Parse the raw response
-            response_data = json.loads(raw_response)
-            # Extract basic response text (fallback to original behavior)
-            response_text = ""
-            if isinstance(response_data, dict):
-                # Look for bot response in messages array
-                messages_for_text = response_data.get("messages", [])
-                for message in messages_for_text:
-                    if (message.get("author") == "bot" and
-                        "messageType" not in message and
-                        message.get("text")):
-                        response_text = message.get("text", "").strip()
-                        break
-            if not response_text:
-                response_text = raw_response.strip()
-            # Initialize enhanced structure
-            enhanced_response = {
-                "response": [],  # Will contain reconstructed message flow
-                "tool_definitions": [],  # Placeholder for future implementation
-                "raw_response_text": response_text,  # Backward compatibility
-                "metadata": {
-                    "conversation_id": response_data.get("conversationId"),
-                    "request_id": response_data.get("requestId"),
-                    "message_id": None,
-                    "telemetry_available": False
-                }
-            }
-            # Extract messages if available
-            messages = []
-            if isinstance(response_data, dict):
-                # Messages are directly in the response_data object
-                messages = response_data.get("messages", [])
-            # Extract message_id from the last bot message in this response
-            bot_messages = [m for m in messages if m.get("author") != "user"]
-            if bot_messages and bot_messages[-1].get("messageId"):
-                enhanced_response["metadata"]["message_id"] = bot_messages[-1]["messageId"]
-            # Extract telemetry tools if available
-            telemetry_tools = []
-            tool_definitions = []
-            telemetry = response_data.get("telemetry", {})
-            if telemetry:
-                enhanced_response["metadata"]["telemetry_available"] = True
-                telemetry_tools = self._extract_telemetry_tools(telemetry)
-                tool_definitions = self._extract_tool_definitions(telemetry)
-            # Update tool definitions in the response
-            enhanced_response["tool_definitions"] = tool_definitions
-            # Reconstruct message flow
-            if messages:
-                enhanced_response["response"] = self._reconstruct_message_flow(messages, telemetry_tools)
-            else:
-                # Fallback: create simple text response
-                enhanced_response["response"] = [
-                    {
-                        "createdAt": datetime.now().isoformat() + "Z",
-                        "run_id": "0",
-                        "role": MessageRole.ASSISTANT.value,
-                        "content": [
-                            {
-                                "type": ContentType.TEXT.value,
-                                "text": response_text
-                            }
-                        ]
-                    }
-                ]
-            return enhanced_response
-        except json.JSONDecodeError:
-            # Handle non-JSON responses
-            logger.warning("Received non-JSON response, creating simple text response")
-            return {
-                "response": [
-                    {
-                        "createdAt": datetime.now().isoformat() + "Z",
-                        "run_id": "0",
-                        "role": MessageRole.ASSISTANT.value,
-                        "content": [
-                            {
-                                "type": ContentType.TEXT.value,
-                                "text": raw_response.strip()
-                            }
-                        ]
-                    }
-                ],
-                "tool_definitions": [],
-                "raw_response_text": raw_response.strip(),
-                "metadata": {
-                    "conversation_id": None,
-                    "request_id": None,
-                    "message_id": None,
-                    "telemetry_available": False
-                }
-            }
-        except Exception as e:
-            logger.error(f"Failed to extract enhanced response: {e}")
-            # Return minimal structure on error
-            return {
-                "response": [
-                    {
-                        "createdAt": datetime.now().isoformat() + "Z",
-                        "run_id": "0",
-                        "role": MessageRole.ASSISTANT.value,
-                        "content": [
-                            {
-                                "type": ContentType.TEXT.value,
-                                "text": raw_response.strip() if raw_response else "Error processing response"
-                            }
-                        ]
-                    }
-                ],
-                "tool_definitions": [],
-                "raw_response_text": raw_response.strip() if raw_response else "",
-                "metadata": {
-                    "conversation_id": None,
-                    "request_id": None,
-                    "message_id": None,
-                    "telemetry_available": False,
-                    "error": str(e)
-                }
-            }
-def extract_enhanced_responses(responses: List[Tuple[str, str]], log_level: str = "info") -> List[Dict[str, Any]]:
-    """
-    Extract enhanced response information for multiple responses.
-    Args:
-        responses: List of (prompt_text, raw_response_string) tuples, one per prompt
-                   sent to the chat API. Order and duplicates are preserved.
-    Returns:
-        List of enhanced response dicts (one per prompt, same order as input).
-    """
-    extractor = EnhancedResponseExtractor(log_level=log_level)
-    enhanced_responses = []
-    for prompt, raw_response in responses:
-        enhanced = extractor.extract_enhanced_response(raw_response)
-        enhanced_responses.append(enhanced)
-    return enhanced_responses
 def get_response_text_for_evaluation(enhanced_response: Dict[str, Any]) -> str:
-    """
-    Extract simple text response for evaluation purposes (backward compatibility).
-    Args:
-        enhanced_response: Enhanced response dictionary
-    Returns:
-        Simple text response string
-    """
-    # Use raw_response_text for backward compatibility
-    return enhanced_response.get("raw_response_text", "")
+    """Extract plain text from an agent response dict for evaluation."""
+    return enhanced_response.get("raw_response_text", "")