@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +135 -100
- package/package.json +7 -4
- package/schema/CHANGELOG.md +7 -0
- package/schema/v1/eval-document.schema.json +143 -11
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +77 -0
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
- package/src/clients/cli/cli_logging/logging_utils.py +0 -1
- package/src/clients/cli/common.py +64 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/generate_report.py +272 -129
- package/src/clients/cli/main.py +157 -1174
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +4 -603
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +82 -20
- package/src/clients/node-js/config/default.js +12 -11
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +14 -20
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
package/src/clients/cli/parallel_executor.py
@@ -0,0 +1,57 @@
+"""Parallel prompt execution utilities.
+
+This module provides a minimal reusable executor that preserves input order.
+"""
+
+from __future__ import annotations
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from typing import Any, Callable, Generic, Iterable, List, Optional, TypeVar
+
+
+T = TypeVar("T")
+R = TypeVar("R")
+
+
+@dataclass
+class WorkerResult(Generic[R]):
+    """Result model used to preserve input ordering and capture failures."""
+
+    index: int
+    value: Optional[R] = None
+    error: Optional[Exception] = None
+
+
+def execute_in_parallel(
+    items: Iterable[T],
+    worker: Callable[[T, int], R],
+    max_workers: int,
+) -> List[WorkerResult[R]]:
+    """Execute worker function in parallel while preserving input order.
+
+    `worker` receives `(item, index)` and returns a value.
+    """
+    indexed_items = list(enumerate(items))
+    if not indexed_items:
+        return []
+
+    normalized_workers = max(1, min(max_workers, len(indexed_items)))
+    results: List[WorkerResult[R]] = [WorkerResult(index=i) for i, _ in indexed_items]
+
+    with ThreadPoolExecutor(max_workers=normalized_workers) as executor:
+        future_map = {
+            executor.submit(worker, item, index): index
+            for index, item in indexed_items
+        }
+
+        for future in as_completed(future_map):
+            index = future_map[future]
+            try:
+                results[index] = WorkerResult(index=index, value=future.result())
+            except (KeyboardInterrupt, SystemExit):
+                raise
+            except Exception as exc:
+                results[index] = WorkerResult(index=index, error=exc)
+
+    return results
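The executor above surfaces per-item failures through `WorkerResult.error` and keeps results in input order even though futures complete out of order. A minimal usage sketch, assuming `parallel_executor.py` is on the import path; the `fetch_response` worker and the prompts are illustrative stand-ins, not part of the package:

```python
# Usage sketch for execute_in_parallel; fetch_response is hypothetical.
from parallel_executor import execute_in_parallel

def fetch_response(prompt: str, index: int) -> str:
    # A real worker would call the agent API here; any exception it raises
    # is captured per item instead of aborting the whole batch.
    return f"response {index}: {prompt}"

results = execute_in_parallel(
    items=["What is M365 Copilot?", "Summarize my inbox."],
    worker=fetch_response,
    max_workers=4,  # clamped internally to max(1, min(max_workers, len(items)))
)
for r in results:  # ordered by input index, not by completion time
    print(f"{r.index}: error={r.error!r} value={r.value!r}")
```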
package/src/clients/cli/prompt_loader.py
@@ -0,0 +1,148 @@
+"""Dataset loading (file, interactive)."""
+
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import questionary
+
+from cli_logging.cli_logger import emit_structured_log
+from cli_logging.logging_utils import Operation
+from common import RunConfig
+from schema_handler import DocumentUpgrader
+
+
+def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
+    """Load prompts and expected responses from a JSON file.
+
+    Supports three formats:
+    1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
+    2. Array format: [{"prompt": "...", "expected_response": "..."}]
+    3. Dict format: {"prompts": [...], "expected_responses": [...]}
+
+    For eval documents (format 1) and array format (format 2), schema validation
+    and auto-upgrade are applied via DocumentUpgrader.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
+        expected_response, and optional evaluators/evaluators_mode fields.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        # Detect if this is an eval document (has "items" key) or could be upgraded
+        is_eval_document = (
+            isinstance(data, dict) and "items" in data
+        ) or isinstance(data, list)
+
+        # Run schema validation and auto-upgrade for eval documents
+        if is_eval_document:
+            try:
+                upgrader = DocumentUpgrader()
+            except Exception as e:
+                # Schema infrastructure not available (missing files, etc.) — skip
+                emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
+                upgrader = None
+
+            if upgrader is not None:
+                result = upgrader.upgrade(Path(file_path))
+
+                if result.error:
+                    emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
+                    sys.exit(1)
+
+                if result.upgraded and result.message:
+                    emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
+
+                # Use the parsed document from the upgrade result
+                if result.document is not None:
+                    data = result.document
+
+        if isinstance(data, list):
+            # Format: [{"prompt": "...", "expected_response": "..."}, ...]
+            return data, None
+        elif isinstance(data, dict):
+            if "items" in data:
+                # Eval document format: {"schemaVersion": "...", "items": [...]}
+                return data["items"], data.get("default_evaluators")
+            else:
+                # Format: {"prompts": [...], "expected_responses": [...]}
+                prompts = data.get("prompts", [])
+                expected_responses = data.get("expected_responses", [])
+                eval_items = [
+                    {"prompt": p, "expected_response": e}
+                    for p, e in zip(prompts, expected_responses)
+                ]
+                return eval_items, None
+        else:
+            raise ValueError("Invalid file format")
+    except SystemExit:
+        raise
+    except Exception as e:
+        emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
+        sys.exit(1)
+
+
+def get_interactive_prompts() -> Tuple[List[str], List[str]]:
+    """Get prompts and expected responses interactively."""
+    prompts = []
+    expected_responses = []
+
+    print("Interactive mode: Enter your prompts and expected responses.")
+    print("Press Enter with empty prompt to finish.")
+
+    while True:
+        prompt = input(f"\nPrompt {len(prompts) + 1}: ").strip()
+        if not prompt:
+            break
+
+        expected = input(f"Expected response {len(expected_responses) + 1}: ").strip()
+
+        prompts.append(prompt)
+        expected_responses.append(expected)
+
+    if not prompts:
+        print("No prompts entered. Exiting.")
+        sys.exit(1)
+
+    return prompts, expected_responses
+
+
+def get_prompt_datasets(config: RunConfig) -> Tuple[List[Dict], Optional[Dict]]:
+    """Get prompts and expected responses based on command line arguments.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators).
+    """
+    if config.prompts:
+        if config.expected and len(config.prompts) != len(config.expected):
+            emit_structured_log(
+                "error",
+                "Number of prompts must match number of expected responses. "
+                "Update --expected values to match the prompt count.",
+            )
+            sys.exit(1)
+        expected_responses = config.expected or [""] * len(config.prompts)
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(config.prompts, expected_responses)
+        ]
+        return eval_items, None
+    elif config.prompts_file:
+        return load_prompts_from_file(config.prompts_file)
+    elif config.interactive:
+        prompts, expected_responses = get_interactive_prompts()
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None
+    else:
+        emit_structured_log(
+            "error",
+            "No prompts provided. Use --prompts, --prompts-file, or --interactive.",
+            operation=Operation.SETUP,
+        )
+        sys.exit(1)
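The loader's docstring enumerates the three on-disk shapes it accepts. A sketch of sample inputs for each, with hypothetical file names; the commented-out call assumes the CLI source directory is the working directory so `prompt_loader`'s own imports (`schema_handler`, `cli_logging`) resolve:

```python
# Illustrative datasets for the three formats load_prompts_from_file accepts.
import json

samples = {
    "eval_document.json": {  # format 1: eval document with "items"
        "schemaVersion": "1.0.0",
        "items": [{"prompt": "Summarize Q3 status", "expected_response": "A short summary"}],
    },
    "array.json": [  # format 2: bare array of items
        {"prompt": "List open tasks", "expected_response": "A task list"},
    ],
    "parallel.json": {  # format 3: parallel arrays, zipped into items on load
        "prompts": ["Draft a status mail"],
        "expected_responses": ["A short status email"],
    },
}

for name, doc in samples.items():
    with open(name, "w", encoding="utf-8") as f:
        json.dump(doc, f, indent=2)

# from prompt_loader import load_prompts_from_file
# items, default_evaluators = load_prompts_from_file("eval_document.json")
```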
package/src/clients/cli/readme.md
@@ -7,7 +7,6 @@ Current evaluation metrics:
 - Coherence (1–5)
 - Groundedness (1–5)
 - Citations (count with pass/fail based on presence)
-- Tool Call Accuracy (1-5, evaluates correct tool usage when tools are invoked)
 
 ## 📋 Prerequisites
 
@@ -28,7 +27,7 @@ pip install -r requirements.txt
 
 ### 2. Set Up Environment Variables
 
-Create a `.env` file in the `src/clients/cli` directory (or export them).
+Create a `.env` file in the `src/clients/cli` directory (or export them). Use interactive WAM auth (Windows) to authenticate with the Copilot API.
 
 ```bash
 # Azure OpenAI (evaluation models)
@@ -37,17 +36,8 @@ AZURE_AI_API_KEY="<azure-openai-key>"
 AZURE_AI_API_VERSION="2024-12-01-preview"
 AZURE_AI_MODEL_NAME="gpt-4o-mini"
 
-#
-COPILOT_API_ENDPOINT="https://substrate.office.com/m365Copilot"  # CLI appends /chat
-X_SCENARIO_HEADER="<scenario-header>"  # e.g, officeweb
-
-# Auth option A: static access token (no prompt)
-COPILOT_API_ACCESS_TOKEN="<access-token>"
-
-# Auth option B: interactive WAM auth (used if COPILOT_API_ACCESS_TOKEN is empty)
-M365_EVAL_CLIENT_ID="<app-registration-client-id>"
+# Your Tenant Id
 TENANT_ID="<aad-tenant-id>"
-COPILOT_SCOPES="https://substrate.office.com/sydney/.default"
 
 # Optional: default agent id (overridable via --m365-agent-id)
 M365_AGENT_ID="00000000-0000-0000-0000-000000000000"
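The slimmed-down variable set pairs with the new `env_validator.py` in the file list above. A minimal sketch of that kind of fail-fast check, using the variable names from this hunk; it is illustrative only and does not reproduce the package's actual validator:

```python
# Hypothetical fail-fast check for the env vars named in the README diff.
import os
import sys

REQUIRED = ["AZURE_AI_API_KEY", "AZURE_AI_API_VERSION", "AZURE_AI_MODEL_NAME", "TENANT_ID"]
OPTIONAL = ["M365_AGENT_ID"]  # overridable via --m365-agent-id

def validate_env() -> list[str]:
    """Return the names of required variables that are missing or empty."""
    return [name for name in REQUIRED if not os.environ.get(name)]

if __name__ == "__main__":
    missing = validate_env()
    if missing:
        print(f"Missing required environment variables: {', '.join(missing)}", file=sys.stderr)
        sys.exit(1)
```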
@@ -151,39 +141,6 @@ python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
 }
 ```
 
-## 🔧 Tool Call Accuracy Evaluation
-
-The CLI now includes advanced tool call accuracy evaluation that analyzes how effectively the agent uses available tools:
-
-### What It Evaluates
-- **Tool Selection**: Whether the agent chooses appropriate tools for the given task
-- **Parameter Accuracy**: Correctness of arguments passed to tool functions
-- **Tool Usage Patterns**: Overall effectiveness of tool invocation strategies
-
-### How It Works
-1. **Response Analysis**: Extracts tool calls and results from conversation telemetry
-2. **Tool Definitions**: Captures available tools from conversation metadata
-3. **Accuracy Assessment**: Uses Azure AI Evaluation SDK's ToolCallAccuracyEvaluator
-4. **Score Calculation**: Returns 1-5 score with pass/fail threshold (default: 3)
-
-### Enhanced Data Extraction
-The tool now extracts detailed information from agent responses:
-- **Message Flow**: Complete chronological sequence of tool calls and results
-- **Tool Definitions**: Available tools and their schemas
-- **Internal Filtering**: Removes framework-internal tools for cleaner analysis
-
-### Example Output
-```bash
-📊 Aggregate Statistics (3 prompts):
-════════════════════════════════════════════════════════════
-Tool Call Accuracy:
-  Pass Rate: 66.7% (2/3 passed)
-  Avg Score: 0.75
-  Threshold: 0.5
-```
-
-**Note**: Tool Call Accuracy evaluation only applies when the agent response includes tool invocations. For text-only responses, this metric will not be computed.
-
 ## 🔧 Configuration
 
 ### Getting Azure AI Foundry Configuration Values
@@ -224,7 +181,7 @@ M365-Copilot-Agent-Evals/
 │   └── clients/
 │       └── cli/
 │           ├── main.py                 # Main evaluation script
-│           ├── response_extractor.py  # Enhanced response parsing
+│           ├── response_extractor.py   # Enhanced response parsing
 │           ├── generate_report.py      # HTML report generation with aggregates
 │           ├── requirements.txt        # Python dependencies
 │           ├── readme.md               # This file
@@ -247,11 +204,10 @@ M365-Copilot-Agent-Evals/
 
 ## 📊 Features
 
-- **Chat Invocation**: Sends prompts
-- **Evaluation Metrics**: Relevance, Coherence, Groundedness, Citations
+- **Chat Invocation**: Sends prompts via the WorkIQ API
+- **Evaluation Metrics**: Relevance, Coherence, Groundedness, Citations
 - **Multi-format Citation Detection**: Supports both new OAI Unicode and legacy bracket formats
-
-- **Enhanced Response Extraction**: Detailed parsing of tool calls, results, and message flow
+- **Enhanced Response Extraction**: Detailed parsing of results, and message flow
 - **Aggregate Statistics**: Summary metrics across multiple prompts with pass/fail rates
 - **Colorized Console Output**
 - **Multiple Output Formats**: JSON, CSV, HTML with aggregate dashboards
@@ -265,7 +221,7 @@ M365-Copilot-Agent-Evals/
 
 ## 🔑 Authentication Requirements
 
--
+- WorkIQ A2A API: WAM interactive auth via `WORK_IQ_A2A_CLIENT_ID`, `TENANT_ID`, and `WORK_IQ_A2A_SCOPES`
 - Evaluators: Azure OpenAI key (`AZURE_AI_API_KEY`)
 
 ## 📚 Useful Resources
@@ -282,9 +238,9 @@ M365-Copilot-Agent-Evals/
 
 1. **Authentication Errors**: Ensure your Azure credentials are properly configured and you have access to the resources.
 
-2. **HTTP 401 / 403**:
+2. **HTTP 401 / 403**: Confirm `WORK_IQ_A2A_CLIENT_ID`, `TENANT_ID`, and `WORK_IQ_A2A_SCOPES` are correct. Re-run `--signout` to clear cached tokens and re-authenticate.
 
-3. **Endpoint Issues**: Confirm `
+3. **Endpoint Issues**: Confirm `WORK_IQ_A2A_ENDPOINT` is reachable (try `curl` / `Invoke-WebRequest`).
 
 ### Getting Help
 
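For troubleshooting item 3, a rough Python equivalent of the suggested `curl` probe, assuming `WORK_IQ_A2A_ENDPOINT` holds a full URL (sketch only; an HTTP error such as 401 still demonstrates the host is reachable):

```python
# Rough Python equivalent of `curl <endpoint>` for troubleshooting step 3.
import os
import urllib.error
import urllib.request

endpoint = os.environ.get("WORK_IQ_A2A_ENDPOINT", "")
if not endpoint:
    print("WORK_IQ_A2A_ENDPOINT is not set")
else:
    try:
        with urllib.request.urlopen(endpoint, timeout=10) as resp:
            print(f"Reachable: HTTP {resp.status}")
    except urllib.error.HTTPError as e:
        # A status code came back, so the endpoint is reachable (e.g., 401 without auth).
        print(f"Reachable: HTTP {e.code}")
    except (urllib.error.URLError, OSError) as e:
        print(f"Not reachable: {e}")
```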