@microsoft/m365-copilot-eval 1.4.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,653 @@
+"""Core evaluation pipeline — evaluator dispatch, retry, parallel execution."""
+
+import json
+import time
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple
+
+from azure.ai.evaluation import (
+    AzureOpenAIModelConfiguration,
+    RelevanceEvaluator,
+    CoherenceEvaluator,
+    GroundednessEvaluator,
+    SimilarityEvaluator,
+    ToolCallAccuracyEvaluator,
+)
+
+from api_clients.base_agent_client import BaseAgentClient
+from cli_logging.cli_logger import emit_structured_log
+from cli_logging.logging_utils import Operation
+from common import (
+    RELEVANCE,
+    COHERENCE,
+    GROUNDEDNESS,
+    SIMILARITY,
+    TOOL_CALL_ACCURACY,
+    CITATIONS,
+    EXACT_MATCH,
+    PARTIAL_MATCH,
+    REQUIRES_AZURE_OPENAI,
+    REQUIRES_TOOL_DEFINITIONS,
+    METRIC_IDS,
+    MAX_ATTEMPTS,
+    MAX_CONCURRENCY,
+    DEFAULT_PASS_THRESHOLD,
+    STATUS_PASS,
+    STATUS_FAIL,
+    STATUS_ERROR,
+    STATUS_PARTIAL,
+    STATUS_UNKNOWN,
+    MAX_TURNS_PER_THREAD,
+    LONG_THREAD_WARNING_THRESHOLD,
+    RunConfig,
+)
+from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
+from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
+from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
+from evaluator_resolver import (
+    validate_evaluator_names,
+    check_prerequisites,
+    resolve_evaluators_for_prompt,
+    get_evaluator_threshold,
+)
+from parallel_executor import execute_in_parallel
+from response_extractor import get_response_text_for_evaluation
+from retry_policy import (
+    is_retryable_status,
+    get_backoff_seconds,
+    get_retry_after_seconds,
+)
+from throttle_gate import ThrottleGate
+
+
+@dataclass
+class PipelineConfig:
+    """Runtime configuration for the evaluation pipeline."""
+    agent_client: BaseAgentClient
+    model_config: AzureOpenAIModelConfiguration
+    has_azure_openai: bool
+    default_evaluators: Dict[str, Any]
+    chat_gate: ThrottleGate = field(default_factory=lambda: ThrottleGate("chat_api"))
+    is_retryable_status: Any = field(default=is_retryable_status)
+    get_backoff_seconds: Any = field(default=get_backoff_seconds)
+
+
+class ItemType(Enum):
+    SINGLE_TURN = "single_turn"
+    MULTI_TURN = "multi_turn"
+
+
+def detect_item_type(item: dict) -> ItemType:
+    """Determine if an evaluation item is single-turn or multi-turn.
+
+    Returns ItemType.SINGLE_TURN if item has 'prompt' without 'turns',
+    ItemType.MULTI_TURN if item has 'turns' array.
+
+    Raises ValueError for invalid items (both, neither, or invalid turns).
+    """
+    has_turns = "turns" in item
+    has_prompt = "prompt" in item
+
+    if has_turns and has_prompt:
+        raise ValueError(
+            "Invalid evaluation item: cannot have both 'turns' and 'prompt'. "
+            "Use 'turns' for multi-turn threads or 'prompt' for single-turn."
+        )
+
+    if has_turns and not isinstance(item["turns"], list):
+        raise ValueError("Invalid evaluation item: 'turns' must be a list")
+
+    if has_turns:
+        if len(item["turns"]) == 0:
+            raise ValueError("Invalid multi-turn thread: 'turns' array cannot be empty")
+        return ItemType.MULTI_TURN
+
+    if has_prompt:
+        return ItemType.SINGLE_TURN
+
+    raise ValueError(
+        "Invalid evaluation item: must have either 'turns' array (multi-turn) "
+        "or 'prompt' field (single-turn)"
+    )
+
+
+def _decorate_metric(metric_id: str, data, threshold: Optional[float] = None) -> Dict[str, Any]:
+    """Augment raw evaluator output with standardized threshold + pass/fail result."""
+    pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
+    payload = {}
+    if isinstance(data, dict):
+        payload.update(data)
+    else:
+        payload['raw'] = data
+
+    score_val = None
+    if isinstance(data, dict):
+        if metric_id in data:
+            score_val = data[metric_id]
+    if isinstance(score_val, (int, float)):
+        payload['threshold'] = pass_threshold
+        payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
+    else:
+        payload['threshold'] = pass_threshold
+        payload.setdefault('result', STATUS_UNKNOWN)
+    return payload
+
+
+def _run_evaluators_for_item(
+    prompt: str,
+    actual_response: str,
+    expected_response: str,
+    enhanced_response: Dict[str, Any],
+    resolved_evaluators: Dict[str, Any],
+    model_config: AzureOpenAIModelConfiguration,
+    has_azure_openai: bool,
+    m365_agent_id: Optional[str],
+) -> Tuple[Dict[str, Optional[str]], List[str]]:
+    """Run resolved evaluators against a single item/turn.
+
+    Returns (results_dict, evaluators_ran).
+    """
+    has_tool_defs = bool(
+        m365_agent_id and enhanced_response.get("tool_definitions")
+    )
+    available_context = {
+        REQUIRES_AZURE_OPENAI: has_azure_openai,
+        REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
+    }
+
+    results_dict: Dict[str, Optional[str]] = {}
+    evaluators_ran: List[str] = []
+
+    for eval_name, eval_options in resolved_evaluators.items():
+        can_run, warn_msg = check_prerequisites(eval_name, available_context)
+        if not can_run:
+            if warn_msg:
+                emit_structured_log(
+                    "warning",
+                    f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
+                    operation=Operation.EVALUATE,
+                )
+            results_dict[eval_name] = None
+            continue
+
+        threshold = get_evaluator_threshold(eval_name, eval_options)
+
+        try:
+            if eval_name == RELEVANCE:
+                raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
+                results_dict[RELEVANCE] = _decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
+            elif eval_name == COHERENCE:
+                raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
+                results_dict[COHERENCE] = _decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
+            elif eval_name == GROUNDEDNESS:
+                raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response, context=expected_response)
+                results_dict[GROUNDEDNESS] = _decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
+            elif eval_name == SIMILARITY:
+                raw_score = SimilarityEvaluator(model_config=model_config)(query=prompt, response=actual_response, ground_truth=expected_response)
+                results_dict[SIMILARITY] = _decorate_metric(METRIC_IDS[SIMILARITY], raw_score, threshold)
+            elif eval_name == TOOL_CALL_ACCURACY:
+                raw_score = ToolCallAccuracyEvaluator(model_config)(
+                    query=prompt,
+                    response=enhanced_response.get("response", actual_response),
+                    tool_definitions=enhanced_response.get("tool_definitions", []),
+                )
+                results_dict[TOOL_CALL_ACCURACY] = _decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
+            elif eval_name == CITATIONS:
+                fmt_str = eval_options.get("citation_format", "oai_unicode")
+                fmt_map = {
+                    "oai_unicode": CitationFormat.OAI_UNICODE,
+                    "bracket": CitationFormat.LEGACY_BRACKET,
+                    "mixed": CitationFormat.AUTO,
+                }
+                raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response)
+                results_dict[CITATIONS] = _decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
+            elif eval_name == EXACT_MATCH:
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
+                # ExactMatch is binary — the evaluator already sets 'result'
+                # so _decorate_metric (which computes result from score vs threshold) is not needed.
+                results_dict[EXACT_MATCH] = raw_score
+            elif eval_name == PARTIAL_MATCH:
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
+                results_dict[PARTIAL_MATCH] = _decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
+
+            evaluators_ran.append(eval_name)
+        except Exception as e:
+            emit_structured_log(
+                "error",
+                f"Evaluator '{eval_name}' crashed and will be omitted from results: {e}",
+                operation=Operation.EVALUATE,
+            )
+            results_dict[eval_name] = None
+
+    return results_dict, evaluators_ran
+
+
+def _check_all_passed(results_dict: Dict[str, Optional[Dict[str, Any]]]) -> bool:
+    """Check if all evaluator results passed. Skipped evaluators (None) are ignored."""
+    for result_data in results_dict.values():
+        if result_data is None:
+            continue
+        if result_data.get("result") == STATUS_FAIL:
+            return False
+    return True
+
+
+def _evaluate_multi_turn_responses(
+    turns: List[Dict],
+    m365_agent_id: Optional[str],
+    effective_log_level: str,
+    default_evaluators: Dict[str, Any],
+    model_config: AzureOpenAIModelConfiguration,
+    has_azure_openai: bool,
+) -> Tuple[List[Dict], Dict]:
+    """Run per-turn evaluations and build evaluated turn results with summary.
+
+    Returns:
+        Tuple of (evaluated_turns, summary). Each evaluated turn contains
+        prompt, response, expected_response, status, evaluators_ran, results,
+        and optionally error. Does not mutate the input turns.
+    """
+    evaluated_turns: List[Dict] = []
+    turns_passed = 0
+    turns_failed = 0
+
+    for i, turn in enumerate(turns):
+        evaluated_turn: Dict[str, Any] = {
+            "prompt": turn.get("prompt", ""),
+        }
+        if "expected_response" in turn:
+            evaluated_turn["expected_response"] = turn["expected_response"]
+        if "response" in turn:
+            evaluated_turn["response"] = turn["response"]
+        if "evaluators" in turn:
+            evaluated_turn["evaluators"] = turn["evaluators"]
+        if "evaluators_mode" in turn:
+            evaluated_turn["evaluators_mode"] = turn["evaluators_mode"]
+
+        if turn.get("status") == STATUS_ERROR:
+            evaluated_turn["status"] = STATUS_ERROR
+            evaluated_turn["error"] = turn.get("error", "")
+            turns_failed += 1
+            evaluated_turns.append(evaluated_turn)
+            continue
+
+        enhanced_response = turn.get("_enhanced_response", {})
+        actual_response = get_response_text_for_evaluation(enhanced_response)
+
+        resolved = resolve_evaluators_for_prompt(
+            turn.get("evaluators"), turn.get("evaluators_mode", "extend"),
+            turn.get("prompt", ""), default_evaluators,
+        )
+
+        results_dict, evaluators_ran = _run_evaluators_for_item(
+            turn.get("prompt", ""), actual_response, turn.get("expected_response", ""),
+            enhanced_response, resolved, model_config, has_azure_openai, m365_agent_id,
+        )
+
+        all_passed = _check_all_passed(results_dict)
+
+        evaluated_turn["results"] = results_dict
+        evaluated_turn["evaluators_ran"] = evaluators_ran
+        evaluated_turn["status"] = STATUS_PASS if all_passed else STATUS_FAIL
+
+        if effective_log_level == "debug":
+            emit_structured_log(
+                "debug",
+                f"Evaluation completed for turn {i + 1} prompt='{turn.get('prompt', '')}'. "
+                f"Evaluators: {', '.join(evaluators_ran)}. "
+                f"Scores: {results_dict}",
+                operation=Operation.EVALUATE,
+            )
+
+        if all_passed:
+            turns_passed += 1
+        else:
+            turns_failed += 1
+
+        evaluated_turns.append(evaluated_turn)
+
+    turns_total = len(turns)
+    if turns_passed == turns_total:
+        overall_status = STATUS_PASS
+    elif turns_failed == turns_total:
+        overall_status = STATUS_FAIL
+    else:
+        overall_status = STATUS_PARTIAL
+
+    summary = {
+        "turns_total": turns_total,
+        "turns_passed": turns_passed,
+        "turns_failed": turns_failed,
+        "overall_status": overall_status,
+    }
+
+    return evaluated_turns, summary
+
+
+def _evaluate_single_response(
+    enhanced_response: Dict[str, Any],
+    eval_item: Dict,
+    m365_agent_id: Optional[str],
+    effective_log_level: str,
+    model_config: AzureOpenAIModelConfiguration,
+    has_azure_openai: bool,
+    default_evaluators: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Run all evaluators for a single prompt/response pair and return the result dict."""
+    actual_response_text = get_response_text_for_evaluation(enhanced_response)
+    prompt = eval_item.get("prompt", "")
+    expected_response = eval_item.get("expected_response", "")
+
+    resolved = resolve_evaluators_for_prompt(
+        eval_item.get("evaluators"), eval_item.get("evaluators_mode", "extend"),
+        prompt, default_evaluators,
+    )
+
+    results_dict, evaluators_ran = _run_evaluators_for_item(
+        prompt, actual_response_text, expected_response, enhanced_response,
+        resolved, model_config, has_azure_openai, m365_agent_id,
+    )
+
+    evaluation_result = {
+        "prompt": prompt,
+        "response": enhanced_response.get(
+            "display_response_text", actual_response_text
+        ),
+        "expected_response": expected_response,
+        "evaluators_ran": evaluators_ran,
+        "results": results_dict,
+    }
+
+    if "evaluators" in eval_item:
+        evaluation_result["evaluators"] = eval_item["evaluators"]
+    if "evaluators_mode" in eval_item:
+        evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
+
+    if effective_log_level == "debug":
+        emit_structured_log(
+            "debug",
+            f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
+            f"Evaluators: {', '.join(evaluators_ran)}. "
+            f"Scores: {evaluation_result['results']}",
+            operation=Operation.EVALUATE,
+        )
+
+    return evaluation_result
+
+
+def get_effective_worker_count(prompt_count: int, concurrency: int) -> int:
+    """Compute safe worker count for prompt processing."""
+    if prompt_count <= 0:
+        return 1
+
+    try:
+        requested_int = int(concurrency)
+    except (TypeError, ValueError):
+        requested_int = MAX_CONCURRENCY
+
+    bounded = max(1, min(requested_int, MAX_CONCURRENCY))
+    return min(bounded, prompt_count)
+
+
+def run_pipeline(
+    pipeline: PipelineConfig,
+    eval_items: List[Dict],
+    config: RunConfig,
+) -> List[Dict[str, Any]]:
+    """Run the full evaluation pipeline: send prompts and evaluate responses in parallel.
+
+    Each worker processes one prompt end-to-end: send → evaluate.
+    Results are returned in original prompt order (FR-006).
+    """
+    # Validate all evaluator names upfront before dispatching workers
+    all_evaluator_maps = [pipeline.default_evaluators]
+    for eval_item in eval_items:
+        if "evaluators" in eval_item:
+            all_evaluator_maps.append(eval_item["evaluators"])
+        for turn in eval_item.get("turns", []):
+            if "evaluators" in turn:
+                all_evaluator_maps.append(turn["evaluators"])
+    for emap in all_evaluator_maps:
+        validate_evaluator_names(emap)
+
+    # Validate all items upfront and classify types before dispatching workers
+    item_types: List[ItemType] = []
+    for idx, eval_item in enumerate(eval_items):
+        try:
+            item_type = detect_item_type(eval_item)
+        except ValueError as e:
+            raise ValueError(f"Invalid evaluation item at index {idx}: {e}") from e
+        if item_type == ItemType.MULTI_TURN:
+            turn_count = len(eval_item["turns"])
+            if turn_count > MAX_TURNS_PER_THREAD:
+                raise ValueError(
+                    f"Invalid evaluation item at index {idx}: 'turns' array has "
+                    f"{turn_count} items (max {MAX_TURNS_PER_THREAD})"
+                )
+        item_types.append(item_type)
+
+    total = len(eval_items)
+    worker_count = get_effective_worker_count(total, config.concurrency)
+
+    multi_turn_count = sum(1 for t in item_types if t == ItemType.MULTI_TURN)
+    single_turn_count = total - multi_turn_count
+
+    emit_structured_log(
+        "info",
+        f"Running pipeline with {worker_count} worker(s) for {total} item(s) "
+        f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
+        operation=Operation.EVALUATE,
+    )
+
+    def _process_item(eval_item: Dict, index: int) -> Dict[str, Any]:
+        if item_types[index] == ItemType.MULTI_TURN:
+            return _process_multi_turn(eval_item, index)
+        return _process_single_turn(eval_item, index)
+
+    def _process_single_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
+        prompt = eval_item.get("prompt", "")
+        emit_structured_log(
+            "info",
+            f"Processing item {index + 1}/{total} (single-turn).",
+            operation=Operation.SEND_PROMPT,
+        )
+
+        # Phase A: Send prompt to agent (with retry + throttle gate)
+        response = None
+        for attempt in range(1, MAX_ATTEMPTS + 1):
+            pipeline.chat_gate.wait_if_blocked()
+            try:
+                response, _ = pipeline.agent_client.send_prompt(prompt, agent_id=config.m365_agent_id)
+                break
+            except Exception as exc:
+                cause = exc.__cause__
+                status = int(getattr(cause, "code", 0) or 0) or None if cause else None
+                retry_after = get_retry_after_seconds(
+                    cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
+                )
+
+                if retry_after is not None and pipeline.is_retryable_status(status):
+                    pipeline.chat_gate.apply_retry_after(retry_after)
+
+                if not pipeline.is_retryable_status(status) or attempt >= MAX_ATTEMPTS:
+                    emit_structured_log(
+                        "error",
+                        f"Item {index + 1}/{total} failed after {attempt} attempt(s): {exc}",
+                        operation=Operation.SEND_PROMPT,
+                    )
+                    return {
+                        "prompt": prompt,
+                        "response": "",
+                        "expected_response": eval_item.get("expected_response", ""),
+                        "evaluators_ran": [],
+                        "results": {},
+                        "status": STATUS_ERROR,
+                        "errorDetails": str(exc),
+                    }
+
+                delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
+                time.sleep(delay)
+
+        # Phase B: Evaluate response
+        return _evaluate_single_response(
+            response, eval_item, config.m365_agent_id, config.effective_log_level,
+            pipeline.model_config, pipeline.has_azure_openai,
+            pipeline.default_evaluators,
+        )
+
+    def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
+        turns = eval_item["turns"]
+        thread_name = eval_item.get("name", "Unnamed thread")
+        emit_structured_log(
+            "info",
+            f"Processing item {index + 1}/{total} (multi-turn: '{thread_name}').",
+            operation=Operation.SEND_PROMPT,
+        )
+
+        if len(turns) > LONG_THREAD_WARNING_THRESHOLD:
+            emit_structured_log(
+                "warning",
+                f"Thread '{thread_name}' has {len(turns)} turns (>{LONG_THREAD_WARNING_THRESHOLD}). This may take a while.",
+                operation=Operation.SEND_PROMPT,
+            )
+
+        # Phase A: Send each turn with throttle gate + 429-only retry
+        # Multi-turn only retries on 429 (server confirmed it didn't process
+        # the request). Other transient errors (503, 504) are ambiguous about
+        # whether the server processed the turn, risking duplicate turns in
+        # the conversation if retried.
+        conversation_context = None
+        conversation_id = None
+        enriched_turns: List[Dict[str, Any]] = []
+        failed = False
+
+        for i, turn in enumerate(turns):
+            prompt = turn["prompt"]
+            emit_structured_log(
+                "debug",
+                f"Sending turn {i + 1}/{len(turns)} of '{thread_name}'.",
+                operation=Operation.SEND_PROMPT,
+            )
+
+            response = None
+            for attempt in range(1, MAX_ATTEMPTS + 1):
+                pipeline.chat_gate.wait_if_blocked()
+                try:
+                    response, conversation_context = pipeline.agent_client.send_prompt(
+                        prompt, agent_id=config.m365_agent_id,
+                        conversation_context=conversation_context,
+                    )
+                    break
+                except Exception as exc:
+                    cause = exc.__cause__
+                    status = int(getattr(cause, "code", 0) or 0) or None if cause else None
+                    retry_after = get_retry_after_seconds(
+                        cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
+                    )
+
+                    # Only retry on 429 — server confirmed it didn't process the request
+                    if status == 429 and attempt < MAX_ATTEMPTS:
+                        if retry_after is not None:
+                            pipeline.chat_gate.apply_retry_after(retry_after)
+                        delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
+                        time.sleep(delay)
+                        continue
+
+                    # All other errors: stop the thread
+                    emit_structured_log(
+                        "error",
+                        f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
+                        operation=Operation.SEND_PROMPT,
+                    )
+                    failed = True
+                    break
+
+            if failed:
+                # Mark this turn and all remaining turns as error
+                enriched_turns.append({
+                    **turn,
+                    "response": "",
+                    "status": STATUS_ERROR,
+                    "error": "Failed to get response from agent",
+                })
+                for j in range(i + 1, len(turns)):
+                    enriched_turns.append({
+                        **turns[j],
+                        "response": "",
+                        "status": STATUS_ERROR,
+                        "error": "Skipped: preceding turn failed",
+                    })
+                break
+
+            # Enrich turn with response
+            response_text = get_response_text_for_evaluation(response)
+            enriched_turns.append({
+                **turn,
+                "response": response.get("display_response_text", response_text),
+                "_enhanced_response": response,
+            })
+
+            # Capture conversation_id from first response
+            if conversation_id is None:
+                conversation_id = response.get("metadata", {}).get("conversation_id")
+
+        # Phase B: Run per-turn evaluations
+        evaluated_turns, summary = _evaluate_multi_turn_responses(
+            enriched_turns, config.m365_agent_id, config.effective_log_level,
+            pipeline.default_evaluators,
+            model_config=pipeline.model_config,
+            has_azure_openai=pipeline.has_azure_openai,
+        )
+
+        return {
+            "type": "multi_turn",
+            "name": eval_item.get("name", ""),
+            "description": eval_item.get("description", ""),
+            "conversation_id": conversation_id or "",
+            "turns": evaluated_turns,
+            "summary": summary,
+        }
+
+    execution_results = execute_in_parallel(
+        eval_items, _process_item, max_workers=worker_count,
+    )
+
+    # Unwrap WorkerResult objects into plain dicts, with error fallback
+    ordered_results: List[Dict[str, Any]] = []
+    for wr in execution_results:
+        if wr.error:
+            idx = wr.index
+            item = eval_items[idx]
+            if item_types[idx] == ItemType.MULTI_TURN:
+                ordered_results.append({
+                    "type": "multi_turn",
+                    "name": item.get("name", ""),
+                    "turns": [
+                        {**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
+                        for t in item.get("turns", [])
+                    ],
+                    "summary": {
+                        "turns_total": len(item.get("turns", [])),
+                        "turns_passed": 0,
+                        "turns_failed": len(item.get("turns", [])),
+                        "overall_status": STATUS_FAIL,
+                    },
+                    "error": str(wr.error),
+                })
+            else:
+                ordered_results.append({
+                    "prompt": item.get("prompt", ""),
+                    "response": "",
+                    "expected_response": item.get("expected_response", ""),
+                    "evaluators_ran": [],
+                    "results": {},
+                    "status": STATUS_ERROR,
+                    "errorDetails": str(wr.error),
+                })
+        else:
+            ordered_results.append(wr.value)
+
+    return ordered_results
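A minimal usage sketch of the new module. The agent client, model configuration, and RunConfig are assumed to be constructed elsewhere, and the default_evaluators shape (evaluator name mapped to an options dict) is inferred from how the pipeline consumes it, not from package documentation:

    pipeline = PipelineConfig(
        agent_client=my_agent_client,    # any BaseAgentClient implementation (assumed)
        model_config=my_model_config,    # an AzureOpenAIModelConfiguration (assumed)
        has_azure_openai=True,
        default_evaluators={RELEVANCE: {}, EXACT_MATCH: {"case_sensitive": False}},
    )
    eval_items = [single_turn_item, multi_turn_item]   # shapes as in the detect_item_type sketch above
    results = run_pipeline(pipeline, eval_items, config=my_run_config)  # my_run_config: a RunConfig (assumed)
    # results[i] corresponds to eval_items[i]; multi-turn entries carry "turns" and "summary",
    # single-turn entries carry "results" keyed by evaluator name.
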
@@ -12,12 +12,11 @@ from common import (
     RELEVANCE,
     COHERENCE,
     GROUNDEDNESS,
-    TOOL_CALL_ACCURACY,
+    SIMILARITY,
     CITATIONS,
     EXACT_MATCH,
     PARTIAL_MATCH,
     REQUIRES_AZURE_OPENAI,
-    REQUIRES_TOOL_DEFINITIONS,
     SYSTEM_DEFAULT_EVALUATORS,
     RegistryEntry,
 )
@@ -30,7 +29,7 @@ EVALUATOR_REGISTRY: Dict[str, RegistryEntry] = {
     RELEVANCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
     COHERENCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
     GROUNDEDNESS: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
-    TOOL_CALL_ACCURACY: RegistryEntry(type="tool", requires=[REQUIRES_AZURE_OPENAI, REQUIRES_TOOL_DEFINITIONS], default_threshold=3),
+    SIMILARITY: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
     CITATIONS: RegistryEntry(type="non-llm", requires=[], default_threshold=1),
     EXACT_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=None),
     PARTIAL_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=0.5),
@@ -61,9 +60,13 @@ def validate_evaluator_names(evaluator_map: Dict[str, Any]) -> None:
 
     lines.append("")
     lines.append("Valid evaluators are:")
-    lines.append(f" - {', '.join(llm_evals)} (LLM-based)")
-    lines.append(f" - {', '.join(tool_evals)} (tool evaluation)")
-    lines.append(f" - {', '.join(non_llm_evals)} (non-LLM)")
+    for category, label in [
+        (llm_evals, "LLM-based"),
+        (tool_evals, "tool evaluation"),
+        (non_llm_evals, "non-LLM"),
+    ]:
+        if category:
+            lines.append(f" - {', '.join(category)} ({label})")
 
     raise ValueError("\n".join(lines))
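
With the reworked listing in validate_evaluator_names, empty categories are skipped. Once TOOL_CALL_ACCURACY is gone from EVALUATOR_REGISTRY, the error text would read roughly as below (the evaluator names shown are placeholders for the registry constants' string values):

    # Valid evaluators are:
    #  - relevance, coherence, groundedness, similarity (LLM-based)
    #  - citations, exact_match, partial_match (non-LLM)
    # The "(tool evaluation)" line is omitted because tool_evals is now empty.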