@microsoft/m365-copilot-eval 1.2.0-preview.1 → 1.3.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -4
- package/package.json +3 -2
- package/schema/CHANGELOG.md +8 -0
- package/schema/v1/eval-document.schema.json +117 -1
- package/schema/v1/examples/valid/comprehensive.json +27 -2
- package/schema/version.json +2 -2
- package/src/clients/cli/cli_logging/__init__.py +0 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +55 -0
- package/src/clients/cli/cli_logging/logging_utils.py +145 -0
- package/src/clients/cli/common.py +51 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
- package/src/clients/cli/evaluator_resolver.py +150 -0
- package/src/clients/cli/generate_report.py +130 -110
- package/src/clients/cli/main.py +545 -236
- package/src/clients/cli/readme.md +14 -7
- package/src/clients/cli/requirements.txt +1 -0
- package/src/clients/cli/response_extractor.py +32 -14
- package/src/clients/node-js/bin/runevals.js +58 -28
- package/src/clients/node-js/config/default.js +1 -1
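The headline change in this release is per-prompt evaluator selection: an eval document can now carry a top-level `default_evaluators` map, and each item can override it with its own `evaluators` and `evaluators_mode` fields, resolved by the new `evaluator_resolver` module shown in the main.py diff below. As a rough illustration only, the sketch below builds such a document in Python; the evaluator names, option keys, and `schemaVersion` value are assumptions inferred from this diff, not copied from the published schema in `package/schema/v1/eval-document.schema.json`.

```python
# Hypothetical sketch of an eval document using the new per-prompt evaluator
# fields; field names are inferred from the main.py diff below, not from the
# published schema, so treat every value here as illustrative.
import json

eval_document = {
    "schemaVersion": "1.1",                 # assumption: see schema/version.json for the real value
    "default_evaluators": {                 # applied to every item unless overridden
        "Relevance": {},
        "Coherence": {},
        "Groundedness": {},
    },
    "items": [
        {
            "prompt": "What is the capital of France?",
            "expected_response": "Paris",
            "evaluators_mode": "extend",    # "extend" is the default mode in run_evaluations
            "evaluators": {
                "ExactMatch": {"case_sensitive": False},
            },
        },
    ],
}

with open("prompts.json", "w", encoding="utf-8") as f:
    json.dump(eval_document, f, indent=2)

# The CLI would then consume it via load_prompts_from_file, e.g.:
#   python main.py --prompts-file prompts.json
```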
package/src/clients/cli/main.py
CHANGED
@@ -3,6 +3,8 @@ import os
 import argparse
 import sys
 import csv
+import functools
+import logging
 import webbrowser
 import urllib.request
 import urllib.error
@@ -20,14 +22,39 @@ from azure.ai.evaluation (
 from dotenv import load_dotenv
 from auth.auth_handler import AuthHandler
 from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
-
-
+from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
+from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
 from generate_report import generate_html_report, calculate_aggregate_statistics
 from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
 from schema_handler import DocumentUpgrader, SchemaVersionManager
+from common import (
+    RELEVANCE,
+    COHERENCE,
+    GROUNDEDNESS,
+    TOOL_CALL_ACCURACY,
+    CITATIONS,
+    EXACT_MATCH,
+    PARTIAL_MATCH,
+    REQUIRES_AZURE_OPENAI,
+    REQUIRES_TOOL_DEFINITIONS,
+    METRIC_IDS,
+    pascal_case_to_title,
+)
+from evaluator_resolver import (
+    EVALUATOR_REGISTRY,
+    validate_evaluator_names,
+    check_prerequisites,
+    resolve_default_evaluators,
+    resolve_evaluators_for_prompt,
+    get_evaluator_threshold,
+)
 from version_check import check_min_version, get_cli_version
 from datetime import datetime, timezone
 from pathlib import Path
+import tzlocal
+
+from cli_logging.console_diagnostics import render_diagnostic, serialize_diagnostic_record
+from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel, Operation, format_structured_log_entry, resolve_log_level
 
 # Allowed endpoints for URL validation
 ALLOWED_ENDPOINTS = [
@@ -46,20 +73,63 @@ VERSION_CHECK_BYPASS_FLAGS = (
     "signout",
 )
 
+CLI_LOGGER_NAME = "m365.eval.cli"
+CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
+DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
+
+
+def _ensure_logger_handler() -> None:
+    if CLI_LOGGER.handlers:
+        return
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(logging.Formatter("%(message)s"))
+    CLI_LOGGER.addHandler(handler)
+    CLI_LOGGER.propagate = False
+
+
+def configure_cli_logging(effective_log_level: str) -> None:
+    _ensure_logger_handler()
+    CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
+
+
+def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
+    _ensure_logger_handler()
+    context = {
+        "request-id": None,
+        "conversation-id": None,
+        "message-id": None,
+        "operation": operation,
+    }
+    entry = format_structured_log_entry(
+        level=level,
+        message=message,
+        logger_name=CLI_LOGGER_NAME,
+        run_context=context,
+    )
+    DIAGNOSTIC_RECORDS.append(entry)
+
+    try:
+        CLI_LOGGER.log(LOG_LEVEL_MAP.get(level, logging.INFO), render_diagnostic(entry))
+    except Exception:
+        pass
+
 
 def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
     """Return True if the current invocation should skip min-version checks."""
     return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
 
-def write_results_to_html(results: List[Dict], output_file: str
+def write_results_to_html(results: List[Dict], output_file: str,
+                          agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                          cli_version: Optional[str] = None):
     """Write results to HTML file using generate_html_report from generate_report.py."""
     try:
-        html = generate_html_report(results
+        html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
+                                    cli_version=cli_version)
         with open(output_file, 'w', encoding='utf-8') as f:
             f.write(html)
-
+        emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)
 
 def get_default_prompts_and_responses():
@@ -72,7 +142,7 @@ def get_default_prompts_and_responses():
     ]
     return prompts, expected_responses
 
-def load_prompts_from_file(file_path: str) -> Tuple[List[
+def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
     """Load prompts and expected responses from a JSON file.
 
     Supports three formats:
@@ -82,6 +152,10 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
 
     For eval documents (format 1) and array format (format 2), schema validation
     and auto-upgrade are applied via DocumentUpgrader.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
+        expected_response, and optional evaluators/evaluators_mode fields.
     """
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
@@ -98,18 +172,18 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
             upgrader = DocumentUpgrader()
         except Exception as e:
             # Schema infrastructure not available (missing files, etc.) — skip
-
+            emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
             upgrader = None
 
         if upgrader is not None:
             result = upgrader.upgrade(Path(file_path))
 
             if result.error:
-
+                emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
                 sys.exit(1)
 
             if result.upgraded and result.message:
-
+                emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
 
             # Use the parsed document from the upgrade result
             if result.document is not None:
@@ -117,26 +191,26 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
 
         if isinstance(data, list):
             # Format: [{"prompt": "...", "expected_response": "..."}, ...]
-
-            expected_responses = [item.get("expected_response", "") for item in data]
+            return data, None
         elif isinstance(data, dict):
             if "items" in data:
                 # Eval document format: {"schemaVersion": "...", "items": [...]}
-
-                prompts = [item.get("prompt", "") for item in items]
-                expected_responses = [item.get("expected_response", "") for item in items]
+                return data["items"], data.get("default_evaluators")
             else:
                 # Format: {"prompts": [...], "expected_responses": [...]}
                 prompts = data.get("prompts", [])
                 expected_responses = data.get("expected_responses", [])
+                eval_items = [
+                    {"prompt": p, "expected_response": e}
+                    for p, e in zip(prompts, expected_responses)
+                ]
+                return eval_items, None
         else:
             raise ValueError("Invalid file format")
-
-        return prompts, expected_responses
     except SystemExit:
         raise
     except Exception as e:
-
+        emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
         sys.exit(1)
 
 def get_interactive_prompts() -> Tuple[List[str], List[str]]:
@@ -163,116 +237,168 @@ def get_interactive_prompts() -> Tuple[List[str], List[str]]:
 
     return prompts, expected_responses
 
-def run_evaluations(args, responses:
-
+def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict],
+                    default_evaluators: Dict[str, Any]) -> list:
+    """Run evaluations against the responses using per-prompt evaluator resolution.
+
+    Args:
+        args: CLI arguments.
+        responses: List of enhanced response dicts (one per prompt, aligned with eval_items by index).
+        eval_items: List of item dicts (prompt, expected_response, evaluators, evaluators_mode).
+        default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
+    """
+    if len(responses) != len(eval_items):
+        raise ValueError(
+            f"Mismatch between number of responses ({len(responses)}) and evaluation items ({len(eval_items)})."
+        )
+
     model_config = AzureOpenAIModelConfiguration(
         azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
         api_key=os.environ.get("AZURE_AI_API_KEY"),
         api_version=os.environ.get("AZURE_AI_API_VERSION"),
         azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
     )
-
-    # Initialize evaluators
-    relevance_evaluator = RelevanceEvaluator(model_config=model_config) # Evaluate relevance for a given response. Range is 1 - 5.
-    coherence_evaluator = CoherenceEvaluator(model_config=model_config) # Measures the coherence (human-like quality) of the response. Range is 1 - 5.
-    groundedness_evaluator = GroundednessEvaluator(model_config=model_config) # Evaluates the response for for factuality and groundedness against provided context. Range is 1 - 5.
-    #concisenessnonllm_evaluator = ConcisenessNonLLMEvaluator() # Evaluates the response for conciseness. Range is 1 - 5.
-    #pii_evaluator = PIIEvaluator(model_config=model_config) # Evaluates the response for presence of PII. Range
-    # Parse citation format from args
-    citation_format = CitationFormat.OAI_UNICODE if args.citation_format == 'oai_unicode' else CitationFormat.LEGACY_BRACKET
-    citations_evaluator = CitationsEvaluator(citation_format=citation_format) # Evaluates citations present in the response using regex pattern matching
-
-    tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config) # Evaluate tool call accuracy if tool definitions are present in response
-
 
-
+    # Build available context for prerequisite checks
+    has_azure_openai = bool(
+        os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
+        and os.environ.get("AZURE_AI_API_KEY")
+    )
+
+    DEFAULT_PASS_THRESHOLD = 3
 
-    def decorate_metric(metric_id: str, data):
+    def decorate_metric(metric_id: str, data, threshold: Optional[int] = None):
         """Augment raw evaluator output with standardized threshold + pass/fail result."""
+        pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
         payload = {}
-        # Preserve original structure if dict
        if isinstance(data, dict):
             payload.update(data)
         else:
             payload['raw'] = data
 
-        # Try to extract a numeric score
         score_val = None
         if isinstance(data, dict):
-
-
-                score_val = data[k]
-                break
+            if metric_id in data:
+                score_val = data[metric_id]
         if isinstance(score_val, (int, float)):
-            payload['threshold'] =
-            payload['result'] = 'pass' if score_val >=
+            payload['threshold'] = pass_threshold
+            payload['result'] = 'pass' if score_val >= pass_threshold else 'fail'
         else:
-
-            payload['threshold'] = PASS_THRESHOLD
+            payload['threshold'] = pass_threshold
             payload.setdefault('result', 'unknown')
         return json.dumps(payload, indent=4)
 
+    # Validate all evaluator names upfront (across defaults and all items)
+    all_evaluator_maps = [default_evaluators]
+    for eval_item in eval_items:
+        if "evaluators" in eval_item:
+            all_evaluator_maps.append(eval_item["evaluators"])
+    for emap in all_evaluator_maps:
+        validate_evaluator_names(emap)
+
     evaluation_results = []
-    for
-        # Extract text response for evaluation (backward compatibility)
-        enhanced_response = responses[prompt]
+    for enhanced_response, eval_item in zip(responses, eval_items):
         actual_response_text = get_response_text_for_evaluation(enhanced_response)
-
-
-
-
-
-
-
-
-            response=actual_response_text
-        )
-
-        groundedness_score = groundedness_evaluator(
-            response=actual_response_text,
-            context=expected_response
+        prompt = eval_item.get("prompt", "")
+        expected_response = eval_item.get("expected_response", "")
+        prompt_evaluators = eval_item.get("evaluators")
+        evaluators_mode = eval_item.get("evaluators_mode", "extend")
+
+        # Resolve evaluators for this prompt
+        resolved = resolve_evaluators_for_prompt(
+            prompt_evaluators, evaluators_mode, prompt, default_evaluators,
         )
 
-        #
-
-
-        citations_score = citations_evaluator(
-            response=actual_response_text
+        # Build runtime context for prerequisite checks
+        has_tool_defs = bool(
+            args.m365_agent_id and enhanced_response.get("tool_definitions")
         )
+        available_context = {
+            REQUIRES_AZURE_OPENAI: has_azure_openai,
+            REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
+        }
 
-
-
-
-
-
-
-
+        results_dict: Dict[str, Optional[str]] = {}
+        evaluators_ran: List[str] = []
+
+        for eval_name, eval_options in resolved.items():
+            # Check prerequisites
+            can_run, warn_msg = check_prerequisites(eval_name, available_context)
+            if not can_run:
+                if warn_msg:
+                    emit_structured_log("warning", f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}", operation=Operation.EVALUATE)
+                results_dict[eval_name] = None
+                continue
+
+            evaluators_ran.append(eval_name)
+            threshold = get_evaluator_threshold(eval_name, eval_options)
+
+            if eval_name == RELEVANCE:
+                raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
+                results_dict[RELEVANCE] = decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
+            elif eval_name == COHERENCE:
+                raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
+                results_dict[COHERENCE] = decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
+            elif eval_name == GROUNDEDNESS:
+                raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response_text, context=expected_response)
+                results_dict[GROUNDEDNESS] = decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
+            elif eval_name == TOOL_CALL_ACCURACY:
+                raw_score = ToolCallAccuracyEvaluator(model_config)(
+                    query=prompt,
+                    response=enhanced_response.get("response", actual_response_text),
+                    tool_definitions=enhanced_response["tool_definitions"],
+                )
+                results_dict[TOOL_CALL_ACCURACY] = decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
+            elif eval_name == CITATIONS:
+                fmt_str = eval_options.get("citation_format", "oai_unicode")
+                fmt_map = {
+                    "oai_unicode": CitationFormat.OAI_UNICODE,
+                    "bracket": CitationFormat.LEGACY_BRACKET,
+                    "mixed": CitationFormat.AUTO,
+                }
+                raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response_text)
+                results_dict[CITATIONS] = decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
+            elif eval_name == EXACT_MATCH:
+                # ExactMatch is binary (match/no-match) — it includes its own result
+                # field, so we skip decorate_metric which assumes a numeric score.
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
+                results_dict[EXACT_MATCH] = json.dumps(raw_score, indent=4)
+            elif eval_name == PARTIAL_MATCH:
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
+                results_dict[PARTIAL_MATCH] = decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
 
         evaluation_result = {
             "prompt": prompt,
-            "response": actual_response_text,
+            "response": actual_response_text,
             "expected_response": expected_response,
-            "
-
-            "coherence_score": decorate_metric("coherence", coherence_score),
-            "groundedness_score": decorate_metric("groundedness", groundedness_score),
-            #"concisenessnonllm_score": decorate_metric("concisenessnonllm", concisenessNonLLM_score),
-            #"pii_score": decorate_metric("pii", PII_score),
-            "citations_score": json.dumps(citations_score, indent=4),
-            "tool_call_accuracy_score": json.dumps(tool_call_accuracy, indent=4) if tool_call_accuracy else None
-        }
+            "evaluators_ran": evaluators_ran,
+            "results": results_dict,
         }
 
-
-
-
-
+        # Preserve evaluator config metadata for output
+        if "evaluators" in eval_item:
+            evaluation_result["evaluators"] = eval_item["evaluators"]
+        if "evaluators_mode" in eval_item:
+            evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
+
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log(
+                "debug",
+                f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
+                f"Evaluators: {', '.join(evaluators_ran)}. "
+                f"Scores: {evaluation_result['results']}",
+                operation=Operation.EVALUATE,
+            )
 
         evaluation_results.append(evaluation_result)
-
+
     return evaluation_results
 
-def write_results_to_console(results
+def write_results_to_console(results, agent_name: Optional[str] = None,
+                             agent_id: Optional[str] = None,
+                             cli_version: Optional[str] = None):
     """Write the response to console."""
     # ANSI color codes
     BOLD = '\033[1m'
@@ -284,47 +410,66 @@ def write_results_to_console(results):
     ORANGE = '\033[38;5;208m'
     RED = '\033[91m'
     RESET = '\033[0m'
-
+
+    # Show metadata
+    metadata_parts = []
+    if agent_name:
+        metadata_parts.append(f"Agent Name: {agent_name}")
+    if agent_id:
+        metadata_parts.append(f"Agent ID: {agent_id}")
+    if cli_version:
+        metadata_parts.append(f"CLI Version: {cli_version}")
+    if metadata_parts:
+        print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
+        print()
+
     # Show aggregate statistics if multiple results
     if len(results) > 1:
         aggregates = calculate_aggregate_statistics(results)
         if aggregates:
-            print(f"{BOLD}{BLUE}
+            print(f"{BOLD}{BLUE}Aggregate Statistics ({len(results)} prompts):{RESET}")
             print(f"{BLUE}{'=' * 60}{RESET}")
-
+
             for metric_name, stats in aggregates.items():
                 pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
-
+                prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                total_prompts = stats.get('total_prompts', len(results))
+                print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
                 print(f"  Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
                 print(f"  Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
                 if stats.get('threshold') is not None:
                     print(f"  Threshold: {YELLOW}{stats['threshold']}{RESET}")
                 print()
-
+
             print(f"{BLUE}{'=' * 60}{RESET}")
             print()
-
-    print(f"{BOLD}{BLUE}
+
+    print(f"{BOLD}{BLUE}Individual Results:{RESET}")
     print(f"{BLUE}{'=' * 50}{RESET}")
     for i, result in enumerate(results, 1):
         print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
+
+        # Show which evaluators ran for this prompt
+        evaluators_ran = result.get('evaluators_ran', [])
+        if evaluators_ran:
+            print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
+
         print(f"{BOLD}{CYAN}Response:{RESET} {result['response']}")
         print(f"{BOLD}{YELLOW}Expected Response:{RESET} {result['expected_response']}")
 
-        # Print metric scores
-        metrics = result.get('results'
-
-
-
-
-
-
-
-
-
-
-            print(f"{BOLD}{color}{name}:{RESET} {v}")
+        # Print metric scores from results
+        metrics = result.get('results', {})
+        for eval_name, v in metrics.items():
+            if v is None:
+                continue  # Skip null/N/A scores from skipped evaluators
+            display_name = pascal_case_to_title(eval_name)
+            if eval_name == RELEVANCE:
+                color = MAGENTA
+            elif eval_name == COHERENCE:
+                color = ORANGE
+            else:
+                color = BLUE
+            print(f"{BOLD}{color}{display_name}:{RESET} {v}")
         print(f"{BLUE}{'-' * 30}{RESET}")
 
 def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
@@ -336,10 +481,8 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
     DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this
 
     score_val = None
-
-
-            score_val = data[k]
-            break
+    if metric_id in data and isinstance(data[metric_id], (int, float)):
+        score_val = data[metric_id]
     if score_val is None:
         return None
 
@@ -362,9 +505,11 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
     """Convert an internal evaluation result dict to a schema-compliant EvalItem.
 
     Internal format (from run_evaluations):
-        {prompt, response, expected_response, results: {
+        {prompt, response, expected_response, results: {Relevance: "JSON", ...},
+         evaluators_ran: [...], evaluators: {...}, evaluators_mode: "..."}
     Schema EvalItem format:
-        {prompt, response, expected_response, scores: {relevance: EvalScore, ...}
+        {prompt, response, expected_response, scores: {relevance: EvalScore, ...},
+         evaluators: {...}, evaluators_mode: "..."}
     """
     item: Dict[str, Any] = {
         "prompt": result["prompt"],
@@ -372,30 +517,35 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
         "expected_response": result["expected_response"],
     }
 
+    # Preserve evaluator config in output
+    if "evaluators" in result:
+        item["evaluators"] = result["evaluators"]
+    if "evaluators_mode" in result:
+        item["evaluators_mode"] = result["evaluators_mode"]
+
     scores: Dict[str, Any] = {}
     results_dict = result.get("results", {})
 
     # EvalScore metrics (all share the same schema shape: {score, result, threshold})
-
-
-        (
-        (
-        (
-        ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+    for eval_key, schema_key in [
+        (RELEVANCE, "relevance"),
+        (COHERENCE, "coherence"),
+        (GROUNDEDNESS, "groundedness"),
+        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
     ]:
-        raw = results_dict.get(
+        raw = results_dict.get(eval_key)
         if not raw:
             continue
         data = json.loads(raw) if isinstance(raw, str) else raw
-        eval_score = extract_eval_score(data,
+        eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
         if eval_score:
             scores[schema_key] = eval_score
 
-    # Citations → CitationScore
-    raw_citations = results_dict.get(
+    # Citations → CitationScore
+    raw_citations = results_dict.get(CITATIONS)
     if raw_citations:
         data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
-        count = data.get("
+        count = data.get("citations", 0)
         cit_result = data.get("result")
         if cit_result not in ("pass", "fail"):
             cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
@@ -409,17 +559,42 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
             citation_score["format"] = data["citation_format"]
         scores["citations"] = citation_score
 
+    # ExactMatch → ExactMatchScore
+    raw_exact = results_dict.get(EXACT_MATCH)
+    if raw_exact:
+        data = json.loads(raw_exact) if isinstance(raw_exact, str) else raw_exact
+        is_match = data.get("exact_match", 0.0) == 1.0
+        scores["exactMatch"] = {
+            "match": is_match,
+            "result": data.get("result", "pass" if is_match else "fail"),
+            "reason": data.get("exact_match_reason", ""),
+        }
+
+    # PartialMatch → PartialMatchScore
+    raw_partial = results_dict.get(PARTIAL_MATCH)
+    if raw_partial:
+        data = json.loads(raw_partial) if isinstance(raw_partial, str) else raw_partial
+        scores["partialMatch"] = {
+            "score": data.get("partial_match", 0.0),
+            "result": data.get("result", "fail"),
+            "threshold": data.get("threshold", 0.5),
+            "reason": data.get("partial_match_reason", ""),
+        }
+
     if scores:
         item["scores"] = scores
 
     return item
 
 
-def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None
+def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
+                          default_evaluators: Optional[Dict[str, Any]] = None,
+                          agent_name: Optional[str] = None,
+                          cli_version: Optional[str] = None):
     """Write results to a schema-compliant eval document JSON file.
 
     Output follows the eval-document.schema.json format:
-        {schemaVersion, metadata, items: [EvalItem]}
+        {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
     """
     try:
         try:
@@ -434,43 +609,68 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
         }
         if agent_id:
             metadata["agentId"] = agent_id
+        if agent_name:
+            metadata["agentName"] = agent_name
+        if cli_version:
+            metadata["cliVersion"] = cli_version
 
         output_data: Dict[str, Any] = {
             "schemaVersion": current_version,
             "metadata": metadata,
-            "items": items,
         }
 
+        if default_evaluators is not None:
+            output_data["default_evaluators"] = default_evaluators
+
+        output_data["items"] = items
+
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(output_data, f, indent=2, ensure_ascii=False)
-
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)
 
-def write_results_to_csv(results: List[Dict], output_file: str
+def write_results_to_csv(results: List[Dict], output_file: str,
+                         agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                         cli_version: Optional[str] = None):
     """Write results to CSV file."""
     try:
         with open(output_file, 'w', newline='', encoding='utf-8') as f:
             if results:
+                # Write metadata header
+                metadata_parts = []
+                if agent_name:
+                    metadata_parts.append(f"Agent Name: {agent_name}")
+                if agent_id:
+                    metadata_parts.append(f"Agent ID: {agent_id}")
+                if cli_version:
+                    metadata_parts.append(f"CLI Version: {cli_version}")
+                if metadata_parts:
+                    f.write(f"# {' | '.join(metadata_parts)}\n")
+
                 # Write aggregate statistics first if multiple results
                 if len(results) > 1:
                     aggregates = calculate_aggregate_statistics(results)
                     if aggregates:
                         f.write("# AGGREGATE STATISTICS\n")
-                        f.write("Metric,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
+                        f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
                         for metric_name, stats in aggregates.items():
                             threshold_str = str(stats.get('threshold', 'N/A'))
-
+                            prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                            total_prompts = stats.get('total_prompts', len(results))
+                            f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
                         f.write("\n# INDIVIDUAL RESULTS\n")
-
-                # Write individual results
-
+
+                # Write individual results (exclude internal fields)
+                exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode'}
+                fieldnames = [k for k in results[0].keys() if k not in exclude_keys]
+                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                 writer.writeheader()
                 writer.writerows(results)
-
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)
 
 def parse_arguments():
@@ -501,8 +701,8 @@ Examples:
   # Save results to HTML and open in browser
   python main.py --output report.html
 
-  #
-  python main.py --
+  # Debug-level diagnostics
+  python main.py --log-level debug
 
   # Sign out and clear cached authentication tokens
   python main.py --signout
@@ -551,21 +751,13 @@ Examples:
 
     # Behavior options
     parser.add_argument(
-        '--
-
-
-
-
-        '--quiet',
-        action='store_true',
-        help='Suppress non-essential output'
-    )
-    parser.add_argument(
-        '--citation-format',
-        choices=['oai_unicode', 'legacy_bracket'],
-        default='oai_unicode',
-        help='Citation format to detect. "oai_unicode" for new OAI format (default), "legacy_bracket" for old [^i^] format'
+        '--log-level',
+        nargs='?',
+        const='info',
+        action='append',
+        help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
     )
+
     parser.add_argument(
         '--signout',
         action='store_true',
@@ -598,8 +790,13 @@ def validate_environment() -> CallPath:
 
     missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
     if missing_vars:
-
-
+        emit_structured_log(
+            "error",
+            "Missing required environment variables: "
+            f"{', '.join(missing_vars)}. Please ensure your .env file contains "
+            "all required Azure configuration.",
+            operation=Operation.VALIDATE_ENV,
+        )
         sys.exit(1)
     return call_path
 
@@ -633,23 +830,42 @@ def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
         # Convert other parsing errors to ValueError
         raise ValueError(f"Invalid URL format: {url}") from e
 
-def get_prompt_datasets(args) -> Tuple[List[
-    """Get prompts and expected responses based on command line arguments.
+def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
+    """Get prompts and expected responses based on command line arguments.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators).
+    """
     if args.prompts:
         if args.expected and len(args.prompts) != len(args.expected):
-
+            emit_structured_log(
+                "error",
+                "Number of prompts must match number of expected responses. "
+                "Update --expected values to match the prompt count.",
+            )
             sys.exit(1)
-
-
+        expected_responses = args.expected or [""] * len(args.prompts)
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(args.prompts, expected_responses)
+        ]
+        return eval_items, None
     elif args.prompts_file:
-
+        return load_prompts_from_file(args.prompts_file)
     elif args.interactive:
         prompts, expected_responses = get_interactive_prompts()
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None
     else:
-        # Use default prompts
         prompts, expected_responses = get_default_prompts_and_responses()
-
-
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None
 
 def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
     """
@@ -685,26 +901,27 @@ def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oi
         return agents
     except urllib.error.HTTPError as e:
         # If endpoint doesn't exist or returns error, return empty list
-
+        emit_structured_log("warning", f"Unable to fetch agents list (HTTP {e.code}).", operation=Operation.FETCH_AGENTS)
         return []
     except Exception as e:
-
+        emit_structured_log("warning", f"Error fetching agents: {e}", operation=Operation.FETCH_AGENTS)
         return []
 
-def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
+def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
     """
     Display an interactive agent selector using questionary.
-
+
     Args:
         agents: List of agent dictionaries.
-
+
     Returns:
-
+        Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
     """
     if not agents:
-        return None
-
-    #
+        return None, None
+
+    # Build id→name lookup and choices
+    id_to_name: Dict[str, str] = {}
     choices = []
     sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
     for agent in sorted_agents:
@@ -712,12 +929,13 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
         agent_name = agent.get("name", "Unknown")
         agent_description = agent.get("description", "Unknown")
         agent_is_owner = agent.get('isOwner')
-
+        id_to_name[agent_id] = agent_name
+
         # Format the display text
         display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
-
+
         choices.append(questionary.Choice(title=display_text, value=agent_id))
-
+
     # Display the selection prompt
     selected_agent = questionary.select(
         "Select an agent to evaluate:",
@@ -725,8 +943,37 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
         use_shortcuts=True,
         use_arrow_keys=True
     ).ask()
-
-    return selected_agent
+
+    return selected_agent, id_to_name.get(selected_agent) if selected_agent else None
+
+@functools.lru_cache(maxsize=1)
+def _get_iana_timezone_name() -> str:
+    """Get the IANA timezone name from the system using tzlocal.
+
+    Tries get_localzone_name() first; falls back to str(get_localzone()) when the
+    former raises (e.g. no zone configured on some Unix systems). Result is cached
+    after the first call so tzlocal is only invoked once per session.
+    """
+    try:
+        return tzlocal.get_localzone_name()
+    except Exception:
+        return str(tzlocal.get_localzone())
+
+
+@functools.lru_cache(maxsize=1)
+def _get_location_info() -> Dict[str, Any]:
+    """Return a locationInfo dict containing the local UTC offset and IANA timezone name.
+
+    Result is cached after the first call so the computation runs only once per session.
+    """
+    now = datetime.now().astimezone()
+    utc_offset = now.utcoffset()
+    offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
+    return {
+        "timeZoneOffset": offset_hours,
+        "timeZone": _get_iana_timezone_name(),
+    }
+
 
 def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
     message = {
@@ -735,6 +982,7 @@ def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
         "author": "user",
         "messageType": "chat",
         "timestamp": datetime.now(timezone.utc).isoformat(),
+        "locationInfo": _get_location_info(),
         "from": {
             "id": user_oid,
         }
@@ -755,7 +1003,7 @@ def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
 
     return json.dumps(message).encode("utf-8")
 
-def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) ->
+def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> List[Dict[str, Any]]:
     """ Send prompts to the chat API and return enhanced responses. """
 
     request_headers = {
@@ -763,15 +1011,15 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
         "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
         "Authorization": f"Bearer {access_token}"
     }
-    raw_responses:
+    raw_responses: List[Tuple[str, str]] = []
     for i, prompt in enumerate(prompts, 1):
-        if
-
+        if getattr(args, "effective_log_level", "info") in ("info", "debug"):
+            emit_structured_log("info", f"Processing prompt {i}/{len(prompts)}.", operation=Operation.SEND_PROMPT)
 
         # Build the payload
         payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
-        if args
-
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log("debug", f"[Sydney] Sending payload: {payload.decode('utf-8')}", operation=Operation.SEND_PROMPT)
 
         # Send the request to /chat
         req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
@@ -790,42 +1038,83 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
             raise RuntimeError(msg) from e
         except urllib.error.URLError as e:
             raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
-
-        if args
-
-
+
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log("debug", f"[Sydney] Raw response: {raw}", operation=Operation.SEND_PROMPT)
+
         # Store raw response for enhancement
-        raw_responses
-
+        raw_responses.append((prompt, raw.strip()))
+
     # Extract enhanced responses using the new extractor
-    enhanced_responses = extract_enhanced_responses(raw_responses)
+    enhanced_responses = extract_enhanced_responses(raw_responses, log_level=getattr(args, "effective_log_level", "info"))
+
+    if getattr(args, "effective_log_level", "info") == "debug":
+        for idx, enhanced in enumerate(enhanced_responses, 1):
+            metadata = enhanced.get("metadata", {})
+            context = {
+                "request-id": metadata.get("request_id"),
+                "conversation-id": metadata.get("conversation_id"),
+                "message-id": metadata.get("message_id"),
+                "operation": Operation.SEND_PROMPT,
+            }
+            entry = format_structured_log_entry(
+                level="debug",
+                message=f"Response IDs for prompt {idx}/{len(enhanced_responses)}.",
+                logger_name=CLI_LOGGER_NAME,
+                run_context=context,
+            )
+            DIAGNOSTIC_RECORDS.append(entry)
+            CLI_LOGGER.log(logging.DEBUG, render_diagnostic(entry))
+
     return enhanced_responses
 
-def output_results(results: List[Dict], args
+def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
+                   agent_name: Optional[str] = None, cli_version: Optional[str] = None):
     """Output results based on specified format."""
+    metadata_kwargs = dict(
+        agent_name=agent_name,
+        agent_id=getattr(args, 'm365_agent_id', None),
+        cli_version=cli_version,
+    )
     if args.output:
         output_lower = args.output.lower()
         if output_lower.endswith('.json'):
-            write_results_to_json(results, args.output,
+            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
         elif output_lower.endswith('.csv'):
-            write_results_to_csv(results, args.output)
+            write_results_to_csv(results, args.output, **metadata_kwargs)
         elif output_lower.endswith('.html'):
-            write_results_to_html(results, args.output)
+            write_results_to_html(results, args.output, **metadata_kwargs)
             abs_path = os.path.abspath(args.output)
             webbrowser.open(f'file://{abs_path}')
         else:
-            write_results_to_json(results, args.output,
+            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
     else:
-        write_results_to_console(results)
+        write_results_to_console(results, **metadata_kwargs)
 
 def main():
     """Main function to orchestrate the evaluation process."""
     load_dotenv()
     args = parse_arguments()
 
+    effective_log_level, error_message = resolve_log_level(args.log_level)
+    if error_message:
+        print(error_message)
+        print(
+            "Next step: rerun with --log-level {debug|info|warning|error}. "
+            "For support, share the console diagnostics output from this run."
+        )
+        sys.exit(2)
+
+    args.effective_log_level = effective_log_level
+    configure_cli_logging(effective_log_level)
+    emit_structured_log("info", f"Log level set to '{effective_log_level}'.", operation=Operation.SETUP)
+
     # Check minimum version before proceeding
-
-
+    quiet_for_version = effective_log_level in ("warning", "error")
+    cli_version = get_cli_version(quiet=quiet_for_version)
+    if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=quiet_for_version):
         sys.exit(1)
 
     # Validate environment variables required for evaluation
@@ -853,7 +1142,7 @@ def main():
         try:
             auth_handler.clear_cache()
         except Exception as e:
-
+            emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
             sys.exit(1)
         sys.exit(0)
 
@@ -866,67 +1155,87 @@ def main():
 
         id_token_claims = auth_result.get("id_token_claims")
         if not isinstance(id_token_claims, dict):
-
+            emit_structured_log(
+                "warning", "id_token_claims is missing or invalid in authentication result",
+                operation=Operation.AUTHENTICATE,
+            )
         else:
             user_oid = id_token_claims.get("oid") or ""
 
     except Exception as e:
-
-
-
-
-
+        emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
+        if effective_log_level == "debug":
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)
 
     if not user_oid and access_token:
         # Fallback: extract from access token.
         user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
 
-    # 1. Load evaluation datasets
-
+    # 1. Load evaluation datasets
+    eval_items, file_default_evaluators = get_prompt_datasets(args)
+    default_evaluators = resolve_default_evaluators(file_default_evaluators)
+    prompts = [eval_item.get("prompt", "") for eval_item in eval_items]
 
-    if
-
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log("info", f"Running evaluation on {len(prompts)} prompt(s).", operation=Operation.SETUP)
 
+    agent_name = None
     try:
         # 3. Agent selection - if no agent ID provided, prompt user to select
         if not args.m365_agent_id:
-            if
-
+            if effective_log_level in ("info", "debug"):
+                emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)
 
             available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
             if not available_agents:
-
-
-
-
-
-
-
-
-
-
-
-
+                emit_structured_log(
+                    "error",
+                    "No agents are available for interactive selection. Re-run with "
+                    "--m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
+                )
+                sys.exit(1)
+
+            selected_agent_id, agent_name = select_agent_interactively(available_agents)
+            if selected_agent_id:
+                args.m365_agent_id = selected_agent_id
+                if effective_log_level in ("info", "debug"):
+                    emit_structured_log("info", f"Selected agent: {args.m365_agent_id}", operation=Operation.FETCH_AGENTS)
+            else:
+                emit_structured_log(
+                    "error",
+                    "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
+                )
+                sys.exit(1)
 
         # 4. Send prompts to chat API
         responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
     except Exception as e:
-
-        if
+        emit_structured_log("error", f"Error sending prompts to chat API: {e}", operation=Operation.SEND_PROMPT)
+        if effective_log_level == "debug":
            import traceback
            traceback.print_exc()
        sys.exit(1)
+
 
     # 5. Run evaluations
-    if
-
-    results = run_evaluations(args, responses,
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log("info", "Running evaluations.", operation=Operation.EVALUATE)
+    results = run_evaluations(args, responses, eval_items, default_evaluators)
 
     # 6. Output results
-    output_results(results, args
+    output_results(results, args, default_evaluators=default_evaluators,
+                   agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
 
-    if
-
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log(
+            "info",
+            f"Evaluation completed successfully. Processed {len(prompts)} prompt(s).",
+            operation=Operation.EVALUATE,
+        )
 
 # Call the main function when script is run directly
 if __name__ == "__main__":