@microsoft/m365-copilot-eval 1.2.0-preview.1 → 1.3.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
+ """Evaluator resolution module for per-prompt evaluator configuration.
+
+ Resolves which evaluators to run on each prompt by merging prompt-level config
+ with file-level defaults and system defaults, following extend/replace modes.
+ """
+
+ import difflib
+ import logging
+ from typing import Any, Dict, Optional, Tuple
+
+ from common import (
+     RELEVANCE,
+     COHERENCE,
+     GROUNDEDNESS,
+     TOOL_CALL_ACCURACY,
+     CITATIONS,
+     EXACT_MATCH,
+     PARTIAL_MATCH,
+     REQUIRES_AZURE_OPENAI,
+     REQUIRES_TOOL_DEFINITIONS,
+     SYSTEM_DEFAULT_EVALUATORS,
+     RegistryEntry,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ # Static registry of available evaluators per data-model.md
+ EVALUATOR_REGISTRY: Dict[str, RegistryEntry] = {
+     RELEVANCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
+     COHERENCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
+     GROUNDEDNESS: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
+     TOOL_CALL_ACCURACY: RegistryEntry(type="tool", requires=[REQUIRES_AZURE_OPENAI, REQUIRES_TOOL_DEFINITIONS], default_threshold=3),
+     CITATIONS: RegistryEntry(type="non-llm", requires=[], default_threshold=1),
+     EXACT_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=None),
+     PARTIAL_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=0.5),
+ }
+
+
+ def validate_evaluator_names(evaluator_map: Dict[str, Any]) -> None:
+     """Validate that all evaluator names in the map exist in the registry.
+
+     Raises ValueError with categorized valid names and
+     'Did you mean?' suggestions for close matches.
+     """
+     invalid_names = [name for name in evaluator_map if name not in EVALUATOR_REGISTRY]
+     if not invalid_names:
+         return
+
+     # Categorize valid evaluators for the error message
+     llm_evals = [n for n, r in EVALUATOR_REGISTRY.items() if r.type == "llm"]
+     tool_evals = [n for n, r in EVALUATOR_REGISTRY.items() if r.type == "tool"]
+     non_llm_evals = [n for n, r in EVALUATOR_REGISTRY.items() if r.type == "non-llm"]
+
+     lines = []
+     for name in invalid_names:
+         lines.append(f'Unknown evaluator "{name}".')
+         close = difflib.get_close_matches(name, EVALUATOR_REGISTRY.keys(), n=1, cutoff=0.5)
+         if close:
+             lines.append(f'Did you mean "{close[0]}"?')
+
+     lines.append("")
+     lines.append("Valid evaluators are:")
+     lines.append(f" - {', '.join(llm_evals)} (LLM-based)")
+     lines.append(f" - {', '.join(tool_evals)} (tool evaluation)")
+     lines.append(f" - {', '.join(non_llm_evals)} (non-LLM)")
+
+     raise ValueError("\n".join(lines))
+
+
+ def check_prerequisites(
+     evaluator_name: str,
+     available_context: Dict[str, bool],
+ ) -> Tuple[bool, Optional[str]]:
+     """Check if prerequisites for an evaluator are available.
+
+     Returns (True, None) if all prerequisites are met, or
+     (False, warning_message) if a prerequisite is missing.
+     """
+     registry_entry = EVALUATOR_REGISTRY.get(evaluator_name)
+     if not registry_entry:
+         return False, f"Unknown evaluator: {evaluator_name}"
+
+     for req in registry_entry.requires:
+         if not available_context.get(req, False):
+             msg = (
+                 f"Skipping evaluator '{evaluator_name}': "
+                 f"missing prerequisite '{req}'"
+             )
+             return False, msg
+
+     return True, None
+
+
+ def resolve_default_evaluators(file_defaults: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+     """Resolve effective default evaluators, falling back to system defaults.
+
+     Precedence: file-level defaults > system defaults.
+     An explicit empty dict means "no default evaluators".
+     """
+     # File-level defaults (including explicit empty dict)
+     if file_defaults is not None:
+         return file_defaults
+
+     # System defaults
+     return {name: {} for name in SYSTEM_DEFAULT_EVALUATORS}
+
+
+ def resolve_evaluators_for_prompt(
+     prompt_evaluators: Optional[Dict[str, Any]],
+     evaluators_mode: str,
+     prompt: str,
+     default_evaluators: Dict[str, Any],
+ ) -> Dict[str, Any]:
+     """Resolve which evaluators to run for a single prompt.
+
+     Args:
+         prompt_evaluators: Per-prompt evaluator config (None if not specified).
+         evaluators_mode: How to combine with defaults ("extend" or "replace").
+         prompt: The prompt text (used in warning messages).
+         default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
+
+     Returns:
+         Resolved EvaluatorMap (dict of evaluator_name -> options).
+     """
+     # No prompt-level config → use defaults
+     if prompt_evaluators is None:
+         return dict(default_evaluators)
+
+     if evaluators_mode == "replace":
+         if not prompt_evaluators:
+             logger.warning(
+                 "Empty evaluators with 'replace' mode for prompt: '%s'. "
+                 "No evaluators will run.",
+                 prompt[:80],
+             )
+         return dict(prompt_evaluators)
+
+     # mode == "extend": merge defaults with prompt overrides (prompt wins on conflict)
+     merged = dict(default_evaluators)
+     merged.update(prompt_evaluators)
+     return merged
+
+
+ def get_evaluator_threshold(evaluator_name: str, options: Dict[str, Any]) -> Optional[float]:
+     """Get the threshold for an evaluator, with option override support."""
+     if "threshold" in options:
+         return options["threshold"]
+     entry = EVALUATOR_REGISTRY.get(evaluator_name)
+     return entry.default_threshold if entry else None
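
For orientation, here is a minimal usage sketch of the resolution helpers added above. It assumes the new file is importable as `evaluator_resolution` (file names are not shown in this diff) and that `GROUNDEDNESS` and `RELEVANCE` from `common` are the evaluator-name strings used as registry keys; the threshold override is illustrative only.

from common import GROUNDEDNESS, RELEVANCE
from evaluator_resolution import (  # module name assumed; not shown in the diff
    resolve_default_evaluators,
    resolve_evaluators_for_prompt,
    validate_evaluator_names,
)

# File-level defaults take precedence over system defaults; an explicit {} would disable defaults.
file_defaults = {GROUNDEDNESS: {"threshold": 4}}
defaults = resolve_default_evaluators(file_defaults)
validate_evaluator_names(defaults)  # raises ValueError with suggestions on unknown names

# Per-prompt config in "extend" mode merges onto the defaults (prompt wins on conflict).
resolved = resolve_evaluators_for_prompt(
    prompt_evaluators={RELEVANCE: {}},
    evaluators_mode="extend",
    prompt="What is our refund policy?",
    default_evaluators=defaults,
)
# resolved == {GROUNDEDNESS: {"threshold": 4}, RELEVANCE: {}}
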
@@ -1,76 +1,75 @@
+ import html as html_module
  import json
  import markdown
+ from common import METRIC_IDS, pascal_case_to_title
  import pandas as pd
  from pathlib import Path
 
  def calculate_aggregate_statistics(results):
-     """Calculate aggregate statistics across all evaluation results."""
+     """Calculate aggregate statistics across all evaluation results.
+
+     Scans ALL results (not just the first) to discover which metrics were used,
+     correctly handling per-prompt evaluator variation. Each metric reports
+     prompts_evaluated (how many prompts it actually ran on) and total_prompts.
+     """
      if not results:
          return {}
-
-     # Extract all metrics from the first result to know what metrics we have
-     first_result = results[0]
-     metrics = first_result.get('results', {})
-
+
+     # Discover all metric keys across all results
+     all_metric_keys = set()
+     for result in results:
+         all_metric_keys.update(result.get('results', {}).keys())
+
      aggregates = {}
-
-     for metric_key in metrics.keys():
-         if not metric_key.endswith('_score'):
-             continue
-
-         metric_name = metric_key[:-6] # Remove '_score' suffix
-         metric_display_name = metric_name.replace('_', ' ').title()
-
+
+     for eval_name in sorted(all_metric_keys):
+         display_name = pascal_case_to_title(eval_name)
+         metric_id = METRIC_IDS.get(eval_name, eval_name.lower())
+
          scores = []
          pass_count = 0
          fail_count = 0
          threshold_value = None
-
+         prompts_evaluated = 0
+
          for result in results:
-             metric_data = result.get('results', {}).get(metric_key)
-             if metric_data:
-                 try:
-                     # Parse the JSON string to get the actual data
-                     parsed_data = json.loads(metric_data) if isinstance(metric_data, str) else metric_data
-
-                     # Extract score, result, and threshold
-                     score = parsed_data.get('score')
-                     if score is None:
-                         score = parsed_data.get(metric_name)
-                     if score is None:
-                         score = parsed_data.get(f'{metric_name}_score')
-
-                     result_status = parsed_data.get('result')
-                     if result_status is None:
-                         result_status = parsed_data.get(f'{metric_name}_result')
-
-                     threshold = parsed_data.get('threshold')
-                     if threshold is None:
-                         threshold = parsed_data.get(f'{metric_name}_threshold')
-
-                     if score is not None:
-                         scores.append(float(score))
-
-                     if result_status:
-                         if str(result_status).lower() == 'pass':
-                             pass_count += 1
-                         elif str(result_status).lower() == 'fail':
-                             fail_count += 1
-
-                     # Capture threshold (should be consistent across all results)
-                     if threshold is not None and threshold_value is None:
-                         threshold_value = threshold
-
-                 except (json.JSONDecodeError, ValueError, TypeError):
-                     continue
-
-         if scores:
-             avg_score = sum(scores) / len(scores)
+             metric_data = result.get('results', {}).get(eval_name)
+             if metric_data is None:
+                 continue # This metric did not run for this prompt
+
+             prompts_evaluated += 1
+             try:
+                 parsed_data = json.loads(metric_data) if isinstance(metric_data, str) else metric_data
+
+                 score = parsed_data.get(metric_id)
+
+                 result_status = parsed_data.get('result')
+
+                 threshold = parsed_data.get('threshold')
+
+                 if score is not None:
+                     scores.append(float(score))
+
+                 if result_status:
+                     if str(result_status).lower() == 'pass':
+                         pass_count += 1
+                     elif str(result_status).lower() == 'fail':
+                         fail_count += 1
+
+                 if threshold is not None and threshold_value is None:
+                     threshold_value = threshold
+
+             except (json.JSONDecodeError, ValueError, TypeError):
+                 continue
+
+         if scores or pass_count > 0 or fail_count > 0:
+             avg_score = sum(scores) / len(scores) if scores else 0
              total_evaluated = pass_count + fail_count
             pass_rate = (pass_count / total_evaluated * 100) if total_evaluated > 0 else 0
-
-             aggregates[metric_display_name] = {
+
+             aggregates[display_name] = {
                  'total_prompts': len(results),
+                 'prompts_evaluated': prompts_evaluated,
                  'total_evaluated': total_evaluated,
                  'pass_count': pass_count,
                  'fail_count': fail_count,
@@ -79,7 +78,7 @@ def calculate_aggregate_statistics(results):
                  'threshold': threshold_value,
                  'scores': scores
              }
-
+
      return aggregates
 
  def parse_score(score_str):
@@ -100,9 +99,9 @@ def format_score(score):
 
  def extract_metric_rows(entry):
      """
-     Build generic metric rows from any `*_score` keys on an entry.
+     Build generic metric rows from evaluation results.
      Each row has: Metric, Result, Score, Threshold, Reason.
-     Supports metrics under entry['results'] and falls back to top-level for backward compatibility.
+     Omits metrics that did not run (None values) for this prompt.
      """
      rows = []
 
@@ -112,29 +111,22 @@ def extract_metric_rows(entry):
                  return d[k]
          return ''
 
-     def iter_score_fields(e):
-         container = e.get('results') if isinstance(e, dict) else None
-         if isinstance(container, dict):
-             for k, v in container.items():
-                 if isinstance(k, str) and k.endswith('_score'):
-                     yield k, v
-             return
-         # fallback to top-level flat structure
-         for k, v in e.items():
-             if isinstance(k, str) and k.endswith('_score'):
-                 yield k, v
-
-     for key, raw in iter_score_fields(entry):
-         metric_id = key[:-6] # strip "_score"
+     results_container = entry.get('results', {}) if isinstance(entry, dict) else {}
+
+     for eval_name, raw in results_container.items():
+         if raw is None:
+             continue # Skip metrics that did not run for this prompt
+
          metric_obj = parse_score(raw) if isinstance(raw, (str, bytes)) else (raw or {})
 
-         display_name = metric_id.replace('_', ' ').title()
+         display_name = pascal_case_to_title(eval_name)
+         metric_id = METRIC_IDS.get(eval_name, eval_name.lower())
 
          # Candidate key patterns inside the parsed metric object
-         score_val = pick(metric_obj, [metric_id, f'{metric_id}_score', 'score', 'value'])
-         result_val = pick(metric_obj, [f'{metric_id}_result', 'result', 'status'])
-         threshold_val = pick(metric_obj, [f'{metric_id}_threshold', 'threshold', 'min_threshold', 'expected'])
-         reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason', 'rationale', 'explanation'])
+         score_val = pick(metric_obj, [metric_id])
+         result_val = pick(metric_obj, ['result'])
+         threshold_val = pick(metric_obj, ['threshold'])
+         reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason'])
 
          rows.append({
              'Metric': display_name,
@@ -145,7 +137,13 @@ def extract_metric_rows(entry):
          })
      return rows
 
- def generate_html_report(results):
+ def _escape(text):
+     """HTML-escape user-controlled content to prevent XSS."""
+     if text is None:
+         return ""
+     return html_module.escape(str(text))
+
+ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=None):
      html = [
          '<!DOCTYPE html>',
          '<html lang="en">',
@@ -172,29 +170,43 @@ def generate_html_report(results):
          ' .pass-rate-good { background: #fef9e7; color: #7d6608; font-weight: bold; }',
          ' .pass-rate-poor { background: #fadbd8; color: #b03a2e; font-weight: bold; }',
          ' .individual-results { margin-top: 3em; }',
+         ' .evaluator-badge { display: inline-block; padding: 2px 8px; margin: 2px; border-radius: 4px; font-size: 0.85em; background: #e8eaf6; color: #283593; }',
          ' </style>',
          '</head>',
          '<body>',
          ' <h1> M365 Copilot Agents Evaluation Report</h1>',
      ]
 
+     # Add metadata section
+     metadata_items = []
+     if agent_name:
+         metadata_items.append(f'<strong>Agent Name:</strong> {_escape(agent_name)}')
+     if agent_id:
+         metadata_items.append(f'<strong>Agent ID:</strong> {_escape(agent_id)}')
+     if cli_version:
+         metadata_items.append(f'<strong>CLI Version:</strong> {_escape(cli_version)}')
+     if metadata_items:
+         html.append(f' <p style="color: #666; font-size: 0.95em;">{" &nbsp;|&nbsp; ".join(metadata_items)}</p>')
+
      # Add aggregate statistics if multiple results
      if len(results) > 1:
          aggregates = calculate_aggregate_statistics(results)
          if aggregates:
              html.append('<div class="aggregate-section">')
              html.append(f'<h2> Aggregate Statistics ({len(results)} prompts evaluated)</h2>')
-
-             # Create aggregate table with same style as individual results
+
              html.append('<table>')
-             html.append('<tr><th>Metric</th><th>Pass Rate</th><th>Passed</th><th>Failed</th><th>Avg Score</th><th>Threshold</th></tr>')
-
+             html.append('<tr><th>Metric</th><th>Prompts</th><th>Pass Rate</th><th>Passed</th><th>Failed</th><th>Avg Score</th><th>Threshold</th></tr>')
+
              for metric_name, stats in aggregates.items():
                  pass_rate_class = 'pass-rate-excellent' if stats['pass_rate'] >= 80 else 'pass-rate-good' if stats['pass_rate'] >= 60 else 'pass-rate-poor'
-                 threshold_display = stats.get('threshold', 'N/A')
+                 threshold_display = _escape(str(stats.get('threshold', 'N/A')))
+                 prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                 total_prompts = stats.get('total_prompts', len(results))
                  html.append(f'''
                      <tr>
-                         <td><strong>{metric_name}</strong></td>
+                         <td><strong>{_escape(metric_name)}</strong></td>
+                         <td>{prompts_evaluated}/{total_prompts}</td>
                          <td class="{pass_rate_class}">{stats['pass_rate']:.1f}%</td>
                          <td class="pass">{stats['pass_count']}</td>
                          <td class="fail">{stats['fail_count']}</td>
202
214
  <td>{threshold_display}</td>
203
215
  </tr>
204
216
  ''')
205
-
217
+
206
218
  html.append('</table>')
207
219
  html.append('</div>')
208
220
 
@@ -211,34 +223,42 @@ def generate_html_report(results):
211
223
  html.append('<h2> Individual Results</h2>')
212
224
 
213
225
  for idx, entry in enumerate(results, 1):
214
- html.append(f'<h3>Prompt {idx}: {entry.get("prompt", "")}</h3>')
226
+ html.append(f'<h3>Prompt {idx}: {_escape(entry.get("prompt", ""))}</h3>')
227
+
228
+ # Show evaluator badges for this prompt
229
+ evaluators_ran = entry.get('evaluators_ran', [])
230
+ if evaluators_ran:
231
+ badges = ''.join(f'<span class="evaluator-badge">{_escape(e)}</span>' for e in evaluators_ran)
232
+ html.append(f'<p>Evaluators: {badges}</p>')
233
+
215
234
  html.append('<table>')
216
- html.append('<tr><th>Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("response", ""))))
217
- html.append('<tr><th>Expected Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("expected_response", ""))))
235
+ html.append('<tr><th>Response</th><td>{}</td></tr>'.format(markdown.markdown(_escape(entry.get("response", "")))))
236
+ html.append('<tr><th>Expected Response</th><td>{}</td></tr>'.format(markdown.markdown(_escape(entry.get("expected_response", "")))))
218
237
  html.append('</table>')
219
238
 
220
239
  score_rows = extract_metric_rows(entry)
221
- score_df = pd.DataFrame(score_rows)
222
-
223
- def highlight_result(val):
224
- lv = str(val).lower()
225
- if lv == 'pass':
226
- return 'background-color: #d4edda; color: #155724;'
227
- elif lv == 'fail':
228
- return 'background-color: #f8d7da; color: #721c24;'
229
- return ''
230
-
231
- score_html = (
232
- score_df.style
233
- .map(highlight_result, subset=['Result'])
234
- .set_table_attributes('style="margin-top:1em;"')
235
- .hide(axis="index")
236
- .to_html()
237
- )
238
-
239
- html.append('<details open class="score-details"><summary>Show/Hide Evaluation Metric Scores</summary>')
240
- html.append(score_html)
241
- html.append('</details>')
240
+ if score_rows:
241
+ score_df = pd.DataFrame(score_rows)
242
+
243
+ def highlight_result(val):
244
+ lv = str(val).lower()
245
+ if lv == 'pass':
246
+ return 'background-color: #d4edda; color: #155724;'
247
+ elif lv == 'fail':
248
+ return 'background-color: #f8d7da; color: #721c24;'
249
+ return ''
250
+
251
+ score_html = (
252
+ score_df.style
253
+ .map(highlight_result, subset=['Result'])
254
+ .set_table_attributes('style="margin-top:1em;"')
255
+ .hide(axis="index")
256
+ .to_html()
257
+ )
258
+
259
+ html.append('<details open class="score-details"><summary>Show/Hide Evaluation Metric Scores</summary>')
260
+ html.append(score_html)
261
+ html.append('</details>')
242
262
 
243
263
  html.append(f'<p><em>Entry {idx} of {len(results)}</em></p>')
244
264
  if idx < len(results): # Don't add HR after last item
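
The report changes above read a specific per-prompt result shape. A hedged reconstruction of one entry, with field names taken from the code in this diff (the evaluator-name keys and payload values are illustrative; None marks an evaluator that did not run and is now skipped rather than rendered):

# Illustrative only: one evaluation entry as consumed by
# calculate_aggregate_statistics / extract_metric_rows / generate_html_report.
entry = {
    "prompt": "What is our refund policy?",
    "response": "Refunds are processed within 5 business days.",
    "expected_response": "Refunds take up to 5 business days.",
    "evaluators_ran": ["Groundedness"],  # rendered as badges per prompt
    "results": {
        # evaluator name -> JSON string (or dict) carrying the metric id, result, threshold, reason
        "Groundedness": '{"groundedness": 4, "result": "pass", "threshold": 3, "reason": "Grounded in context."}',
        "ExactMatch": None,  # evaluator did not run for this prompt
    },
}

In this sketch, calculate_aggregate_statistics([entry]) and extract_metric_rows(entry) skip the None value, and the new _escape helper is applied to the prompt, responses, and report metadata before they are embedded in the HTML.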