npm - @microsoft/m365-copilot-eval - Versions diffs - 1.6.0-preview.1 → 1.7.0-preview.1 - Mend

@microsoft/m365-copilot-eval 1.6.0-preview.1 → 1.7.0-preview.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/README.md +4 -1
package/package.json +2 -2
package/schema/v1/eval-document.schema.json +144 -333
package/schema/v1/examples/invalid/error-result-with-score.json +16 -0
package/schema/v1/examples/invalid/missing-error-on-error.json +13 -0
package/schema/v1/examples/valid/multi-turn-output.json +2 -0
package/schema/v1/examples/valid/scenarios-with-mixed-errors.json +239 -0
package/src/clients/cli/common.py +8 -14
package/src/clients/cli/error_messages.py +91 -0
package/src/clients/cli/evaluation_runner.py +108 -97
package/src/clients/cli/evaluator_resolver.py +8 -33
package/src/clients/cli/generate_report.py +125 -96
package/src/clients/cli/readme.md +1 -1
package/src/clients/cli/result_writer.py +129 -110
package/src/clients/cli/status_derivation.py +91 -0
package/src/clients/node-js/config/default.js +1 -1
package/src/clients/node-js/lib/env-loader.js +20 -13

package/src/clients/cli/generate_report.py CHANGED Viewed

@@ -2,6 +2,7 @@ import html as html_module
 import markdown
 from common import METRIC_IDS, STATUS_PASS, STATUS_FAIL, STATUS_ERROR, STATUS_PARTIAL, STATUS_UNKNOWN, pascal_case_to_title
 from datetime import datetime, timezone
+from evaluator_resolver import EVALUATOR_REGISTRY
 def calculate_aggregate_statistics(results):
     """Calculate aggregate statistics across all evaluation results.
@@ -36,6 +37,7 @@ def calculate_aggregate_statistics(results):
         scores = []
         pass_count = 0
         fail_count = 0
+        error_count = 0
         threshold_value = None
         prompts_evaluated = 0
@@ -49,19 +51,20 @@ def calculate_aggregate_statistics(results):
             prompts_evaluated += 1
             try:
                 score = parsed_data.get(metric_id)
                 result_status = parsed_data.get('result')
                 threshold = parsed_data.get('threshold')
                 if score is not None:
                     scores.append(float(score))
                 if result_status:
-                    if str(result_status).lower() == STATUS_PASS:
+                    status = str(result_status).lower()
+                    if status == STATUS_PASS:
                         pass_count += 1
-                    elif str(result_status).lower() == STATUS_FAIL:
+                    elif status == STATUS_FAIL:
                         fail_count += 1
+                    elif status == STATUS_ERROR:
+                        error_count += 1
                 if threshold is not None and threshold_value is None:
                     threshold_value = threshold
@@ -69,17 +72,33 @@ def calculate_aggregate_statistics(results):
             except (ValueError, TypeError):
                 continue
-        if scores or pass_count > 0 or fail_count > 0:
+        # Surface evaluators that ran in any form — including those whose only
+        # attempts errored. Suppressing error-only evaluators would hide them
+        # from the aggregate report (SC-001).
+        if scores or pass_count > 0 or fail_count > 0 or error_count > 0:
             avg_score = sum(scores) / len(scores) if scores else 0
+            # Per-evaluator pass rate is "agreement among completed evaluations" —
+            # errors are surfaced separately as a count, not folded into the rate.
             total_evaluated = pass_count + fail_count
             pass_rate = (pass_count / total_evaluated * 100) if total_evaluated > 0 else 0
+            # Defensive fallback: if no per-entry threshold was recorded
+            # (shouldn't happen — both successful and errored runtime entries
+            # carry it — but guard against malformed input), use the registry
+            # default. Evaluators with no registry default (e.g. ExactMatch)
+            # legitimately have threshold=None.
+            if threshold_value is None:
+                registry_entry = EVALUATOR_REGISTRY.get(eval_name)
+                if registry_entry is not None:
+                    threshold_value = registry_entry.default_threshold
             aggregates[display_name] = {
                 'total_prompts': len(flat_results),
                 'prompts_evaluated': prompts_evaluated,
                 'total_evaluated': total_evaluated,
                 'pass_count': pass_count,
                 'fail_count': fail_count,
+                'error_count': error_count,
                 'pass_rate': pass_rate,
                 'avg_score': avg_score,
                 'threshold': threshold_value,
@@ -127,7 +146,13 @@ def extract_metric_rows(entry):
         score_val = pick(metric_obj, [metric_id])
         result_val = pick(metric_obj, ['result'])
         threshold_val = pick(metric_obj, ['threshold'])
-        reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason'])
+        # Errored entries carry the per-evaluator failure description in `error`
+        # (e.g. "Evaluator failed: Connection timeout"). Surface it in the Reason
+        # column so HTML readers see why the evaluator couldn't produce a result.
+        if result_val == STATUS_ERROR:
+            reason_val = metric_obj.get('error', '')
+        else:
+            reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason'])
         rows.append({
             'Metric': display_name,
@@ -138,35 +163,29 @@ def extract_metric_rows(entry):
         })
     return rows
-def prompt_passed(entry):
-    """Determine whether a prompt passed all evaluations.
+_CHIP_CLASSES = {
+    STATUS_PASS: "status-pass",
+    STATUS_FAIL: "status-fail",
+    STATUS_PARTIAL: "status-partial",
+    STATUS_ERROR: "status-error",
+}
-    Centralized predicate used by both the summary banner and per-prompt
-    cards so that pass/fail counts stay consistent.
-    Called in two contexts:
-      - On un-flattened results: multi-turn threads have type="multi_turn"
-        and are evaluated via their summary.overall_status.
-      - On flattened results (banner counts): individual turns have
-        status="pass"/"fail"/"error" and are evaluated like single-turn items.
+def _chip_class(status):
+    """Map a status value to its chip CSS class. Unknown statuses fall back to status-error."""
+    return _CHIP_CLASSES.get(status, "status-error")
-    A prompt/turn fails when:
-      - it is a multi-turn thread with overall_status != 'pass', OR
-      - its status is explicitly 'fail' or 'error', OR
-      - any metric result is explicitly 'fail'.
-    Otherwise it is considered passed (including prompts with no metric rows).
+def classify_attempt(entry):
+    """Return one of {pass, fail, partial, error} for an attempt or a thread.
+    For an un-flattened multi-turn thread, returns the thread's overall_status.
+    For a single-turn item or a per-turn entry (from a flattened thread), returns
+    the entry's status — which is set authoritatively by the runner.
     """
     if entry.get("type") == "multi_turn":
-        summary = entry.get("summary", {})
-        return summary.get("overall_status") == STATUS_PASS
-    status = str(entry.get('status', '')).lower()
-    if status in (STATUS_FAIL, STATUS_ERROR):
-        return False
-    metric_rows = extract_metric_rows(entry)
-    if any(str(row.get('Result', '')).lower() == STATUS_FAIL for row in metric_rows):
-        return False
-    return True
+        return entry.get("summary", {}).get("overall_status", STATUS_UNKNOWN)
+    return entry.get("status", STATUS_UNKNOWN)
 def _escape(text):
     """HTML-escape user-controlled content to prevent XSS."""
@@ -174,6 +193,35 @@ def _escape(text):
         return ""
     return html_module.escape(str(text))
+_CELL_CLASSES = {
+    STATUS_PASS: "cell-pass",
+    STATUS_FAIL: "cell-fail",
+    STATUS_ERROR: "cell-error",
+}
+def _render_metric_table(html, rows):
+    """Append a metric-table block to ``html`` (no-op if rows is empty)."""
+    if not rows:
+        return
+    html.append('            <table class="metric-table">')
+    html.append('              <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
+    for row in rows:
+        result_val = str(row.get("Result", "")).lower()
+        cell_class = _CELL_CLASSES.get(result_val)
+        result_attr = f' class="{cell_class}"' if cell_class else ""
+        html.append(
+            '<tr>'
+            f'<td>{_escape(row.get("Metric", ""))}</td>'
+            f'<td{result_attr}>{_escape(str(row.get("Result", "")))}</td>'
+            f'<td>{_escape(str(row.get("Score", "")))}</td>'
+            f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
+            f'<td>{_escape(str(row.get("Reason", "")))}</td>'
+            '</tr>'
+        )
+    html.append('            </table>')
 def generate_html_report(results, agent_name=None, agent_id=None, cli_version=None):
     aggregates = calculate_aggregate_statistics(results)
@@ -186,9 +234,14 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
             flat_items.append(entry)
     total_prompts = len(flat_items)
-    passed_prompts = sum(1 for item in flat_items if prompt_passed(item))
-    failed_prompt_count = total_prompts - passed_prompts
-    overall_pass_rate = (passed_prompts / total_prompts * 100) if total_prompts else 0
+    counts = {STATUS_PASS: 0, STATUS_FAIL: 0, STATUS_PARTIAL: 0, STATUS_ERROR: 0}
+    for item in flat_items:
+        c = classify_attempt(item)
+        if c in counts:
+            counts[c] += 1
+    incomplete_count = counts[STATUS_PARTIAL] + counts[STATUS_ERROR]
+    decisive_count = counts[STATUS_PASS] + counts[STATUS_FAIL]
+    overall_pass_rate = (counts[STATUS_PASS] / decisive_count * 100) if decisive_count else 0
     generated_utc = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
     html = [
@@ -208,6 +261,10 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
         '      --ok-ink: #15603a;',
         '      --bad-bg: #fdecec;',
         '      --bad-ink: #8b1e2f;',
+        '      --warn-bg: #fff4e0;',
+        '      --warn-ink: #8a5a00;',
+        '      --neutral-bg: #ececec;',
+        '      --neutral-ink: #4a4a4a;',
         '      --border: #dde2ea;',
         '      --bar-track: #e8edf5;',
         '      --bar-fill: #2b6cb0;',
@@ -217,7 +274,7 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
         '    .container { max-width: 1100px; margin: 0 auto; padding: 24px 18px 40px; }',
         '    h1 { margin: 0 0 8px; }',
         '    .meta { color: var(--muted); margin-bottom: 20px; }',
-        '    .summary-banner { display: grid; grid-template-columns: repeat(4, minmax(140px, 1fr)); gap: 12px; margin: 16px 0 24px; }',
+        '    .summary-banner { display: grid; grid-template-columns: repeat(5, minmax(120px, 1fr)); gap: 12px; margin: 16px 0 24px; }',
         '    .summary-tile { background: var(--panel); border: 1px solid var(--border); border-radius: 12px; padding: 14px; }',
         '    .summary-label { display: block; font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: .06em; }',
         '    .summary-value { display: block; margin-top: 6px; font-size: 24px; font-weight: 700; }',
@@ -231,6 +288,8 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
         '    .status-chip { display: inline-block; padding: 3px 8px; border-radius: 999px; font-size: 12px; font-weight: 600; margin-bottom: 10px; }',
         '    .status-pass { background: var(--ok-bg); color: var(--ok-ink); }',
         '    .status-fail { background: var(--bad-bg); color: var(--bad-ink); }',
+        '    .status-partial { background: var(--warn-bg); color: var(--warn-ink); }',
+        '    .status-error { background: var(--neutral-bg); color: var(--neutral-ink); }',
         '    .prompt-card h3 { margin: 0 0 8px; font-size: 16px; }',
         '    .kv { margin: 8px 0; }',
         '    .kv > strong { display: block; min-width: 130px; color: var(--muted); margin-bottom: 4px; }',
@@ -248,9 +307,10 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
         '    .metric-table th { background: #f4f6fa; }',
         '    .metric-table .cell-pass { background: var(--ok-bg); color: var(--ok-ink); font-weight: 600; }',
         '    .metric-table .cell-fail { background: var(--bad-bg); color: var(--bad-ink); font-weight: 600; }',
+        '    .metric-table .cell-error { background: var(--neutral-bg); color: var(--neutral-ink); font-weight: 600; }',
         '    .evaluator-badge { display: inline-block; padding: 2px 8px; margin: 2px; border-radius: 4px; font-size: 0.85em; background: #e8eaf6; color: #283593; }',
         '    .footer { margin-top: 20px; color: var(--muted); font-size: 13px; }',
-        '    @media (max-width: 760px) { .summary-banner { grid-template-columns: repeat(2, minmax(140px, 1fr)); } .kv strong { min-width: 90px; } }',
+        '    @media (max-width: 760px) { .summary-banner { grid-template-columns: repeat(2, minmax(120px, 1fr)); } .kv strong { min-width: 90px; } }',
         '  </style>',
         '</head>',
         '<body>',
@@ -269,9 +329,10 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
         html.append(f'    <p class="meta">{" | ".join(metadata_items)}</p>')
     html.append('    <section class="summary-banner" aria-label="summary banner">')
-    html.append(f'      <div class="summary-tile"><span class="summary-label">Total Prompts</span><span class="summary-value">{total_prompts}</span></div>')
-    html.append(f'      <div class="summary-tile"><span class="summary-label">Passed</span><span class="summary-value">{passed_prompts}</span></div>')
-    html.append(f'      <div class="summary-tile"><span class="summary-label">Failed</span><span class="summary-value">{failed_prompt_count}</span></div>')
+    html.append(f'      <div class="summary-tile"><span class="summary-label">Total</span><span class="summary-value">{total_prompts}</span></div>')
+    html.append(f'      <div class="summary-tile"><span class="summary-label">Passed</span><span class="summary-value">{counts[STATUS_PASS]}</span></div>')
+    html.append(f'      <div class="summary-tile"><span class="summary-label">Failed</span><span class="summary-value">{counts[STATUS_FAIL]}</span></div>')
+    html.append(f'      <div class="summary-tile"><span class="summary-label">Incomplete</span><span class="summary-value">{incomplete_count}</span></div>')
     html.append(f'      <div class="summary-tile"><span class="summary-label">Pass Rate</span><span class="summary-value">{overall_pass_rate:.1f}%</span></div>')
     html.append('    </section>')
@@ -283,11 +344,14 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
             prompts_evaluated = stats.get('prompts_evaluated', stats.get('total_evaluated', 0))
             html.append('<div class="evaluator-row">')
             avg_score = stats.get('avg_score', 0)
-            threshold = stats.get('threshold', 'N/A')
+            threshold_val = stats.get('threshold')
+            threshold_str = "N/A" if threshold_val is None else str(threshold_val)
+            error_count = stats.get('error_count', 0)
+            error_clause = f' / {error_count} error' if error_count else ''
             html.append(
                 f'<div class="evaluator-head"><strong>{_escape(metric_name)}</strong>'
-                f'<span>{pass_rate:.1f}% ({stats.get("pass_count", 0)} pass / {stats.get("fail_count", 0)} fail, {prompts_evaluated}/{total_prompts} prompts)'
-                f' &middot; Avg Score: {avg_score:.2f} &middot; Threshold: {_escape(str(threshold))}</span></div>'
+                f'<span>{pass_rate:.1f}% ({stats.get("pass_count", 0)} pass / {stats.get("fail_count", 0)} fail{error_clause}, {prompts_evaluated}/{total_prompts} prompts)'
+                f' &middot; Avg Score: {avg_score:.2f} &middot; Threshold: {_escape(threshold_str)}</span></div>'
             )
             html.append('<div class="progress-track" role="progressbar" aria-valuemin="0" aria-valuemax="100" aria-valuenow="{:.1f}" aria-label="{} pass rate">'.format(pass_rate, _escape(metric_name)))
             html.append(f'<div class="progress-fill" style="width:{pass_rate:.1f}%"></div></div>')
@@ -305,27 +369,18 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
             # Multi-turn thread card
             thread_name = _escape(entry.get("name", "Unnamed Thread"))
             summary = entry.get("summary", {})
-            status = summary.get("overall_status", STATUS_UNKNOWN)
-            is_passed = status == STATUS_PASS
-            chip_class = 'status-pass' if is_passed else 'status-fail'
-            chip_text = 'PASSED' if is_passed else ('PARTIAL' if status == STATUS_PARTIAL else 'FAILED')
+            thread_status = summary.get("overall_status", STATUS_UNKNOWN)
             html.append('        <article class="prompt-card">')
-            html.append(f'          <span class="status-chip {chip_class}">{chip_text}</span>')
+            html.append(f'          <span class="status-chip {_chip_class(thread_status)}">{thread_status.upper()}</span>')
             html.append(f'          <h3>Thread {idx}: {thread_name}</h3>')
             html.append(f'          <p>{summary.get("turns_passed", 0)}/{summary.get("turns_total", 0)} turns passed</p>')
             for t_idx, turn in enumerate(entry.get("turns", []), 1):
                 turn_status = turn.get("status", STATUS_UNKNOWN)
-                turn_chip_class = 'status-pass' if turn_status == STATUS_PASS else 'status-fail'
-                turn_chip_text = {
-                    STATUS_PASS: 'PASSED',
-                    STATUS_FAIL: 'FAILED',
-                    STATUS_ERROR: 'ERROR',
-                }.get(turn_status, turn_status.upper())
                 html.append(f'          <div style="margin-left:16px;padding:8px 0;border-top:1px solid var(--border);">')
-                html.append(f'            <span class="status-chip {turn_chip_class}">{turn_chip_text}</span>')
+                html.append(f'            <span class="status-chip {_chip_class(turn_status)}">{turn_status.upper()}</span>')
                 html.append(f'            <strong>Turn {t_idx}:</strong> {_escape(turn.get("prompt", ""))}')
                 turn_evaluators = turn.get('evaluators_ran', [])
@@ -335,38 +390,24 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
                 if turn.get("response"):
                     html.append(f'            <div class="kv"><strong>Response:</strong><div class="md-content">{markdown.markdown(_escape(turn.get("response", "")))}</div></div>')
-                if turn.get("error"):
-                    html.append(f'            <p class="kv"><strong>Error:</strong> {_escape(turn["error"])}</p>')
-                turn_rows = extract_metric_rows(turn)
-                if turn_rows:
-                    html.append('            <table class="metric-table">')
-                    html.append('              <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
-                    for row in turn_rows:
-                        result_val = str(row.get("Result", "")).lower()
-                        result_class = ' class="cell-pass"' if result_val == STATUS_PASS else ' class="cell-fail"' if result_val == STATUS_FAIL else ""
-                        html.append(
-                            '<tr>'
-                            f'<td>{_escape(row.get("Metric", ""))}</td>'
-                            f'<td{result_class}>{_escape(str(row.get("Result", "")))}</td>'
-                            f'<td>{_escape(str(row.get("Score", "")))}</td>'
-                            f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
-                            f'<td>{_escape(str(row.get("Reason", "")))}</td>'
-                            '</tr>'
-                        )
-                    html.append('            </table>')
+                turn_error = turn.get("error")
+                if turn_error:
+                    html.append(
+                        f'            <p class="kv" data-error-code="{_escape(turn_error.get("code", ""))}">'
+                        f'<strong>Error:</strong> {_escape(turn_error.get("message", ""))}</p>'
+                    )
+                _render_metric_table(html, extract_metric_rows(turn))
                 html.append('          </div>')
             html.append('        </article>')
         else:
             score_rows = extract_metric_rows(entry)
-            is_passed = prompt_passed(entry)
-            chip_class = 'status-pass' if is_passed else 'status-fail'
-            chip_text = 'PASSED' if is_passed else 'FAILED'
+            item_status = classify_attempt(entry)
             html.append('        <article class="prompt-card">')
-            html.append(f'          <span class="status-chip {chip_class}">{chip_text}</span>')
+            html.append(f'          <span class="status-chip {_chip_class(item_status)}">{item_status.upper()}</span>')
             html.append(f'          <h3>Prompt {idx}: {_escape(entry.get("prompt", ""))}</h3>')
             evaluators_ran = entry.get('evaluators_ran', [])
@@ -377,26 +418,14 @@ def generate_html_report(results, agent_name=None, agent_id=None, cli_version=No
             html.append(f'          <div class="kv"><strong>Response:</strong><div class="md-content">{markdown.markdown(_escape(entry.get("response", "")))}</div></div>')
             html.append(f'          <div class="kv"><strong>Expected:</strong><div class="md-content">{markdown.markdown(_escape(entry.get("expected_response", "")))}</div></div>')
-            error_details = entry.get('error_details') or entry.get('errorDetails')
-            if error_details:
-                html.append(f'          <p class="kv"><strong>Error Details:</strong> {_escape(error_details)}</p>')
+            item_error = entry.get('error')
+            if item_error:
+                html.append(
+                    f'          <p class="kv" data-error-code="{_escape(item_error.get("code", ""))}">'
+                    f'<strong>Error:</strong> {_escape(item_error.get("message", ""))}</p>'
+                )
-            if score_rows:
-                html.append('          <table class="metric-table">')
-                html.append('            <tr><th>Metric</th><th>Result</th><th>Score</th><th>Threshold</th><th>Reason</th></tr>')
-                for row in score_rows:
-                    result_val = str(row.get("Result", "")).lower()
-                    result_class = ' class="cell-pass"' if result_val == STATUS_PASS else ' class="cell-fail"' if result_val == STATUS_FAIL else ""
-                    html.append(
-                        '<tr>'
-                        f'<td>{_escape(row.get("Metric", ""))}</td>'
-                        f'<td{result_class}>{_escape(str(row.get("Result", "")))}</td>'
-                        f'<td>{_escape(str(row.get("Score", "")))}</td>'
-                        f'<td>{_escape(str(row.get("Threshold", "")))}</td>'
-                        f'<td>{_escape(str(row.get("Reason", "")))}</td>'
-                        '</tr>'
-                    )
-                html.append('          </table>')
+            _render_metric_table(html, score_rows)
             html.append('        </article>')

package/src/clients/cli/readme.md CHANGED Viewed

@@ -36,7 +36,7 @@ AZURE_AI_API_KEY="<azure-openai-key>"
 AZURE_AI_API_VERSION="2024-12-01-preview"
 AZURE_AI_MODEL_NAME="gpt-4o-mini"
-# Your Tenant Id
+# Your Tenant ID (or use TEAMS_APP_TENANT_ID from ATK .env.local)
 TENANT_ID="<aad-tenant-id>"
 # Optional: default agent id (overridable via --m365-agent-id)