npm - donobu - Versions diffs - 5.46.0 → 5.48.0 - Mend

donobu 5.46.0 → 5.48.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/dist/cli/donobu-cli.js +90 -34
package/dist/esm/cli/donobu-cli.js +90 -34
package/dist/esm/lib/test/testExtension.js +38 -0
package/dist/esm/lib/test/utils/triageTestFailure.d.ts +27 -5
package/dist/esm/lib/test/utils/triageTestFailure.js +80 -37
package/dist/esm/reporter/render.js +108 -15
package/dist/lib/test/testExtension.js +38 -0
package/dist/lib/test/utils/triageTestFailure.d.ts +27 -5
package/dist/lib/test/utils/triageTestFailure.js +80 -37
package/dist/reporter/render.js +108 -15
package/package.json +1 -1

package/dist/esm/lib/test/utils/triageTestFailure.js CHANGED

@@ -37,6 +37,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 };
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.TRIAGE_PERSISTENCE_FILE_IDS = exports.TreatmentPlan = exports.RemediationStepSchema = exports.FailureReasonSchema = exports.AdditionalDataRequestSchema = void 0;
+exports.captureLivePageScreenshot = captureLivePageScreenshot;
 exports.deriveHeuristicAssessment = deriveHeuristicAssessment;
 exports.deriveHistoricalSignals = deriveHistoricalSignals;
 exports.gatherTestFailureEvidence = gatherTestFailureEvidence;
@@ -79,9 +80,10 @@ const cacheLocator_1 = require("../../ai/cache/cacheLocator");
  *      history from the persistence layer.
  *   3. Fetches **historical runs** of the same flow (by name) from the flows manager to
  *      detect flakiness, regression patterns, and prior self-heal success.
- *   4. Captures the **failure screenshot** (last tool call screenshot from the current
- *      run) and the **baseline screenshot** (last tool call screenshot from the most
- *      recent successful historical run) for visual comparison.
+ *   4. Captures the **failure screenshot** (a live screenshot taken at triage time, while
+ *      the page is still open during teardown, so it reflects the true final state) and the
+ *      **baseline screenshot** (last tool call screenshot from the most recent successful
+ *      historical run) for visual comparison.
  *   5. Reads the source of the failing test case for contextual grounding.
  *   6. Runs the **heuristic classifier** (`deriveHeuristicAssessment`) which uses
  *      rule-based pattern matching over errors, tool calls, stale-cache indicators,
@@ -121,7 +123,7 @@ const cacheLocator_1 = require("../../ai/cache/cacheLocator");
  * | Flow metadata           | `DonobuExtendedPage._dnb`     | Run mode, objective, allowed tools, timing         |
  * | Stale cache indicators  | Derived from above            | Whether page.ai cache staleness is the root cause  |
  * | Historical flow runs    | `DonobuFlowsManager.getFlows` | Flakiness, regression patterns, prior self-heal    |
- * | Failure screenshot      | Last tool call screenshot     | Visual state of the page when the failure occurred |
+ * | Failure screenshot      | Live capture at triage time   | True final visual state of the page when it failed |
  * | Baseline screenshot     | Last successful run's screenshot | Visual reference for what the page *should* look like |
  * | Test source snippet     | TypeScript AST parsing        | The test's expectations and structure               |
  *
@@ -331,6 +333,14 @@ const TRIAGE_PERSISTENCE_FILE_IDS = {
     evidence: 'triage-evidence.json',
     failureScreenshot: 'triage-failure-screenshot.png',
     baselineScreenshot: 'triage-baseline-screenshot.png',
+    /**
+     * Live screenshot of a flow's final visual state, captured at teardown while
+     * the page is still open. Persisted on successful runs so that a *later*
+     * failing run can use it as a true final-state baseline — symmetric with the
+     * failure screenshot, which is also a live end-of-test capture. Keyed per
+     * flow, like browser state.
+     */
+    finalStateScreenshot: 'triage-final-state-screenshot.png',
 };
 exports.TRIAGE_PERSISTENCE_FILE_IDS = TRIAGE_PERSISTENCE_FILE_IDS;
 /**
@@ -554,41 +564,59 @@ async function fetchFlowHistory(page) {
     }
 }
 /**
- * Retrieves the screenshot from the last completed tool call in the current flow.
- * Returns the raw PNG/JPEG buffer if available, or null. Fails open so triage
- * proceeds even if the screenshot cannot be loaded.
+ * Captures a fresh screenshot of the page's current visual state. Called at
+ * teardown (failure triage and successful-run baseline capture) while the
+ * page/context is still open, so it reflects the true *end state* of the test.
+ *
+ * This is deliberately preferred over the last Donobu tool-call screenshot:
+ * Playwright `expect`/`waitFor` are not tool calls, so the last tool-call image
+ * can predate the failing assertion and capture a transient state (e.g. a
+ * loading spinner that has since resolved), which misleads the vision model.
+ * Fails open — returns null if the page is gone or unresponsive (crash, closed
+ * context, hang), in which case the caller proceeds without a screenshot.
  */
-async function fetchLastToolCallScreenshot(page) {
-    const flowId = page._dnb?.donobuFlowMetadata?.id;
-    const persistence = page._dnb?.persistence;
-    if (!flowId || !persistence) {
-        return null;
-    }
+async function captureLivePageScreenshot(page) {
     try {
-        const toolCalls = await persistence.getToolCalls(flowId);
-        if (toolCalls.length === 0) {
-            return null;
-        }
-        // Walk backwards to find the last tool call with a screenshot
-        for (let i = toolCalls.length - 1; i >= 0; i--) {
-            const screenshotId = toolCalls[i].postCallImageId;
-            if (screenshotId) {
-                return await persistence.getScreenShot(flowId, screenshotId);
-            }
-        }
-        return null;
+        return await page.screenshot({ animations: 'disabled', timeout: 10000 });
     }
     catch (error) {
-        Logger_1.appLogger.debug(`Failed to fetch last tool call screenshot for flow ${flowId}.`, error);
+        Logger_1.appLogger.debug('Failed to capture live page screenshot; proceeding without it.', error);
         return null;
     }
 }
 /**
- * Loads the final screenshot from a historical successful run to serve as a
- * visual baseline for comparison with the current failure state. This enables
- * the GPT triage agent to detect page redesigns and stale cache scenarios by
- * comparing "what the page looked like when it last worked" vs "what it looks
- * like now." Fails open — returns null if the screenshot cannot be retrieved.
+ * The failure screenshot for the current run. Prefers the final-state
+ * screenshot persisted at teardown (the single source of truth shared with
+ * baselines), and falls back to a live capture when it is missing — e.g. triage
+ * invoked outside the standard teardown, or the teardown capture failed.
+ */
+async function fetchCurrentRunFinalStateScreenshot(page) {
+    const flowId = page._dnb?.donobuFlowMetadata?.id;
+    const persistence = page._dnb?.persistence;
+    if (flowId && persistence) {
+        try {
+            const persisted = await persistence.getFlowFile(flowId, TRIAGE_PERSISTENCE_FILE_IDS.finalStateScreenshot);
+            if (persisted) {
+                return persisted;
+            }
+        }
+        catch (error) {
+            Logger_1.appLogger.debug(`Failed to read persisted final-state screenshot for flow ${flowId}; falling back to a live capture.`, error);
+        }
+    }
+    return captureLivePageScreenshot(page);
+}
+/**
+ * Loads a baseline screenshot from a historical successful run so the GPT
+ * triage agent can compare "what the page looked like when it last worked" vs
+ * "what it looks like now" to detect redesigns and stale-cache scenarios.
+ *
+ * Prefers the persisted final-state screenshot (a live end-of-test capture
+ * written on successful runs) so the baseline is symmetric with the live
+ * failure screenshot — both true end states. Falls back to the last tool-call
+ * image for runs that predate final-state capture; that image can be a
+ * mid-flow frame, so callers should treat such baselines as approximate.
+ * Fails open — returns null if no screenshot can be retrieved.
  */
 async function fetchBaselineScreenshot(page, historicalFlowId) {
     const persistence = page._dnb?.persistence;
@@ -596,10 +624,12 @@ async function fetchBaselineScreenshot(page, historicalFlowId) {
         return null;
     }
     try {
-        const toolCalls = await persistence.getToolCalls(historicalFlowId);
-        if (toolCalls.length === 0) {
-            return null;
+        const finalState = await persistence.getFlowFile(historicalFlowId, TRIAGE_PERSISTENCE_FILE_IDS.finalStateScreenshot);
+        if (finalState) {
+            return finalState;
         }
+        // Fallback for runs predating final-state capture: last tool-call image.
+        const toolCalls = await persistence.getToolCalls(historicalFlowId);
         for (let i = toolCalls.length - 1; i >= 0; i--) {
             const screenshotId = toolCalls[i].postCallImageId;
             if (screenshotId) {
@@ -1601,7 +1631,7 @@ async function gatherTestFailureEvidence(testInfo, page, options = {}) {
     // Capture screenshots for visual triage: current failure + baseline from last success
     const lastSuccessfulRunId = failureContext.flowHistory?.lastSuccessfulRunId ?? null;
     const [screenshotBuffer, baselineBuffer] = await Promise.all([
-        fetchLastToolCallScreenshot(page),
+        fetchCurrentRunFinalStateScreenshot(page),
         lastSuccessfulRunId
             ? fetchBaselineScreenshot(page, lastSuccessfulRunId)
             : Promise.resolve(null),
@@ -1805,10 +1835,20 @@ passed to each tool invocation. Use these to improve diagnosis:
 SCREENSHOT EVIDENCE:
 You may receive one or two screenshots:
-1. "FAILURE SCREENSHOT" — the state of the page at or near the point of failure in the current run.
+1. "FAILURE SCREENSHOT" — a live screenshot captured at triage time, immediately after the test
+   failed and while the page was still open. It reflects the true FINAL visual state of the page.
 2. "BASELINE SCREENSHOT" — the state of the page at the end of the most recent successful run of
    this same flow. This serves as a visual reference for what the page *should* look like.
+IMPORTANT — a screenshot is a single moment in time, not a recording:
+- Describe only what the frame shows. Do NOT assert that a state persisted for a duration — e.g.
+  "stuck on a loading spinner THROUGHOUT the test", "the page never loaded", "remained on X the
+  whole time". A single frame cannot establish how long anything lasted.
+- Only claim a persistent or temporal condition when it is corroborated by NON-visual evidence:
+  tool-call outcomes/durations, error messages, or timeouts in failureContext. Absent that, state
+  the end condition factually (e.g. "the final screenshot shows a loading spinner") and let the
+  other evidence determine duration and cause.
 When both screenshots are provided, compare them to:
 - Detect UI changes (redesigns, layout shifts, new modals) that would explain selector or cache failures.
 - Identify whether the failure screenshot shows a fundamentally different page state (error page, login wall)
@@ -1837,7 +1877,10 @@ When only the failure screenshot is provided (no baseline available), use it to:
     if (evidence.failureScreenshotPath) {
         try {
             const failureBytes = await fs.readFile(evidence.failureScreenshotPath);
-            userItems.push({ type: 'text', text: 'FAILURE SCREENSHOT (current run):' }, { type: 'png', bytes: new Uint8Array(failureBytes) });
+            userItems.push({
+                type: 'text',
+                text: 'FAILURE SCREENSHOT (live capture at triage time — true final state of the page):',
+            }, { type: 'png', bytes: new Uint8Array(failureBytes) });
         }
         catch (screenshotError) {
             Logger_1.appLogger.debug('Failed to load failure screenshot for GPT triage, proceeding with text only.', screenshotError);

package/dist/esm/reporter/render.js CHANGED

@@ -449,6 +449,33 @@ const REASON_LABELS = {
 function reasonCfg(reason) {
     return REASON_LABELS[reason] ?? REASON_LABELS['UNKNOWN'];
 }
+// Triage-detail flags derived from a treatment plan. A test can carry several
+// at once, so these form a multi-valued filter dimension (OR semantics).
+// Declaration order is the display order in the filter menu and chips, and
+// mirrors the flag order in `renderTriageCard`. Colors match `.triage-flag`.
+const TRIAGE_LABELS = {
+    retryable: { label: 'Retryable', color: '#10b981' },
+    code: { label: 'Needs Code Change', color: '#f59e0b' },
+    product: { label: 'Needs Product Fix', color: '#ef4444' },
+};
+/** The triage-flag keys present on a test's treatment plan, in display order. */
+function triageKeysOf(test) {
+    if (!test.plan) {
+        return [];
+    }
+    const p = test.plan.plan;
+    const keys = [];
+    if (p.shouldRetryAutomation) {
+        keys.push('retryable');
+    }
+    if (p.requiresCodeChange) {
+        keys.push('code');
+    }
+    if (p.requiresProductFix) {
+        keys.push('product');
+    }
+    return keys;
+}
 function renderAttachments(attachments, outputDir, stepScreenshots = []) {
     const rendered = [];
     for (const att of attachments) {
@@ -1680,7 +1707,7 @@ function renderHtml(report, triage, outputDir) {
             ? `<div class="flow-id-detail"><span class="detail-label">Flow ID</span><span class="flow-id-value">${esc(test.flowId)}<button class="copy-flow-id" data-flow-id="${esc(test.flowId)}" title="Copy flow ID"><svg viewBox="0 0 24 24"><rect width="14" height="14" x="8" y="8" rx="2" ry="2"/><path d="M4 16c-1.1 0-2-.9-2-2V4c0-1.1.9-2 2-2h10c1.1 0 2 .9 2 2"/></svg></button></span></div>`
             : '';
         testSectionsHtml += `
-      <div class="test-card ${sc.label.toLowerCase().replace(/ /g, '')} ${expandableClass}" id="${testId}" data-status="${test.status}" data-file="${esc(test.file)}" data-search="${esc((displayFilePath + ' ' + test.specTitle).toLowerCase())}" data-tags="${esc(JSON.stringify(test.tags))}"${test.plan ? ` data-reason="${esc(test.plan.plan.failureReason)}"` : ''} ${hasDetails ? `data-detail="${testId}"` : ''}>
+      <div class="test-card ${sc.label.toLowerCase().replace(/ /g, '')} ${expandableClass}" id="${testId}" data-status="${test.status}" data-file="${esc(test.file)}" data-search="${esc((displayFilePath + ' ' + test.specTitle).toLowerCase())}" data-tags="${esc(JSON.stringify(test.tags))}"${test.plan ? ` data-reason="${esc(test.plan.plan.failureReason)}"` : ''}${triageKeysOf(test).length ? ` data-triage="${esc(triageKeysOf(test).join(','))}"` : ''} ${hasDetails ? `data-detail="${testId}"` : ''}>
         <div class="test-summary">
           ${chevron}
           <span class="status-dot" style="background:${sc.color}" title="${sc.label}"></span>
@@ -1801,14 +1828,15 @@ body::before{content:'';position:fixed;top:-750px;left:50%;transform:translateX(
 .add-tag-filter .add-tag-plus{font-size:15px;line-height:1}
 .add-tag-filter:hover{background:var(--surface-raised);border-color:var(--text-dim);color:var(--text)}
 .add-tag-filter.active{background:var(--accent);border-color:var(--accent);color:#fff}
-.tag-menu{position:absolute;top:calc(100% + 6px);left:0;min-width:200px;max-width:320px;max-height:280px;overflow-y:auto;background:var(--surface-raised);border:1px solid var(--border);border-radius:var(--radius);box-shadow:0 8px 24px rgba(0,0,0,.4);z-index:20;padding:4px;display:none}
+.tag-menu{position:absolute;top:calc(100% + 6px);right:0;min-width:200px;max-width:320px;max-height:280px;overflow-y:auto;background:var(--surface-raised);border:1px solid var(--border);border-radius:var(--radius);box-shadow:0 8px 24px rgba(0,0,0,.4);z-index:20;padding:4px;display:none}
 .tag-menu:not([hidden]){display:block}
 .tag-menu-item{display:flex;align-items:center;justify-content:space-between;gap:8px;padding:6px 10px;font-size:12px;font-family:var(--mono);color:var(--text);background:transparent;border:none;border-radius:4px;cursor:pointer;text-align:left;width:100%;transition:background .15s}
 .tag-menu-item:hover{background:var(--surface)}
 .tag-menu-item .tag-menu-count{color:var(--text-muted);font-size:11px;font-family:var(--mono)}
 .tag-menu-empty{padding:8px 10px;font-size:12px;color:var(--text-muted);font-style:italic}
-.tag-menu-section{padding:8px 10px 4px;font-size:10px;font-weight:700;letter-spacing:.05em;text-transform:uppercase;color:var(--text-dim);font-family:inherit}
+.tag-menu-section{padding:8px 10px 2px;font-size:10px;font-weight:700;letter-spacing:.05em;text-transform:uppercase;color:var(--text-dim);font-family:inherit}
 .tag-menu-section:not(:first-child){margin-top:4px;border-top:1px solid var(--border)}
+.tag-menu-hint{padding:0 10px 6px;font-size:11px;line-height:1.35;color:var(--text-muted);font-family:inherit;max-width:300px}
 .active-tag-filters{display:inline-flex;align-items:center;gap:6px;flex-wrap:wrap}
 .tag-chip{display:inline-flex;align-items:center;gap:6px;background:rgba(255,127,58,.12);border:1px solid rgba(255,127,58,.3);color:var(--accent);font-size:11px;font-family:var(--mono);padding:3px 4px 3px 8px;border-radius:4px}
 .tag-chip-remove{background:transparent;border:none;color:inherit;cursor:pointer;font-size:14px;line-height:1;padding:0 4px;font-family:inherit;opacity:.7;transition:opacity .15s}
@@ -2163,7 +2191,7 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
       </label>
       <div class="tag-filter-controls" data-tag-filter-controls hidden>
         <div class="tag-filter-trigger-wrap">
-          <button class="add-tag-filter" data-add-tag-filter title="Filter by tag or diagnosis"><span class="add-tag-plus">+</span> Filter</button>
+          <button class="add-tag-filter" data-add-tag-filter title="Filter by tag, diagnosis, or triage"><span class="add-tag-plus">+</span> Filter</button>
           <div class="tag-menu" data-tag-menu hidden></div>
         </div>
         <div class="active-tag-filters" data-active-tag-filters></div>
@@ -2191,16 +2219,22 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
   //   tags    — multi-select AND; card must carry every active tag.
   //   reasons — multi-select OR;  card.data-reason must match any active reason
   //             (a card has at most one diagnosis, so AND would always be 0/1).
-  // "Clear Filters" wipes all three.
+  //   triage  — multi-select OR;  card.data-triage (a comma list) must contain
+  //             any active flag (a card can carry several triage flags).
+  // "Clear Filters" wipes all of them.
   var activeStatus=null;
   var activeTags=new Set();
   var activeReasons=new Set();
+  var activeTriage=new Set();
   var activeSearch=''; // lowercase substring match against data-search
   var allTags=[];
   var allReasons=[]; // ordered list of REASON keys present in the report
+  var allTriage=[]; // ordered list of TRIAGE keys present in the report
   var REASON_LABELS=${JSON.stringify(REASON_LABELS)};
+  var TRIAGE_LABELS=${JSON.stringify(TRIAGE_LABELS)};
   function cardTags(card){var raw=card.getAttribute('data-tags');if(!raw)return [];try{var v=JSON.parse(raw);return Array.isArray(v)?v:[]}catch(_){return []}}
+  function cardTriage(card){var raw=card.getAttribute('data-triage');return raw?raw.split(','):[]}
   // Faceted-search counts. Each filter option's badge shows "how many tests
   // would this option contribute given the rest of the filters." The semantics
@@ -2208,8 +2242,9 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
   //   - Status pills (single-select replace): ignore current activeStatus.
   //   - Tag menu items (multi-select AND):    use ALL current filters.
   //   - Reason menu items (multi-select OR):  ignore current activeReasons.
+  //   - Triage menu items (multi-select OR):  ignore current activeTriage.
   // Search is free-form and not counted.
-  function cardsMatching(ignoreStatus,ignoreTags,ignoreReasons){
+  function cardsMatching(ignoreStatus,ignoreTags,ignoreReasons,ignoreTriage){
     var out=[];
     document.querySelectorAll('.test-card').forEach(function(card){
       var statusOk=ignoreStatus||activeStatus===null||card.getAttribute('data-status')===activeStatus;
@@ -2219,19 +2254,23 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
         activeTags.forEach(function(w){if(t.indexOf(w)===-1)tagsOk=false});
       }
       var reasonOk=ignoreReasons||activeReasons.size===0||activeReasons.has(card.getAttribute('data-reason')||'');
+      var triageOk=true;
+      if(!ignoreTriage&&activeTriage.size>0){
+        var ct=cardTriage(card);triageOk=ct.some(function(k){return activeTriage.has(k)});
+      }
       var searchOk=activeSearch.length===0||(card.getAttribute('data-search')||'').indexOf(activeSearch)!==-1;
-      if(statusOk&&tagsOk&&reasonOk&&searchOk)out.push(card);
+      if(statusOk&&tagsOk&&reasonOk&&triageOk&&searchOk)out.push(card);
     });
     return out;
   }
   function tagCount(t){
-    var pool=cardsMatching(false,false,false);
+    var pool=cardsMatching(false,false,false,false);
     var n=0;for(var i=0;i<pool.length;i++){if(cardTags(pool[i]).indexOf(t)!==-1)n++}
     return n;
   }
   function applyFilters(){
-    var anyActive=activeStatus!==null||activeTags.size>0||activeReasons.size>0||activeSearch.length>0;
+    var anyActive=activeStatus!==null||activeTags.size>0||activeReasons.size>0||activeTriage.size>0||activeSearch.length>0;
     document.querySelector('.clear-filter').classList.toggle('visible',anyActive);
     var visibleTests=0;
     var visibleFiles=Object.create(null);
@@ -2247,12 +2286,16 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
         var r=card.getAttribute('data-reason')||'';
         reasonOk=activeReasons.has(r);
       }
+      var triageOk=true;
+      if(activeTriage.size>0){
+        var ct=cardTriage(card);triageOk=ct.some(function(k){return activeTriage.has(k)});
+      }
       var searchOk=true;
       if(activeSearch.length>0){
         var hay=card.getAttribute('data-search')||'';
         searchOk=hay.indexOf(activeSearch)!==-1;
       }
-      var hide=!(statusOk&&tagsOk&&reasonOk&&searchOk);
+      var hide=!(statusOk&&tagsOk&&reasonOk&&triageOk&&searchOk);
       card.classList.toggle('hidden-by-filter',hide);
       if(!hide){
         visibleTests++;
@@ -2295,6 +2338,7 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
     if(activeStatus)p.set('status',activeStatus);
     activeTags.forEach(function(t){p.append('tag',t)});
     activeReasons.forEach(function(r){p.append('reason',r)});
+    activeTriage.forEach(function(t){p.append('triage',t)});
     if(activeSearch)p.set('q',activeSearch);
     var qs=p.toString();
     var next=location.pathname+(qs?'?'+qs:'')+(location.hash||'');
@@ -2340,19 +2384,37 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
       chip.appendChild(label);chip.appendChild(btn);
       c.appendChild(chip);
     });
+    activeTriage.forEach(function(t){
+      var meta=TRIAGE_LABELS[t];if(!meta)return;
+      var chip=document.createElement('span');chip.className='tag-chip reason-chip';
+      chip.style.background=hexToRgba(meta.color,0.14);
+      chip.style.borderColor=hexToRgba(meta.color,0.4);
+      chip.style.color=meta.color;
+      var label=document.createElement('span');label.textContent=meta.label;
+      var btn=document.createElement('button');btn.className='tag-chip-remove';btn.setAttribute('data-remove-triage',t);btn.setAttribute('title','Remove filter');btn.textContent='×';
+      chip.appendChild(label);chip.appendChild(btn);
+      c.appendChild(chip);
+    });
   }
   function addTag(t){if(!t||activeTags.has(t))return;activeTags.add(t);renderActiveChips();applyFilters()}
   function removeTag(t){if(!activeTags.delete(t))return;renderActiveChips();applyFilters()}
   function addReason(r){if(!r||activeReasons.has(r))return;activeReasons.add(r);renderActiveChips();applyFilters()}
   function removeReason(r){if(!activeReasons.delete(r))return;renderActiveChips();applyFilters()}
+  function addTriage(t){if(!t||activeTriage.has(t))return;activeTriage.add(t);renderActiveChips();applyFilters()}
+  function removeTriage(t){if(!activeTriage.delete(t))return;renderActiveChips();applyFilters()}
   function reasonCount(r){
-    var pool=cardsMatching(false,false,true);
+    var pool=cardsMatching(false,false,true,false);
     var n=0;for(var i=0;i<pool.length;i++){if(pool[i].getAttribute('data-reason')===r)n++}
     return n;
   }
+  function triageCount(t){
+    var pool=cardsMatching(false,false,false,true);
+    var n=0;for(var i=0;i<pool.length;i++){if(cardTriage(pool[i]).indexOf(t)!==-1)n++}
+    return n;
+  }
   function updateStatPillCounts(){
-    var pool=cardsMatching(true,false,false);
+    var pool=cardsMatching(true,false,false,false);
     var counts=Object.create(null);
     for(var i=0;i<pool.length;i++){var s=pool[i].getAttribute('data-status');counts[s]=(counts[s]||0)+1}
     document.querySelectorAll('.stat-pill[data-filter]').forEach(function(pill){
@@ -2372,9 +2434,11 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
     // to an empty view, so they're not useful to offer.
     var tagsWithCounts=allTags.filter(function(t){return !activeTags.has(t)}).map(function(t){return {key:t,count:tagCount(t)}}).filter(function(x){return x.count>0});
     var reasonsWithCounts=allReasons.filter(function(r){return !activeReasons.has(r)}).map(function(r){return {key:r,count:reasonCount(r)}}).filter(function(x){return x.count>0});
+    var triageWithCounts=allTriage.filter(function(t){return !activeTriage.has(t)}).map(function(t){return {key:t,count:triageCount(t)}}).filter(function(x){return x.count>0});
     var added=false;
     if(allTags.length>0){
       var hT=document.createElement('div');hT.className='tag-menu-section';hT.textContent='Tags';menu.appendChild(hT);
+      var hintT=document.createElement('div');hintT.className='tag-menu-hint';hintT.textContent='Labels you put on tests in code (e.g. @smoke). Match all selected.';menu.appendChild(hintT);
       if(tagsWithCounts.length===0){
         var emptyT=document.createElement('div');emptyT.className='tag-menu-empty';emptyT.textContent=allTags.length===activeTags.size?'All tags selected':'No matching tags';menu.appendChild(emptyT);
       }else{
@@ -2390,6 +2454,7 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
     }
     if(allReasons.length>0){
       var hR=document.createElement('div');hR.className='tag-menu-section';hR.textContent='Diagnoses';menu.appendChild(hR);
+      var hintR=document.createElement('div');hintR.className='tag-menu-hint';hintR.textContent='Why a test failed — the AI\\'s single root-cause assessment.';menu.appendChild(hintR);
       if(reasonsWithCounts.length===0){
         var emptyR=document.createElement('div');emptyR.className='tag-menu-empty';emptyR.textContent=allReasons.length===activeReasons.size?'All diagnoses selected':'No matching diagnoses';menu.appendChild(emptyR);
       }else{
@@ -2404,6 +2469,23 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
       }
       added=true;
     }
+    if(allTriage.length>0){
+      var hTr=document.createElement('div');hTr.className='tag-menu-section';hTr.textContent='Triage';menu.appendChild(hTr);
+      var hintTr=document.createElement('div');hintTr.className='tag-menu-hint';hintTr.textContent='What the failure calls for — a test can need more than one.';menu.appendChild(hintTr);
+      if(triageWithCounts.length===0){
+        var emptyTr=document.createElement('div');emptyTr.className='tag-menu-empty';emptyTr.textContent=allTriage.length===activeTriage.size?'All triage flags selected':'No matching triage flags';menu.appendChild(emptyTr);
+      }else{
+        triageWithCounts.forEach(function(x){
+          var meta=TRIAGE_LABELS[x.key];if(!meta)return;
+          var item=document.createElement('button');item.className='tag-menu-item';item.setAttribute('data-triage-menu-item',x.key);
+          var label=document.createElement('span');label.textContent=meta.label;label.style.color=meta.color;
+          var count=document.createElement('span');count.className='tag-menu-count';count.textContent=x.count;
+          item.appendChild(label);item.appendChild(count);
+          menu.appendChild(item);
+        });
+      }
+      added=true;
+    }
     if(!added){
       var empty=document.createElement('div');empty.className='tag-menu-empty';empty.textContent='No filters available';menu.appendChild(empty);
     }
@@ -2421,6 +2503,7 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
     activeStatus=null;
     activeTags.clear();
     activeReasons.clear();
+    activeTriage.clear();
     activeSearch='';
     document.querySelectorAll('.stat-pill').forEach(function(p){p.classList.remove('active')});
     var searchInput=document.querySelector('[data-filter-search]');
@@ -2470,10 +2553,14 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
     if(tagItem){addTag(tagItem.getAttribute('data-tag-menu-item'));closeTagMenu();return}
     var reasonItem=e.target.closest('[data-reason-menu-item]');
     if(reasonItem){addReason(reasonItem.getAttribute('data-reason-menu-item'));closeTagMenu();return}
+    var triageItem=e.target.closest('[data-triage-menu-item]');
+    if(triageItem){addTriage(triageItem.getAttribute('data-triage-menu-item'));closeTagMenu();return}
     var tagRemove=e.target.closest('[data-remove-tag]');
     if(tagRemove){removeTag(tagRemove.getAttribute('data-remove-tag'));return}
     var reasonRemove=e.target.closest('[data-remove-reason]');
     if(reasonRemove){removeReason(reasonRemove.getAttribute('data-remove-reason'));return}
+    var triageRemove=e.target.closest('[data-remove-triage]');
+    if(triageRemove){removeTriage(triageRemove.getAttribute('data-remove-triage'));return}
     // Stat pill filter
     var pill=e.target.closest('.stat-pill[data-filter]');
     if(pill){toggleStatus(pill.getAttribute('data-filter'));return}
@@ -2523,18 +2610,22 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
   (function(){
     var seenTags=Object.create(null);
     var seenReasons=Object.create(null);
+    var seenTriage=Object.create(null);
     document.querySelectorAll('.test-card').forEach(function(card){
       var raw=card.getAttribute('data-tags');
       if(raw){try{var tags=JSON.parse(raw);if(Array.isArray(tags)){tags.forEach(function(t){if(typeof t==='string'&&t)seenTags[t]=true})}}catch(_){}}
       var r=card.getAttribute('data-reason');
       if(r)seenReasons[r]=true;
+      cardTriage(card).forEach(function(t){if(t)seenTriage[t]=true});
     });
     allTags=Object.keys(seenTags).sort();
     // Preserve the REASON_LABELS declaration order rather than alphabetical —
     // they're already arranged from most-frequent/specific to UNKNOWN catch-all.
     allReasons=Object.keys(REASON_LABELS).filter(function(r){return seenReasons[r]});
+    // Preserve TRIAGE_LABELS declaration order (retryable → code → product).
+    allTriage=Object.keys(TRIAGE_LABELS).filter(function(t){return seenTriage[t]});
     var controls=document.querySelector('[data-tag-filter-controls]');
-    if(controls&&(allTags.length>0||allReasons.length>0))controls.hidden=false;
+    if(controls&&(allTags.length>0||allReasons.length>0||allTriage.length>0))controls.hidden=false;
   })();
   // Seed filter state from ?status=...&tag=...&reason=... so shared URLs
@@ -2554,6 +2645,8 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
     p.getAll('tag').forEach(function(t){if(tagSet[t])activeTags.add(t)});
     var reasonSet={};allReasons.forEach(function(r){reasonSet[r]=true});
     p.getAll('reason').forEach(function(r){if(reasonSet[r])activeReasons.add(r)});
+    var triageSet={};allTriage.forEach(function(t){triageSet[t]=true});
+    p.getAll('triage').forEach(function(t){if(triageSet[t])activeTriage.add(t)});
     var q=p.get('q');
     var searchInput=document.querySelector('[data-filter-search]');
     if(q){
@@ -2566,8 +2659,8 @@ details.ai-invocation[open]>summary .native-step-chevron{transform:rotate(90deg)
         applyFilters();
       });
     }
-    if(activeTags.size>0||activeReasons.size>0)renderActiveChips();
-    if(activeStatus!==null||activeTags.size>0||activeReasons.size>0||activeSearch.length>0)applyFilters();
+    if(activeTags.size>0||activeReasons.size>0||activeTriage.size>0)renderActiveChips();
+    if(activeStatus!==null||activeTags.size>0||activeReasons.size>0||activeTriage.size>0||activeSearch.length>0)applyFilters();
   })();
   // Open #?testId=<id> deep links to the matching test card. Used by the

package/dist/lib/test/testExtension.js CHANGED Viewed

@@ -843,6 +843,40 @@ async function attachStepScreenshots(sharedState, testInfo) {
         contentType: 'application/json',
     });
 }
+/**
+ * Capture a live screenshot of the flow's final visual state at teardown (page
+ * still open) and persist it as a per-flow file — the single source of truth
+ * for "what the page looked like when this run ended." It is read both as the
+ * current run's failure screenshot (when this run failed) and as the baseline
+ * for a later failing run (when this run succeeded), keeping the two symmetric.
+ * See `fetchBaselineScreenshot` / `gatherTestFailureEvidence` in
+ * triageTestFailure.ts.
+ *
+ * Runs for any meaningful end state; skipped only for `skipped` tests (no real
+ * page state), when triage is disabled (`DONOBU_TRIAGE_DISABLED` is `'1'`), for
+ * V1 (legacy self-heal) tests, or when the page's `_dnb` state has no flow ID
+ * or no persistence layer.
+ * Best-effort and fails open: an error thrown while capturing or persisting is
+ * logged and not rethrown, and a falsy capture result is simply not persisted.
+ *
+ * @param page - Page whose `_dnb` state supplies the flow ID
+ *   (`donobuFlowMetadata.id`) and the `persistence` layer used to store the
+ *   screenshot.
+ * @param testInfo - Test info for the finishing test; its `status` and the
+ *   `isV1Test` check gate whether a capture is attempted.
+ * @returns A promise that resolves (with no value) once the capture attempt
+ *   has finished or been skipped.
+ */
+async function captureAndPersistFinalState(page, testInfo) {
+    if (testInfo.status === 'skipped' ||
+        process.env.DONOBU_TRIAGE_DISABLED === '1' ||
+        isV1Test(testInfo)) {
+        return;
+    }
+    const flowId = page._dnb?.donobuFlowMetadata?.id;
+    const persistence = page._dnb?.persistence;
+    if (!flowId || !persistence) {
+        return;
+    }
+    try {
+        const screenshot = await (0, triageTestFailure_1.captureLivePageScreenshot)(page);
+        if (screenshot) {
+            await persistence.setFlowFile(flowId, triageTestFailure_1.TRIAGE_PERSISTENCE_FILE_IDS.finalStateScreenshot, screenshot);
+        }
+    }
+    catch (error) {
+        Logger_1.appLogger.error(`Failed to persist final-state screenshot for flow ${flowId}.`, error);
+    }
+}
 async function finalizeTest(page, testInfo, logBuffer, videoOption) {
     const sharedState = page._dnb;
     // Kick off video persistence early in teardown. The actual file copy is
@@ -919,6 +953,10 @@ async function finalizeTest(page, testInfo, logBuffer, videoOption) {
     catch (error) {
         Logger_1.appLogger.error(`Error during cleanup for test ${testInfo.title}:`, error);
     }
+    // Capture the flow's final visual state before the status-specific handling
+    // below: triage (failed branch) reads it as the failure screenshot, and a
+    // future failing run reads a successful run's copy as its baseline.
+    await captureAndPersistFinalState(page, testInfo);
     if (testInfo.status === 'failed') {
         if (isV1Test(testInfo)) {
             if (isV1SelfHealingEnabled(testInfo) &&

package/dist/lib/test/utils/triageTestFailure.d.ts CHANGED Viewed

@@ -28,9 +28,10 @@ import type { DonobuExtendedPage } from '../../page/DonobuExtendedPage';
  *      history from the persistence layer.
  *   3. Fetches **historical runs** of the same flow (by name) from the flows manager to
  *      detect flakiness, regression patterns, and prior self-heal success.
- *   4. Captures the **failure screenshot** (last tool call screenshot from the current
- *      run) and the **baseline screenshot** (last tool call screenshot from the most
- *      recent successful historical run) for visual comparison.
+ *   4. Captures the **failure screenshot** (a live screenshot taken at triage time, while
+ *      the page is still open during teardown, so it reflects the true final state) and the
+ *      **baseline screenshot** (last tool call screenshot from the most recent successful
+ *      historical run) for visual comparison.
  *   5. Reads the source of the failing test case for contextual grounding.
  *   6. Runs the **heuristic classifier** (`deriveHeuristicAssessment`) which uses
  *      rule-based pattern matching over errors, tool calls, stale-cache indicators,
@@ -70,7 +71,7 @@ import type { DonobuExtendedPage } from '../../page/DonobuExtendedPage';
  * | Flow metadata           | `DonobuExtendedPage._dnb`     | Run mode, objective, allowed tools, timing         |
  * | Stale cache indicators  | Derived from above            | Whether page.ai cache staleness is the root cause  |
  * | Historical flow runs    | `DonobuFlowsManager.getFlows` | Flakiness, regression patterns, prior self-heal    |
- * | Failure screenshot      | Last tool call screenshot     | Visual state of the page when the failure occurred |
+ * | Failure screenshot      | Live capture at triage time   | True final visual state of the page when it failed |
  * | Baseline screenshot     | Last successful run's screenshot | Visual reference for what the page *should* look like |
  * | Test source snippet     | TypeScript AST parsing        | The test's expectations and structure               |
  *
@@ -408,6 +409,14 @@ declare const TRIAGE_PERSISTENCE_FILE_IDS: {
     readonly evidence: "triage-evidence.json";
     readonly failureScreenshot: "triage-failure-screenshot.png";
     readonly baselineScreenshot: "triage-baseline-screenshot.png";
+    /**
+     * Live screenshot of a flow's final visual state, captured at teardown while
+     * the page is still open. Persisted on successful runs so that a *later*
+     * failing run can use it as a true final-state baseline — symmetric with the
+     * failure screenshot, which is also a live end-of-test capture. Keyed per
+     * flow, like browser state.
+     */
+    readonly finalStateScreenshot: "triage-final-state-screenshot.png";
 };
 /**
  * Compresses a set of historical flow runs into an aggregate summary compact
@@ -420,6 +429,19 @@ declare function summarizeFlowHistory(flowName: string, flows: FlowMetadata[]):
  * success, and whether the page.ai cache was recently validated.
  */
 declare function deriveHistoricalSignals(history: FlowHistorySummary): HistoricalSignals;
+/**
+ * Captures a fresh screenshot of the page's current visual state. Called at
+ * teardown (failure triage and successful-run baseline capture) while the
+ * page/context is still open, so it reflects the true *end state* of the test.
+ *
+ * This is deliberately preferred over the last Donobu tool-call screenshot:
+ * Playwright `expect`/`waitFor` are not tool calls, so the last tool-call image
+ * can predate the failing assertion and capture a transient state (e.g. a
+ * loading spinner that has since resolved), which misleads the vision model.
+ * Fails open — returns null if the page is gone or unresponsive (crash, closed
+ * context, hang), in which case the caller proceeds without a screenshot.
+ */
+declare function captureLivePageScreenshot(page: DonobuExtendedPage): Promise<Buffer | null>;
 /**
  * Builds the heuristic triage assessment by combining rule-based inference,
  * contextual flags, and derived remediation guidance ahead of GPT enrichment.
@@ -432,5 +454,5 @@ declare function deriveHeuristicAssessment(testInfo: TestInfo, errorSummaries: E
 declare function reconcileTreatmentPlan(plan: z.infer<typeof TreatmentPlan>, heuristics: HeuristicAssessment): z.infer<typeof TreatmentPlan>;
 declare function gatherTestFailureEvidence(testInfo: TestInfo, page: DonobuExtendedPage, options?: GatherTestFailureEvidenceOptions): Promise<GatherTestFailureEvidenceResult | null>;
 declare function generateTreatmentPlanFromEvidence(gptClient: GptClient, evidence: FailureEvidenceRecord): Promise<z.infer<typeof TreatmentPlan>>;
-export { type AdditionalDataRequest, AdditionalDataRequestSchema, type AutomationDirectives, deriveHeuristicAssessment, deriveHistoricalSignals, type ErrorSummary, type FailureEvidenceRecord, type FailureReason, FailureReasonSchema, type FlowHistorySummary, gatherTestFailureEvidence, type GatherTestFailureEvidenceOptions, type GatherTestFailureEvidenceResult, generateTreatmentPlanFromEvidence, type HeuristicAssessment, type HistoricalFlowRun, type HistoricalSignals, reconcileTreatmentPlan, type RemediationCategory, type RemediationStep, RemediationStepSchema, type SanitizedFlowMetadata, type SummarizedToolCall, summarizeFlowHistory, TreatmentPlan, TRIAGE_PERSISTENCE_FILE_IDS, };
+export { type AdditionalDataRequest, AdditionalDataRequestSchema, type AutomationDirectives, captureLivePageScreenshot, deriveHeuristicAssessment, deriveHistoricalSignals, type ErrorSummary, type FailureEvidenceRecord, type FailureReason, FailureReasonSchema, type FlowHistorySummary, gatherTestFailureEvidence, type GatherTestFailureEvidenceOptions, type GatherTestFailureEvidenceResult, generateTreatmentPlanFromEvidence, type HeuristicAssessment, type HistoricalFlowRun, type HistoricalSignals, reconcileTreatmentPlan, type RemediationCategory, type RemediationStep, RemediationStepSchema, type SanitizedFlowMetadata, type SummarizedToolCall, summarizeFlowHistory, TreatmentPlan, TRIAGE_PERSISTENCE_FILE_IDS, };
 //# sourceMappingURL=triageTestFailure.d.ts.map