donobu 5.46.0 → 5.47.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -960,6 +960,12 @@ function applyJsonReportEnv(env, outputDir) {
960
960
  */
961
961
  function evaluateAutoHealEligibility(plans) {
962
962
  const eligiblePlans = plans.filter((record) => {
963
+ // Respect the per-project `autoHeal: false` setting here (rather than
964
+ // suppressing the diagnosis): such tests still get a treatment plan for the
965
+ // report, but are never re-run.
966
+ if (record.evidence?.failureContext?.testCase?.autoHealEnabled === false) {
967
+ return false;
968
+ }
963
969
  const directives = record.plan.automationDirectives;
964
970
  return (record.plan.shouldRetryAutomation === true &&
965
971
  directives !== undefined &&
@@ -1194,11 +1200,12 @@ async function postProcessTriageRun(context, originalPlaywrightArgs, reportPath)
1194
1200
  const testLabel = evidence.failureContext.testCase.title ??
1195
1201
  evidence.failureContext.testCase.file ??
1196
1202
  'unknown test';
1197
- if (evidence.failureContext.testCase.autoHealEnabled === false) {
1198
- Logger_1.appLogger.info(`Skipping treatment plan for "${testLabel}" — auto-heal is disabled for this test's project.`);
1199
- continue;
1200
- }
1201
- Logger_1.appLogger.info(`Detected test failure for "${testLabel}". Generating treatment plan to facilitate healing...`);
1203
+ // Generate a treatment plan (diagnosis) for every failure, independent of
1204
+ // whether auto-heal is enabled for the test's project. Triage is a
1205
+ // standalone diagnostic surfaced in the reports; the per-project
1206
+ // `autoHeal: false` setting suppresses only the auto-heal *rerun*, which is
1207
+ // enforced separately in `evaluateAutoHealEligibility`.
1208
+ Logger_1.appLogger.info(`Detected test failure for "${testLabel}". Generating treatment plan...`);
1202
1209
  const heuristicFallback = () => {
1203
1210
  const h = evidence.failureContext.heuristics;
1204
1211
  return {
@@ -1499,7 +1506,12 @@ async function relocateTemporaryAttachments(report, outputDir) {
1499
1506
  * outcome. Reads the output path from the sidecar written by the reporter
1500
1507
  * during the initial run; does nothing if the reporter wasn't configured.
1501
1508
  */
1502
- async function regenerateDonobuReports(mergedReport) {
1509
+ async function regenerateDonobuReports(mergedReport, options = {}) {
1510
+ // Slack regeneration also (re-)posts to the webhook. Callers where the Slack
1511
+ // reporter already posted during the initial run (e.g. triage without
1512
+ // auto-heal) pass `includeSlack: false` to avoid a duplicate post — the Slack
1513
+ // summary carries no triage detail, so there is nothing to refresh anyway.
1514
+ const { includeSlack = true } = options;
1503
1515
  const outputs = mergedReport.metadata?.donobuOutputs;
1504
1516
  if (!outputs) {
1505
1517
  return;
@@ -1510,7 +1522,7 @@ async function regenerateDonobuReports(mergedReport) {
1510
1522
  if (outputs.markdown?.outputFile) {
1511
1523
  await regenerateMarkdownOutput(mergedReport, outputs.markdown.outputFile);
1512
1524
  }
1513
- if (outputs.slack?.outputFile) {
1525
+ if (includeSlack && outputs.slack?.outputFile) {
1514
1526
  await regenerateSlackOutput(mergedReport, outputs.slack.outputFile);
1515
1527
  }
1516
1528
  }
@@ -1744,6 +1756,15 @@ async function runTestCommand(cliArgs) {
1744
1756
  Logger_1.appLogger.info('Running with Page.AI cache clearing enabled for this run.');
1745
1757
  }
1746
1758
  let triageEnabled = options.triageEnabled;
1759
+ // Auto-heal is built on triage: it acts on the treatment plans triage
1760
+ // produces, so it cannot function without it. If the user explicitly disabled
1761
+ // triage (--no-triage) while enabling auto-heal, honor auto-heal and
1762
+ // re-enable triage with a warning rather than running an inert auto-heal.
1763
+ if (options.autoHeal && !triageEnabled) {
1764
+ Logger_1.appLogger.warn('--no-triage was ignored because --auto-heal requires triage to produce ' +
1765
+ 'the treatment plans it acts on; continuing with triage enabled.');
1766
+ triageEnabled = true;
1767
+ }
1747
1768
  let triageContext = null;
1748
1769
  if (triageEnabled) {
1749
1770
  try {
@@ -1793,19 +1814,18 @@ async function runTestCommand(cliArgs) {
1793
1814
  const { userHadJson, jsonOutputFile } = jsonReporterInfo;
1794
1815
  let generatedPlans = [];
1795
1816
  try {
1796
- if (exitCode === 0 || !effectiveOptions.autoHeal) {
1797
- // Auto-heal wasn't attempted (either tests passed or the user disabled
1798
- // it). If a Slack reporter deferred its POST waiting for us, deliver the
1799
- // initial run's payload now — nothing further will be re-rendered.
1817
+ // Tests passed — nothing to triage or heal. If a Slack reporter deferred
1818
+ // its POST for us (auto-heal mode), deliver the initial run's payload now.
1819
+ if (exitCode === 0) {
1800
1820
  if (effectiveOptions.autoHeal) {
1801
1821
  await postDeferredSlackFromInitialRun(playwrightOutputDir);
1802
1822
  }
1803
1823
  return exitCode;
1804
1824
  }
1805
- // Tests failed and auto-heal is enabled — load the initial report into
1806
- // memory so we can merge it with the heal-run report later. Prefer the
1807
- // Donobu reporter's state file (carries Donobu-specific metadata like the
1808
- // HTML output path); fall back to reading the user's JSON file directly.
1825
+ // Tests failed. Load the initial report so we can merge it with an auto-heal
1826
+ // rerun and/or regenerate reports enriched with triage. Prefer the Donobu
1827
+ // reporter's state file (carries Donobu-specific metadata like output
1828
+ // paths); fall back to the user's JSON file.
1809
1829
  let initialDonobuReport = null;
1810
1830
  if (triageEnabled) {
1811
1831
  const stateReport = await readJsonIfExists(path.join(playwrightOutputDir, model_1.DONOBU_REPORT_STATE_FILENAME));
@@ -1815,28 +1835,64 @@ async function runTestCommand(cliArgs) {
1815
1835
  ? await readJsonIfExists(jsonOutputFile)
1816
1836
  : null);
1817
1837
  }
1838
+ // Triage (evidence → treatment plans) runs on failures whenever enabled,
1839
+ // independent of auto-heal. Auto-heal consumes these plans; standalone
1840
+ // triage surfaces the diagnosis in the reports for humans.
1818
1841
  if (triageEnabled && triageContext) {
1819
1842
  generatedPlans = await postProcessTriageRun(triageContext, playwrightArgs, jsonOutputFile ?? undefined);
1820
1843
  }
1821
- const autoHealOutcome = await attemptAutoHealRun({
1822
- options: effectiveOptions,
1823
- playwrightArgs,
1824
- playwrightOutputDir,
1825
- generatedPlans,
1826
- currentExitCode: exitCode,
1827
- initialReport: initialDonobuReport,
1828
- initialReportSourcePath: jsonOutputFile ?? undefined,
1829
- triageRunDir: triageContext?.runDir,
1830
- userJsonOutputFile: userHadJson ? jsonOutputFile : null,
1831
- });
1832
- // When auto-heal was eligible-checked but didn't actually run a rerun (no
1833
- // actionable directives), nothing downstream re-renders the Slack payload —
1834
- // deliver the pre-heal payload now so we honor the "one post per run"
1835
- // guarantee the reporter was deferring for.
1836
- if (!autoHealOutcome.attempted) {
1837
- await postDeferredSlackFromInitialRun(playwrightOutputDir);
1838
- }
1839
- return autoHealOutcome.exitCode;
1844
+ if (effectiveOptions.autoHeal) {
1845
+ const autoHealOutcome = await attemptAutoHealRun({
1846
+ options: effectiveOptions,
1847
+ playwrightArgs,
1848
+ playwrightOutputDir,
1849
+ generatedPlans,
1850
+ currentExitCode: exitCode,
1851
+ initialReport: initialDonobuReport,
1852
+ initialReportSourcePath: jsonOutputFile ?? undefined,
1853
+ triageRunDir: triageContext?.runDir,
1854
+ userJsonOutputFile: userHadJson ? jsonOutputFile : null,
1855
+ });
1856
+ // When auto-heal was eligible-checked but didn't actually run a rerun (no
1857
+ // actionable directives — e.g. triage classified the failures as
1858
+ // application/product defects), nothing downstream re-renders the reports.
1859
+ if (!autoHealOutcome.attempted) {
1860
+ // The initial reports were rendered before triage finished, so they
1861
+ // carry no triage analysis. Regenerate them from the initial report
1862
+ // enriched with the triage run dir so the diagnosis surfaces. The
1863
+ // regeneration also performs the single authoritative Slack post the
1864
+ // reporter deferred for.
1865
+ const triageRunDir = triageContext?.runDir;
1866
+ const enrichedInitialReport = triageRunDir && initialDonobuReport?.metadata?.donobuOutputs
1867
+ ? {
1868
+ ...initialDonobuReport,
1869
+ metadata: { ...initialDonobuReport.metadata, triageRunDir },
1870
+ }
1871
+ : null;
1872
+ if (enrichedInitialReport) {
1873
+ await regenerateDonobuReports(enrichedInitialReport);
1874
+ }
1875
+ else {
1876
+ // No triage data to inject (no Donobu reporter configured). Deliver
1877
+ // the pre-heal Slack payload so we honor the "one post per run"
1878
+ // guarantee the reporter was deferring for.
1879
+ await postDeferredSlackFromInitialRun(playwrightOutputDir);
1880
+ }
1881
+ }
1882
+ return autoHealOutcome.exitCode;
1883
+ }
1884
+ // Auto-heal disabled: no rerun, and the Slack reporter already posted during
1885
+ // the initial run (its POST was not deferred). Regenerate the HTML/Markdown
1886
+ // reports so the triage analysis is visible, but leave Slack untouched to
1887
+ // avoid a duplicate post (the Slack summary carries no triage detail).
1888
+ const triageRunDir = triageContext?.runDir;
1889
+ if (triageRunDir && initialDonobuReport?.metadata?.donobuOutputs) {
1890
+ await regenerateDonobuReports({
1891
+ ...initialDonobuReport,
1892
+ metadata: { ...initialDonobuReport.metadata, triageRunDir },
1893
+ }, { includeSlack: false });
1894
+ }
1895
+ return exitCode;
1840
1896
  }
1841
1897
  finally {
1842
1898
  // Clean up the JSON artifacts Donobu created for its own internal use.
@@ -960,6 +960,12 @@ function applyJsonReportEnv(env, outputDir) {
960
960
  */
961
961
  function evaluateAutoHealEligibility(plans) {
962
962
  const eligiblePlans = plans.filter((record) => {
963
+ // Respect the per-project `autoHeal: false` setting here (rather than
964
+ // suppressing the diagnosis): such tests still get a treatment plan for the
965
+ // report, but are never re-run.
966
+ if (record.evidence?.failureContext?.testCase?.autoHealEnabled === false) {
967
+ return false;
968
+ }
963
969
  const directives = record.plan.automationDirectives;
964
970
  return (record.plan.shouldRetryAutomation === true &&
965
971
  directives !== undefined &&
@@ -1194,11 +1200,12 @@ async function postProcessTriageRun(context, originalPlaywrightArgs, reportPath)
1194
1200
  const testLabel = evidence.failureContext.testCase.title ??
1195
1201
  evidence.failureContext.testCase.file ??
1196
1202
  'unknown test';
1197
- if (evidence.failureContext.testCase.autoHealEnabled === false) {
1198
- Logger_1.appLogger.info(`Skipping treatment plan for "${testLabel}" — auto-heal is disabled for this test's project.`);
1199
- continue;
1200
- }
1201
- Logger_1.appLogger.info(`Detected test failure for "${testLabel}". Generating treatment plan to facilitate healing...`);
1203
+ // Generate a treatment plan (diagnosis) for every failure, independent of
1204
+ // whether auto-heal is enabled for the test's project. Triage is a
1205
+ // standalone diagnostic surfaced in the reports; the per-project
1206
+ // `autoHeal: false` setting suppresses only the auto-heal *rerun*, which is
1207
+ // enforced separately in `evaluateAutoHealEligibility`.
1208
+ Logger_1.appLogger.info(`Detected test failure for "${testLabel}". Generating treatment plan...`);
1202
1209
  const heuristicFallback = () => {
1203
1210
  const h = evidence.failureContext.heuristics;
1204
1211
  return {
@@ -1499,7 +1506,12 @@ async function relocateTemporaryAttachments(report, outputDir) {
1499
1506
  * outcome. Reads the output path from the sidecar written by the reporter
1500
1507
  * during the initial run; does nothing if the reporter wasn't configured.
1501
1508
  */
1502
- async function regenerateDonobuReports(mergedReport) {
1509
+ async function regenerateDonobuReports(mergedReport, options = {}) {
1510
+ // Slack regeneration also (re-)posts to the webhook. Callers where the Slack
1511
+ // reporter already posted during the initial run (e.g. triage without
1512
+ // auto-heal) pass `includeSlack: false` to avoid a duplicate post — the Slack
1513
+ // summary carries no triage detail, so there is nothing to refresh anyway.
1514
+ const { includeSlack = true } = options;
1503
1515
  const outputs = mergedReport.metadata?.donobuOutputs;
1504
1516
  if (!outputs) {
1505
1517
  return;
@@ -1510,7 +1522,7 @@ async function regenerateDonobuReports(mergedReport) {
1510
1522
  if (outputs.markdown?.outputFile) {
1511
1523
  await regenerateMarkdownOutput(mergedReport, outputs.markdown.outputFile);
1512
1524
  }
1513
- if (outputs.slack?.outputFile) {
1525
+ if (includeSlack && outputs.slack?.outputFile) {
1514
1526
  await regenerateSlackOutput(mergedReport, outputs.slack.outputFile);
1515
1527
  }
1516
1528
  }
@@ -1744,6 +1756,15 @@ async function runTestCommand(cliArgs) {
1744
1756
  Logger_1.appLogger.info('Running with Page.AI cache clearing enabled for this run.');
1745
1757
  }
1746
1758
  let triageEnabled = options.triageEnabled;
1759
+ // Auto-heal is built on triage: it acts on the treatment plans triage
1760
+ // produces, so it cannot function without it. If the user explicitly disabled
1761
+ // triage (--no-triage) while enabling auto-heal, honor auto-heal and
1762
+ // re-enable triage with a warning rather than running an inert auto-heal.
1763
+ if (options.autoHeal && !triageEnabled) {
1764
+ Logger_1.appLogger.warn('--no-triage was ignored because --auto-heal requires triage to produce ' +
1765
+ 'the treatment plans it acts on; continuing with triage enabled.');
1766
+ triageEnabled = true;
1767
+ }
1747
1768
  let triageContext = null;
1748
1769
  if (triageEnabled) {
1749
1770
  try {
@@ -1793,19 +1814,18 @@ async function runTestCommand(cliArgs) {
1793
1814
  const { userHadJson, jsonOutputFile } = jsonReporterInfo;
1794
1815
  let generatedPlans = [];
1795
1816
  try {
1796
- if (exitCode === 0 || !effectiveOptions.autoHeal) {
1797
- // Auto-heal wasn't attempted (either tests passed or the user disabled
1798
- // it). If a Slack reporter deferred its POST waiting for us, deliver the
1799
- // initial run's payload now — nothing further will be re-rendered.
1817
+ // Tests passed — nothing to triage or heal. If a Slack reporter deferred
1818
+ // its POST for us (auto-heal mode), deliver the initial run's payload now.
1819
+ if (exitCode === 0) {
1800
1820
  if (effectiveOptions.autoHeal) {
1801
1821
  await postDeferredSlackFromInitialRun(playwrightOutputDir);
1802
1822
  }
1803
1823
  return exitCode;
1804
1824
  }
1805
- // Tests failed and auto-heal is enabled — load the initial report into
1806
- // memory so we can merge it with the heal-run report later. Prefer the
1807
- // Donobu reporter's state file (carries Donobu-specific metadata like the
1808
- // HTML output path); fall back to reading the user's JSON file directly.
1825
+ // Tests failed. Load the initial report so we can merge it with an auto-heal
1826
+ // rerun and/or regenerate reports enriched with triage. Prefer the Donobu
1827
+ // reporter's state file (carries Donobu-specific metadata like output
1828
+ // paths); fall back to the user's JSON file.
1809
1829
  let initialDonobuReport = null;
1810
1830
  if (triageEnabled) {
1811
1831
  const stateReport = await readJsonIfExists(path.join(playwrightOutputDir, model_1.DONOBU_REPORT_STATE_FILENAME));
@@ -1815,28 +1835,64 @@ async function runTestCommand(cliArgs) {
1815
1835
  ? await readJsonIfExists(jsonOutputFile)
1816
1836
  : null);
1817
1837
  }
1838
+ // Triage (evidence → treatment plans) runs on failures whenever enabled,
1839
+ // independent of auto-heal. Auto-heal consumes these plans; standalone
1840
+ // triage surfaces the diagnosis in the reports for humans.
1818
1841
  if (triageEnabled && triageContext) {
1819
1842
  generatedPlans = await postProcessTriageRun(triageContext, playwrightArgs, jsonOutputFile ?? undefined);
1820
1843
  }
1821
- const autoHealOutcome = await attemptAutoHealRun({
1822
- options: effectiveOptions,
1823
- playwrightArgs,
1824
- playwrightOutputDir,
1825
- generatedPlans,
1826
- currentExitCode: exitCode,
1827
- initialReport: initialDonobuReport,
1828
- initialReportSourcePath: jsonOutputFile ?? undefined,
1829
- triageRunDir: triageContext?.runDir,
1830
- userJsonOutputFile: userHadJson ? jsonOutputFile : null,
1831
- });
1832
- // When auto-heal was eligible-checked but didn't actually run a rerun (no
1833
- // actionable directives), nothing downstream re-renders the Slack payload —
1834
- // deliver the pre-heal payload now so we honor the "one post per run"
1835
- // guarantee the reporter was deferring for.
1836
- if (!autoHealOutcome.attempted) {
1837
- await postDeferredSlackFromInitialRun(playwrightOutputDir);
1838
- }
1839
- return autoHealOutcome.exitCode;
1844
+ if (effectiveOptions.autoHeal) {
1845
+ const autoHealOutcome = await attemptAutoHealRun({
1846
+ options: effectiveOptions,
1847
+ playwrightArgs,
1848
+ playwrightOutputDir,
1849
+ generatedPlans,
1850
+ currentExitCode: exitCode,
1851
+ initialReport: initialDonobuReport,
1852
+ initialReportSourcePath: jsonOutputFile ?? undefined,
1853
+ triageRunDir: triageContext?.runDir,
1854
+ userJsonOutputFile: userHadJson ? jsonOutputFile : null,
1855
+ });
1856
+ // When auto-heal was eligible-checked but didn't actually run a rerun (no
1857
+ // actionable directives — e.g. triage classified the failures as
1858
+ // application/product defects), nothing downstream re-renders the reports.
1859
+ if (!autoHealOutcome.attempted) {
1860
+ // The initial reports were rendered before triage finished, so they
1861
+ // carry no triage analysis. Regenerate them from the initial report
1862
+ // enriched with the triage run dir so the diagnosis surfaces. The
1863
+ // regeneration also performs the single authoritative Slack post the
1864
+ // reporter deferred for.
1865
+ const triageRunDir = triageContext?.runDir;
1866
+ const enrichedInitialReport = triageRunDir && initialDonobuReport?.metadata?.donobuOutputs
1867
+ ? {
1868
+ ...initialDonobuReport,
1869
+ metadata: { ...initialDonobuReport.metadata, triageRunDir },
1870
+ }
1871
+ : null;
1872
+ if (enrichedInitialReport) {
1873
+ await regenerateDonobuReports(enrichedInitialReport);
1874
+ }
1875
+ else {
1876
+ // No triage data to inject (no Donobu reporter configured). Deliver
1877
+ // the pre-heal Slack payload so we honor the "one post per run"
1878
+ // guarantee the reporter was deferring for.
1879
+ await postDeferredSlackFromInitialRun(playwrightOutputDir);
1880
+ }
1881
+ }
1882
+ return autoHealOutcome.exitCode;
1883
+ }
1884
+ // Auto-heal disabled: no rerun, and the Slack reporter already posted during
1885
+ // the initial run (its POST was not deferred). Regenerate the HTML/Markdown
1886
+ // reports so the triage analysis is visible, but leave Slack untouched to
1887
+ // avoid a duplicate post (the Slack summary carries no triage detail).
1888
+ const triageRunDir = triageContext?.runDir;
1889
+ if (triageRunDir && initialDonobuReport?.metadata?.donobuOutputs) {
1890
+ await regenerateDonobuReports({
1891
+ ...initialDonobuReport,
1892
+ metadata: { ...initialDonobuReport.metadata, triageRunDir },
1893
+ }, { includeSlack: false });
1894
+ }
1895
+ return exitCode;
1840
1896
  }
1841
1897
  finally {
1842
1898
  // Clean up the JSON artifacts Donobu created for its own internal use.
@@ -843,6 +843,40 @@ async function attachStepScreenshots(sharedState, testInfo) {
843
843
  contentType: 'application/json',
844
844
  });
845
845
  }
846
+ /**
847
+ * Capture a live screenshot of the flow's final visual state at teardown (page
848
+ * still open) and persist it as a per-flow file — the single source of truth
849
+ * for "what the page looked like when this run ended." It is read both as the
850
+ * current run's failure screenshot (when this run failed) and as the baseline
851
+ * for a later failing run (when this run succeeded), keeping the two symmetric.
852
+ * See `fetchBaselineScreenshot` / `gatherTestFailureEvidence` in
853
+ * triageTestFailure.ts.
854
+ *
855
+ * Runs for any meaningful end state; skipped only for `skipped` tests (no real
856
+ * page state), when triage is disabled, or for V1 (legacy self-heal) tests.
857
+ * Best-effort and fails open.
858
+ */
859
+ async function captureAndPersistFinalState(page, testInfo) {
860
+ if (testInfo.status === 'skipped' ||
861
+ process.env.DONOBU_TRIAGE_DISABLED === '1' ||
862
+ isV1Test(testInfo)) {
863
+ return;
864
+ }
865
+ const flowId = page._dnb?.donobuFlowMetadata?.id;
866
+ const persistence = page._dnb?.persistence;
867
+ if (!flowId || !persistence) {
868
+ return;
869
+ }
870
+ try {
871
+ const screenshot = await (0, triageTestFailure_1.captureLivePageScreenshot)(page);
872
+ if (screenshot) {
873
+ await persistence.setFlowFile(flowId, triageTestFailure_1.TRIAGE_PERSISTENCE_FILE_IDS.finalStateScreenshot, screenshot);
874
+ }
875
+ }
876
+ catch (error) {
877
+ Logger_1.appLogger.error(`Failed to persist final-state screenshot for flow ${flowId}.`, error);
878
+ }
879
+ }
846
880
  async function finalizeTest(page, testInfo, logBuffer, videoOption) {
847
881
  const sharedState = page._dnb;
848
882
  // Kick off video persistence early in teardown. The actual file copy is
@@ -919,6 +953,10 @@ async function finalizeTest(page, testInfo, logBuffer, videoOption) {
919
953
  catch (error) {
920
954
  Logger_1.appLogger.error(`Error during cleanup for test ${testInfo.title}:`, error);
921
955
  }
956
+ // Capture the flow's final visual state before the status-specific handling
957
+ // below: triage (failed branch) reads it as the failure screenshot, and a
958
+ // future failing run reads a successful run's copy as its baseline.
959
+ await captureAndPersistFinalState(page, testInfo);
922
960
  if (testInfo.status === 'failed') {
923
961
  if (isV1Test(testInfo)) {
924
962
  if (isV1SelfHealingEnabled(testInfo) &&
@@ -28,9 +28,10 @@ import type { DonobuExtendedPage } from '../../page/DonobuExtendedPage';
28
28
  * history from the persistence layer.
29
29
  * 3. Fetches **historical runs** of the same flow (by name) from the flows manager to
30
30
  * detect flakiness, regression patterns, and prior self-heal success.
31
- * 4. Captures the **failure screenshot** (last tool call screenshot from the current
32
- * run) and the **baseline screenshot** (last tool call screenshot from the most
33
- * recent successful historical run) for visual comparison.
31
+ * 4. Captures the **failure screenshot** (a live screenshot taken at triage time, while
32
+ * the page is still open during teardown, so it reflects the true final state) and the
33
+ * **baseline screenshot** (last tool call screenshot from the most recent successful
34
+ * historical run) for visual comparison.
34
35
  * 5. Reads the source of the failing test case for contextual grounding.
35
36
  * 6. Runs the **heuristic classifier** (`deriveHeuristicAssessment`) which uses
36
37
  * rule-based pattern matching over errors, tool calls, stale-cache indicators,
@@ -70,7 +71,7 @@ import type { DonobuExtendedPage } from '../../page/DonobuExtendedPage';
70
71
  * | Flow metadata | `DonobuExtendedPage._dnb` | Run mode, objective, allowed tools, timing |
71
72
  * | Stale cache indicators | Derived from above | Whether page.ai cache staleness is the root cause |
72
73
  * | Historical flow runs | `DonobuFlowsManager.getFlows` | Flakiness, regression patterns, prior self-heal |
73
- * | Failure screenshot | Last tool call screenshot | Visual state of the page when the failure occurred |
74
+ * | Failure screenshot | Live capture at triage time | True final visual state of the page when it failed |
74
75
  * | Baseline screenshot | Last successful run's screenshot | Visual reference for what the page *should* look like |
75
76
  * | Test source snippet | TypeScript AST parsing | The test's expectations and structure |
76
77
  *
@@ -408,6 +409,14 @@ declare const TRIAGE_PERSISTENCE_FILE_IDS: {
408
409
  readonly evidence: "triage-evidence.json";
409
410
  readonly failureScreenshot: "triage-failure-screenshot.png";
410
411
  readonly baselineScreenshot: "triage-baseline-screenshot.png";
412
+ /**
413
+ * Live screenshot of a flow's final visual state, captured at teardown while
414
+ * the page is still open. Persisted on successful runs so that a *later*
415
+ * failing run can use it as a true final-state baseline — symmetric with the
416
+ * failure screenshot, which is also a live end-of-test capture. Keyed per
417
+ * flow, like browser state.
418
+ */
419
+ readonly finalStateScreenshot: "triage-final-state-screenshot.png";
411
420
  };
412
421
  /**
413
422
  * Compresses a set of historical flow runs into an aggregate summary compact
@@ -420,6 +429,19 @@ declare function summarizeFlowHistory(flowName: string, flows: FlowMetadata[]):
420
429
  * success, and whether the page.ai cache was recently validated.
421
430
  */
422
431
  declare function deriveHistoricalSignals(history: FlowHistorySummary): HistoricalSignals;
432
+ /**
433
+ * Captures a fresh screenshot of the page's current visual state. Called at
434
+ * teardown (failure triage and successful-run baseline capture) while the
435
+ * page/context is still open, so it reflects the true *end state* of the test.
436
+ *
437
+ * This is deliberately preferred over the last Donobu tool-call screenshot:
438
+ * Playwright `expect`/`waitFor` are not tool calls, so the last tool-call image
439
+ * can predate the failing assertion and capture a transient state (e.g. a
440
+ * loading spinner that has since resolved), which misleads the vision model.
441
+ * Fails open — returns null if the page is gone or unresponsive (crash, closed
442
+ * context, hang), in which case the caller proceeds without a screenshot.
443
+ */
444
+ declare function captureLivePageScreenshot(page: DonobuExtendedPage): Promise<Buffer | null>;
423
445
  /**
424
446
  * Builds the heuristic triage assessment by combining rule-based inference,
425
447
  * contextual flags, and derived remediation guidance ahead of GPT enrichment.
@@ -432,5 +454,5 @@ declare function deriveHeuristicAssessment(testInfo: TestInfo, errorSummaries: E
432
454
  declare function reconcileTreatmentPlan(plan: z.infer<typeof TreatmentPlan>, heuristics: HeuristicAssessment): z.infer<typeof TreatmentPlan>;
433
455
  declare function gatherTestFailureEvidence(testInfo: TestInfo, page: DonobuExtendedPage, options?: GatherTestFailureEvidenceOptions): Promise<GatherTestFailureEvidenceResult | null>;
434
456
  declare function generateTreatmentPlanFromEvidence(gptClient: GptClient, evidence: FailureEvidenceRecord): Promise<z.infer<typeof TreatmentPlan>>;
435
- export { type AdditionalDataRequest, AdditionalDataRequestSchema, type AutomationDirectives, deriveHeuristicAssessment, deriveHistoricalSignals, type ErrorSummary, type FailureEvidenceRecord, type FailureReason, FailureReasonSchema, type FlowHistorySummary, gatherTestFailureEvidence, type GatherTestFailureEvidenceOptions, type GatherTestFailureEvidenceResult, generateTreatmentPlanFromEvidence, type HeuristicAssessment, type HistoricalFlowRun, type HistoricalSignals, reconcileTreatmentPlan, type RemediationCategory, type RemediationStep, RemediationStepSchema, type SanitizedFlowMetadata, type SummarizedToolCall, summarizeFlowHistory, TreatmentPlan, TRIAGE_PERSISTENCE_FILE_IDS, };
457
+ export { type AdditionalDataRequest, AdditionalDataRequestSchema, type AutomationDirectives, captureLivePageScreenshot, deriveHeuristicAssessment, deriveHistoricalSignals, type ErrorSummary, type FailureEvidenceRecord, type FailureReason, FailureReasonSchema, type FlowHistorySummary, gatherTestFailureEvidence, type GatherTestFailureEvidenceOptions, type GatherTestFailureEvidenceResult, generateTreatmentPlanFromEvidence, type HeuristicAssessment, type HistoricalFlowRun, type HistoricalSignals, reconcileTreatmentPlan, type RemediationCategory, type RemediationStep, RemediationStepSchema, type SanitizedFlowMetadata, type SummarizedToolCall, summarizeFlowHistory, TreatmentPlan, TRIAGE_PERSISTENCE_FILE_IDS, };
436
458
  //# sourceMappingURL=triageTestFailure.d.ts.map
@@ -37,6 +37,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
37
37
  };
38
38
  Object.defineProperty(exports, "__esModule", { value: true });
39
39
  exports.TRIAGE_PERSISTENCE_FILE_IDS = exports.TreatmentPlan = exports.RemediationStepSchema = exports.FailureReasonSchema = exports.AdditionalDataRequestSchema = void 0;
40
+ exports.captureLivePageScreenshot = captureLivePageScreenshot;
40
41
  exports.deriveHeuristicAssessment = deriveHeuristicAssessment;
41
42
  exports.deriveHistoricalSignals = deriveHistoricalSignals;
42
43
  exports.gatherTestFailureEvidence = gatherTestFailureEvidence;
@@ -79,9 +80,10 @@ const cacheLocator_1 = require("../../ai/cache/cacheLocator");
79
80
  * history from the persistence layer.
80
81
  * 3. Fetches **historical runs** of the same flow (by name) from the flows manager to
81
82
  * detect flakiness, regression patterns, and prior self-heal success.
82
- * 4. Captures the **failure screenshot** (last tool call screenshot from the current
83
- * run) and the **baseline screenshot** (last tool call screenshot from the most
84
- * recent successful historical run) for visual comparison.
83
+ * 4. Captures the **failure screenshot** (a live screenshot taken at triage time, while
84
+ * the page is still open during teardown, so it reflects the true final state) and the
85
+ * **baseline screenshot** (last tool call screenshot from the most recent successful
86
+ * historical run) for visual comparison.
85
87
  * 5. Reads the source of the failing test case for contextual grounding.
86
88
  * 6. Runs the **heuristic classifier** (`deriveHeuristicAssessment`) which uses
87
89
  * rule-based pattern matching over errors, tool calls, stale-cache indicators,
@@ -121,7 +123,7 @@ const cacheLocator_1 = require("../../ai/cache/cacheLocator");
121
123
  * | Flow metadata | `DonobuExtendedPage._dnb` | Run mode, objective, allowed tools, timing |
122
124
  * | Stale cache indicators | Derived from above | Whether page.ai cache staleness is the root cause |
123
125
  * | Historical flow runs | `DonobuFlowsManager.getFlows` | Flakiness, regression patterns, prior self-heal |
124
- * | Failure screenshot | Last tool call screenshot | Visual state of the page when the failure occurred |
126
+ * | Failure screenshot | Live capture at triage time | True final visual state of the page when it failed |
125
127
  * | Baseline screenshot | Last successful run's screenshot | Visual reference for what the page *should* look like |
126
128
  * | Test source snippet | TypeScript AST parsing | The test's expectations and structure |
127
129
  *
@@ -331,6 +333,14 @@ const TRIAGE_PERSISTENCE_FILE_IDS = {
331
333
  evidence: 'triage-evidence.json',
332
334
  failureScreenshot: 'triage-failure-screenshot.png',
333
335
  baselineScreenshot: 'triage-baseline-screenshot.png',
336
+ /**
337
+ * Live screenshot of a flow's final visual state, captured at teardown while
338
+ * the page is still open. Persisted on successful runs so that a *later*
339
+ * failing run can use it as a true final-state baseline — symmetric with the
340
+ * failure screenshot, which is also a live end-of-test capture. Keyed per
341
+ * flow, like browser state.
342
+ */
343
+ finalStateScreenshot: 'triage-final-state-screenshot.png',
334
344
  };
335
345
  exports.TRIAGE_PERSISTENCE_FILE_IDS = TRIAGE_PERSISTENCE_FILE_IDS;
336
346
  /**
@@ -554,41 +564,59 @@ async function fetchFlowHistory(page) {
554
564
  }
555
565
  }
556
566
  /**
557
- * Retrieves the screenshot from the last completed tool call in the current flow.
558
- * Returns the raw PNG/JPEG buffer if available, or null. Fails open so triage
559
- * proceeds even if the screenshot cannot be loaded.
567
+ * Captures a fresh screenshot of the page's current visual state. Called at
568
+ * teardown (failure triage and successful-run baseline capture) while the
569
+ * page/context is still open, so it reflects the true *end state* of the test.
570
+ *
571
+ * This is deliberately preferred over the last Donobu tool-call screenshot:
572
+ * Playwright `expect`/`waitFor` are not tool calls, so the last tool-call image
573
+ * can predate the failing assertion and capture a transient state (e.g. a
574
+ * loading spinner that has since resolved), which misleads the vision model.
575
+ * Fails open — returns null if the page is gone or unresponsive (crash, closed
576
+ * context, hang), in which case the caller proceeds without a screenshot.
560
577
  */
561
- async function fetchLastToolCallScreenshot(page) {
562
- const flowId = page._dnb?.donobuFlowMetadata?.id;
563
- const persistence = page._dnb?.persistence;
564
- if (!flowId || !persistence) {
565
- return null;
566
- }
578
+ async function captureLivePageScreenshot(page) {
567
579
  try {
568
- const toolCalls = await persistence.getToolCalls(flowId);
569
- if (toolCalls.length === 0) {
570
- return null;
571
- }
572
- // Walk backwards to find the last tool call with a screenshot
573
- for (let i = toolCalls.length - 1; i >= 0; i--) {
574
- const screenshotId = toolCalls[i].postCallImageId;
575
- if (screenshotId) {
576
- return await persistence.getScreenShot(flowId, screenshotId);
577
- }
578
- }
579
- return null;
580
+ return await page.screenshot({ animations: 'disabled', timeout: 10000 });
580
581
  }
581
582
  catch (error) {
582
- Logger_1.appLogger.debug(`Failed to fetch last tool call screenshot for flow ${flowId}.`, error);
583
+ Logger_1.appLogger.debug('Failed to capture live page screenshot; proceeding without it.', error);
583
584
  return null;
584
585
  }
585
586
  }
586
587
  /**
587
- * Loads the final screenshot from a historical successful run to serve as a
588
- * visual baseline for comparison with the current failure state. This enables
589
- * the GPT triage agent to detect page redesigns and stale cache scenarios by
590
- * comparing "what the page looked like when it last worked" vs "what it looks
591
- * like now." Fails open — returns null if the screenshot cannot be retrieved.
588
+ * The failure screenshot for the current run. Prefers the final-state
589
+ * screenshot persisted at teardown (the single source of truth shared with
590
+ * baselines), and falls back to a live capture when it is missing — e.g. triage
591
+ * invoked outside the standard teardown, or the teardown capture failed.
592
+ */
593
+ async function fetchCurrentRunFinalStateScreenshot(page) {
594
+ const flowId = page._dnb?.donobuFlowMetadata?.id;
595
+ const persistence = page._dnb?.persistence;
596
+ if (flowId && persistence) {
597
+ try {
598
+ const persisted = await persistence.getFlowFile(flowId, TRIAGE_PERSISTENCE_FILE_IDS.finalStateScreenshot);
599
+ if (persisted) {
600
+ return persisted;
601
+ }
602
+ }
603
+ catch (error) {
604
+ Logger_1.appLogger.debug(`Failed to read persisted final-state screenshot for flow ${flowId}; falling back to a live capture.`, error);
605
+ }
606
+ }
607
+ return captureLivePageScreenshot(page);
608
+ }
609
+ /**
610
+ * Loads a baseline screenshot from a historical successful run so the GPT
611
+ * triage agent can compare "what the page looked like when it last worked" vs
612
+ * "what it looks like now" to detect redesigns and stale-cache scenarios.
613
+ *
614
+ * Prefers the persisted final-state screenshot (a live end-of-test capture
615
+ * written on successful runs) so the baseline is symmetric with the live
616
+ * failure screenshot — both true end states. Falls back to the last tool-call
617
+ * image for runs that predate final-state capture; that image can be a
618
+ * mid-flow frame, so callers should treat such baselines as approximate.
619
+ * Fails open — returns null if no screenshot can be retrieved.
592
620
  */
593
621
  async function fetchBaselineScreenshot(page, historicalFlowId) {
594
622
  const persistence = page._dnb?.persistence;
@@ -596,10 +624,12 @@ async function fetchBaselineScreenshot(page, historicalFlowId) {
596
624
  return null;
597
625
  }
598
626
  try {
599
- const toolCalls = await persistence.getToolCalls(historicalFlowId);
600
- if (toolCalls.length === 0) {
601
- return null;
627
+ const finalState = await persistence.getFlowFile(historicalFlowId, TRIAGE_PERSISTENCE_FILE_IDS.finalStateScreenshot);
628
+ if (finalState) {
629
+ return finalState;
602
630
  }
631
+ // Fallback for runs predating final-state capture: last tool-call image.
632
+ const toolCalls = await persistence.getToolCalls(historicalFlowId);
603
633
  for (let i = toolCalls.length - 1; i >= 0; i--) {
604
634
  const screenshotId = toolCalls[i].postCallImageId;
605
635
  if (screenshotId) {
@@ -1601,7 +1631,7 @@ async function gatherTestFailureEvidence(testInfo, page, options = {}) {
1601
1631
  // Capture screenshots for visual triage: current failure + baseline from last success
1602
1632
  const lastSuccessfulRunId = failureContext.flowHistory?.lastSuccessfulRunId ?? null;
1603
1633
  const [screenshotBuffer, baselineBuffer] = await Promise.all([
1604
- fetchLastToolCallScreenshot(page),
1634
+ fetchCurrentRunFinalStateScreenshot(page),
1605
1635
  lastSuccessfulRunId
1606
1636
  ? fetchBaselineScreenshot(page, lastSuccessfulRunId)
1607
1637
  : Promise.resolve(null),
@@ -1805,10 +1835,20 @@ passed to each tool invocation. Use these to improve diagnosis:
1805
1835
 
1806
1836
  SCREENSHOT EVIDENCE:
1807
1837
  You may receive one or two screenshots:
1808
- 1. "FAILURE SCREENSHOT" — the state of the page at or near the point of failure in the current run.
1838
+ 1. "FAILURE SCREENSHOT" — a live screenshot captured at triage time, immediately after the test
1839
+ failed and while the page was still open. It reflects the true FINAL visual state of the page.
1809
1840
  2. "BASELINE SCREENSHOT" — the state of the page at the end of the most recent successful run of
1810
1841
  this same flow. This serves as a visual reference for what the page *should* look like.
1811
1842
 
1843
+ IMPORTANT — a screenshot is a single moment in time, not a recording:
1844
+ - Describe only what the frame shows. Do NOT assert that a state persisted for a duration — e.g.
1845
+ "stuck on a loading spinner THROUGHOUT the test", "the page never loaded", "remained on X the
1846
+ whole time". A single frame cannot establish how long anything lasted.
1847
+ - Only claim a persistent or temporal condition when it is corroborated by NON-visual evidence:
1848
+ tool-call outcomes/durations, error messages, or timeouts in failureContext. Absent that, state
1849
+ the end condition factually (e.g. "the final screenshot shows a loading spinner") and let the
1850
+ other evidence determine duration and cause.
1851
+
1812
1852
  When both screenshots are provided, compare them to:
1813
1853
  - Detect UI changes (redesigns, layout shifts, new modals) that would explain selector or cache failures.
1814
1854
  - Identify whether the failure screenshot shows a fundamentally different page state (error page, login wall)
@@ -1837,7 +1877,10 @@ When only the failure screenshot is provided (no baseline available), use it to:
1837
1877
  if (evidence.failureScreenshotPath) {
1838
1878
  try {
1839
1879
  const failureBytes = await fs.readFile(evidence.failureScreenshotPath);
1840
- userItems.push({ type: 'text', text: 'FAILURE SCREENSHOT (current run):' }, { type: 'png', bytes: new Uint8Array(failureBytes) });
1880
+ userItems.push({
1881
+ type: 'text',
1882
+ text: 'FAILURE SCREENSHOT (live capture at triage time — true final state of the page):',
1883
+ }, { type: 'png', bytes: new Uint8Array(failureBytes) });
1841
1884
  }
1842
1885
  catch (screenshotError) {
1843
1886
  Logger_1.appLogger.debug('Failed to load failure screenshot for GPT triage, proceeding with text only.', screenshotError);
@@ -843,6 +843,40 @@ async function attachStepScreenshots(sharedState, testInfo) {
843
843
  contentType: 'application/json',
844
844
  });
845
845
  }
846
+ /**
847
+ * Capture a live screenshot of the flow's final visual state at teardown (page
848
+ * still open) and persist it as a per-flow file — the single source of truth
849
+ * for "what the page looked like when this run ended." It is read both as the
850
+ * current run's failure screenshot (when this run failed) and as the baseline
851
+ * for a later failing run (when this run succeeded), keeping the two symmetric.
852
+ * See `fetchBaselineScreenshot` / `gatherTestFailureEvidence` in
853
+ * triageTestFailure.ts.
854
+ *
855
+ * Runs for any meaningful end state; skipped only for `skipped` tests (no real
856
+ * page state), when triage is disabled, or for V1 (legacy self-heal) tests.
857
+ * Best-effort and fails open.
858
+ */
859
+ async function captureAndPersistFinalState(page, testInfo) {
860
+ if (testInfo.status === 'skipped' ||
861
+ process.env.DONOBU_TRIAGE_DISABLED === '1' ||
862
+ isV1Test(testInfo)) {
863
+ return;
864
+ }
865
+ const flowId = page._dnb?.donobuFlowMetadata?.id;
866
+ const persistence = page._dnb?.persistence;
867
+ if (!flowId || !persistence) {
868
+ return;
869
+ }
870
+ try {
871
+ const screenshot = await (0, triageTestFailure_1.captureLivePageScreenshot)(page);
872
+ if (screenshot) {
873
+ await persistence.setFlowFile(flowId, triageTestFailure_1.TRIAGE_PERSISTENCE_FILE_IDS.finalStateScreenshot, screenshot);
874
+ }
875
+ }
876
+ catch (error) {
877
+ Logger_1.appLogger.error(`Failed to persist final-state screenshot for flow ${flowId}.`, error);
878
+ }
879
+ }
846
880
  async function finalizeTest(page, testInfo, logBuffer, videoOption) {
847
881
  const sharedState = page._dnb;
848
882
  // Kick off video persistence early in teardown. The actual file copy is
@@ -919,6 +953,10 @@ async function finalizeTest(page, testInfo, logBuffer, videoOption) {
919
953
  catch (error) {
920
954
  Logger_1.appLogger.error(`Error during cleanup for test ${testInfo.title}:`, error);
921
955
  }
956
+ // Capture the flow's final visual state before the status-specific handling
957
+ // below: triage (failed branch) reads it as the failure screenshot, and a
958
+ // future failing run reads a successful run's copy as its baseline.
959
+ await captureAndPersistFinalState(page, testInfo);
922
960
  if (testInfo.status === 'failed') {
923
961
  if (isV1Test(testInfo)) {
924
962
  if (isV1SelfHealingEnabled(testInfo) &&
@@ -28,9 +28,10 @@ import type { DonobuExtendedPage } from '../../page/DonobuExtendedPage';
28
28
  * history from the persistence layer.
29
29
  * 3. Fetches **historical runs** of the same flow (by name) from the flows manager to
30
30
  * detect flakiness, regression patterns, and prior self-heal success.
31
- * 4. Captures the **failure screenshot** (last tool call screenshot from the current
32
- * run) and the **baseline screenshot** (last tool call screenshot from the most
33
- * recent successful historical run) for visual comparison.
31
+ * 4. Captures the **failure screenshot** (a live screenshot taken at triage time, while
32
+ * the page is still open during teardown, so it reflects the true final state) and the
33
+ * **baseline screenshot** (the persisted final-state screenshot of the most recent successful
34
+ * historical run, or its last tool call screenshot as a fallback) for visual comparison.
34
35
  * 5. Reads the source of the failing test case for contextual grounding.
35
36
  * 6. Runs the **heuristic classifier** (`deriveHeuristicAssessment`) which uses
36
37
  * rule-based pattern matching over errors, tool calls, stale-cache indicators,
@@ -70,7 +71,7 @@ import type { DonobuExtendedPage } from '../../page/DonobuExtendedPage';
70
71
  * | Flow metadata | `DonobuExtendedPage._dnb` | Run mode, objective, allowed tools, timing |
71
72
  * | Stale cache indicators | Derived from above | Whether page.ai cache staleness is the root cause |
72
73
  * | Historical flow runs | `DonobuFlowsManager.getFlows` | Flakiness, regression patterns, prior self-heal |
73
- * | Failure screenshot | Last tool call screenshot | Visual state of the page when the failure occurred |
74
+ * | Failure screenshot | Live capture at triage time | True final visual state of the page when it failed |
74
75
  * | Baseline screenshot | Last successful run's screenshot | Visual reference for what the page *should* look like |
75
76
  * | Test source snippet | TypeScript AST parsing | The test's expectations and structure |
76
77
  *
@@ -408,6 +409,14 @@ declare const TRIAGE_PERSISTENCE_FILE_IDS: {
408
409
  readonly evidence: "triage-evidence.json";
409
410
  readonly failureScreenshot: "triage-failure-screenshot.png";
410
411
  readonly baselineScreenshot: "triage-baseline-screenshot.png";
412
+ /**
413
+ * Live screenshot of a flow's final visual state, captured at teardown while
414
+ * the page is still open. Persisted for failed as well as successful runs: a
415
+ * failing run reads its own copy as the failure screenshot, and a *later*
416
+ * failing run reads a successful run's copy as its baseline, so both sides of
417
+ * the comparison are live end-of-test captures. Keyed per flow, like browser state.
418
+ */
419
+ readonly finalStateScreenshot: "triage-final-state-screenshot.png";
411
420
  };
412
421
  /**
413
422
  * Compresses a set of historical flow runs into an aggregate summary compact
@@ -420,6 +429,19 @@ declare function summarizeFlowHistory(flowName: string, flows: FlowMetadata[]):
420
429
  * success, and whether the page.ai cache was recently validated.
421
430
  */
422
431
  declare function deriveHistoricalSignals(history: FlowHistorySummary): HistoricalSignals;
432
+ /**
433
+ * Captures a fresh screenshot of the page's current visual state. Called at
434
+ * teardown (failure triage and successful-run baseline capture) while the
435
+ * page/context is still open, so it reflects the true *end state* of the test.
436
+ *
437
+ * This is deliberately preferred over the last Donobu tool-call screenshot:
438
+ * Playwright `expect`/`waitFor` are not tool calls, so the last tool-call image
439
+ * can predate the failing assertion and capture a transient state (e.g. a
440
+ * loading spinner that has since resolved), which misleads the vision model.
441
+ * Fails open — returns null if the page is gone or unresponsive (crash, closed
442
+ * context, hang), in which case the caller proceeds without a screenshot.
443
+ */
444
+ declare function captureLivePageScreenshot(page: DonobuExtendedPage): Promise<Buffer | null>;
423
445
  /**
424
446
  * Builds the heuristic triage assessment by combining rule-based inference,
425
447
  * contextual flags, and derived remediation guidance ahead of GPT enrichment.
@@ -432,5 +454,5 @@ declare function deriveHeuristicAssessment(testInfo: TestInfo, errorSummaries: E
432
454
  declare function reconcileTreatmentPlan(plan: z.infer<typeof TreatmentPlan>, heuristics: HeuristicAssessment): z.infer<typeof TreatmentPlan>;
433
455
  declare function gatherTestFailureEvidence(testInfo: TestInfo, page: DonobuExtendedPage, options?: GatherTestFailureEvidenceOptions): Promise<GatherTestFailureEvidenceResult | null>;
434
456
  declare function generateTreatmentPlanFromEvidence(gptClient: GptClient, evidence: FailureEvidenceRecord): Promise<z.infer<typeof TreatmentPlan>>;
435
- export { type AdditionalDataRequest, AdditionalDataRequestSchema, type AutomationDirectives, deriveHeuristicAssessment, deriveHistoricalSignals, type ErrorSummary, type FailureEvidenceRecord, type FailureReason, FailureReasonSchema, type FlowHistorySummary, gatherTestFailureEvidence, type GatherTestFailureEvidenceOptions, type GatherTestFailureEvidenceResult, generateTreatmentPlanFromEvidence, type HeuristicAssessment, type HistoricalFlowRun, type HistoricalSignals, reconcileTreatmentPlan, type RemediationCategory, type RemediationStep, RemediationStepSchema, type SanitizedFlowMetadata, type SummarizedToolCall, summarizeFlowHistory, TreatmentPlan, TRIAGE_PERSISTENCE_FILE_IDS, };
457
+ export { type AdditionalDataRequest, AdditionalDataRequestSchema, type AutomationDirectives, captureLivePageScreenshot, deriveHeuristicAssessment, deriveHistoricalSignals, type ErrorSummary, type FailureEvidenceRecord, type FailureReason, FailureReasonSchema, type FlowHistorySummary, gatherTestFailureEvidence, type GatherTestFailureEvidenceOptions, type GatherTestFailureEvidenceResult, generateTreatmentPlanFromEvidence, type HeuristicAssessment, type HistoricalFlowRun, type HistoricalSignals, reconcileTreatmentPlan, type RemediationCategory, type RemediationStep, RemediationStepSchema, type SanitizedFlowMetadata, type SummarizedToolCall, summarizeFlowHistory, TreatmentPlan, TRIAGE_PERSISTENCE_FILE_IDS, };
436
458
  //# sourceMappingURL=triageTestFailure.d.ts.map
@@ -37,6 +37,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
37
37
  };
38
38
  Object.defineProperty(exports, "__esModule", { value: true });
39
39
  exports.TRIAGE_PERSISTENCE_FILE_IDS = exports.TreatmentPlan = exports.RemediationStepSchema = exports.FailureReasonSchema = exports.AdditionalDataRequestSchema = void 0;
40
+ exports.captureLivePageScreenshot = captureLivePageScreenshot;
40
41
  exports.deriveHeuristicAssessment = deriveHeuristicAssessment;
41
42
  exports.deriveHistoricalSignals = deriveHistoricalSignals;
42
43
  exports.gatherTestFailureEvidence = gatherTestFailureEvidence;
@@ -79,9 +80,10 @@ const cacheLocator_1 = require("../../ai/cache/cacheLocator");
79
80
  * history from the persistence layer.
80
81
  * 3. Fetches **historical runs** of the same flow (by name) from the flows manager to
81
82
  * detect flakiness, regression patterns, and prior self-heal success.
82
- * 4. Captures the **failure screenshot** (last tool call screenshot from the current
83
- * run) and the **baseline screenshot** (last tool call screenshot from the most
84
- * recent successful historical run) for visual comparison.
83
+ * 4. Captures the **failure screenshot** (a live screenshot taken at triage time, while
84
+ * the page is still open during teardown, so it reflects the true final state) and the
85
+ * **baseline screenshot** (the persisted final-state screenshot of the most recent successful
86
+ * historical run, or its last tool call screenshot as a fallback) for visual comparison.
85
87
  * 5. Reads the source of the failing test case for contextual grounding.
86
88
  * 6. Runs the **heuristic classifier** (`deriveHeuristicAssessment`) which uses
87
89
  * rule-based pattern matching over errors, tool calls, stale-cache indicators,
@@ -121,7 +123,7 @@ const cacheLocator_1 = require("../../ai/cache/cacheLocator");
121
123
  * | Flow metadata | `DonobuExtendedPage._dnb` | Run mode, objective, allowed tools, timing |
122
124
  * | Stale cache indicators | Derived from above | Whether page.ai cache staleness is the root cause |
123
125
  * | Historical flow runs | `DonobuFlowsManager.getFlows` | Flakiness, regression patterns, prior self-heal |
124
- * | Failure screenshot | Last tool call screenshot | Visual state of the page when the failure occurred |
126
+ * | Failure screenshot | Live capture at triage time | True final visual state of the page when it failed |
125
127
  * | Baseline screenshot | Last successful run's screenshot | Visual reference for what the page *should* look like |
126
128
  * | Test source snippet | TypeScript AST parsing | The test's expectations and structure |
127
129
  *
@@ -331,6 +333,14 @@ const TRIAGE_PERSISTENCE_FILE_IDS = {
331
333
  evidence: 'triage-evidence.json',
332
334
  failureScreenshot: 'triage-failure-screenshot.png',
333
335
  baselineScreenshot: 'triage-baseline-screenshot.png',
336
+ /**
337
+ * Live screenshot of a flow's final visual state, captured at teardown while
338
+ * the page is still open. Persisted for failed as well as successful runs: a
339
+ * failing run reads its own copy as the failure screenshot, and a *later*
340
+ * failing run reads a successful run's copy as its baseline, so both sides of
341
+ * the comparison are live end-of-test captures. Keyed per flow, like browser state.
342
+ */
343
+ finalStateScreenshot: 'triage-final-state-screenshot.png',
334
344
  };
335
345
  exports.TRIAGE_PERSISTENCE_FILE_IDS = TRIAGE_PERSISTENCE_FILE_IDS;
336
346
  /**
@@ -554,41 +564,59 @@ async function fetchFlowHistory(page) {
554
564
  }
555
565
  }
556
566
  /**
557
- * Retrieves the screenshot from the last completed tool call in the current flow.
558
- * Returns the raw PNG/JPEG buffer if available, or null. Fails open so triage
559
- * proceeds even if the screenshot cannot be loaded.
567
+ * Captures a fresh screenshot of the page's current visual state. Called at
568
+ * teardown (failure triage and successful-run baseline capture) while the
569
+ * page/context is still open, so it reflects the true *end state* of the test.
570
+ *
571
+ * This is deliberately preferred over the last Donobu tool-call screenshot:
572
+ * Playwright `expect`/`waitFor` are not tool calls, so the last tool-call image
573
+ * can predate the failing assertion and capture a transient state (e.g. a
574
+ * loading spinner that has since resolved), which misleads the vision model.
575
+ * Fails open — returns null if the page is gone or unresponsive (crash, closed
576
+ * context, hang), in which case the caller proceeds without a screenshot.
560
577
  */
561
- async function fetchLastToolCallScreenshot(page) {
562
- const flowId = page._dnb?.donobuFlowMetadata?.id;
563
- const persistence = page._dnb?.persistence;
564
- if (!flowId || !persistence) {
565
- return null;
566
- }
578
+ async function captureLivePageScreenshot(page) {
567
579
  try {
568
- const toolCalls = await persistence.getToolCalls(flowId);
569
- if (toolCalls.length === 0) {
570
- return null;
571
- }
572
- // Walk backwards to find the last tool call with a screenshot
573
- for (let i = toolCalls.length - 1; i >= 0; i--) {
574
- const screenshotId = toolCalls[i].postCallImageId;
575
- if (screenshotId) {
576
- return await persistence.getScreenShot(flowId, screenshotId);
577
- }
578
- }
579
- return null;
580
+ return await page.screenshot({ animations: 'disabled', timeout: 10000 });
580
581
  }
581
582
  catch (error) {
582
- Logger_1.appLogger.debug(`Failed to fetch last tool call screenshot for flow ${flowId}.`, error);
583
+ Logger_1.appLogger.debug('Failed to capture live page screenshot; proceeding without it.', error);
583
584
  return null;
584
585
  }
585
586
  }
586
587
  /**
587
- * Loads the final screenshot from a historical successful run to serve as a
588
- * visual baseline for comparison with the current failure state. This enables
589
- * the GPT triage agent to detect page redesigns and stale cache scenarios by
590
- * comparing "what the page looked like when it last worked" vs "what it looks
591
- * like now." Fails open — returns null if the screenshot cannot be retrieved.
588
+ * The failure screenshot for the current run. Prefers the final-state
589
+ * screenshot persisted at teardown (the single source of truth shared with
590
+ * baselines), and falls back to a live capture when it is missing — e.g. triage
591
+ * invoked outside the standard teardown, or the teardown capture failed.
592
+ */
593
+ async function fetchCurrentRunFinalStateScreenshot(page) {
594
+ const flowId = page._dnb?.donobuFlowMetadata?.id;
595
+ const persistence = page._dnb?.persistence;
596
+ if (flowId && persistence) {
597
+ try {
598
+ const persisted = await persistence.getFlowFile(flowId, TRIAGE_PERSISTENCE_FILE_IDS.finalStateScreenshot);
599
+ if (persisted) {
600
+ return persisted;
601
+ }
602
+ }
603
+ catch (error) {
604
+ Logger_1.appLogger.debug(`Failed to read persisted final-state screenshot for flow ${flowId}; falling back to a live capture.`, error);
605
+ }
606
+ }
607
+ return captureLivePageScreenshot(page);
608
+ }
609
+ /**
610
+ * Loads a baseline screenshot from a historical successful run so the GPT
611
+ * triage agent can compare "what the page looked like when it last worked" vs
612
+ * "what it looks like now" to detect redesigns and stale-cache scenarios.
613
+ *
614
+ * Prefers the persisted final-state screenshot (a live end-of-test capture
615
+ * written on successful runs) so the baseline is symmetric with the live
616
+ * failure screenshot — both true end states. Falls back to the last tool-call
617
+ * image for runs that predate final-state capture; that image can be a
618
+ * mid-flow frame, so callers should treat such baselines as approximate.
619
+ * Fails open — returns null if no screenshot can be retrieved.
592
620
  */
593
621
  async function fetchBaselineScreenshot(page, historicalFlowId) {
594
622
  const persistence = page._dnb?.persistence;
@@ -596,10 +624,12 @@ async function fetchBaselineScreenshot(page, historicalFlowId) {
596
624
  return null;
597
625
  }
598
626
  try {
599
- const toolCalls = await persistence.getToolCalls(historicalFlowId);
600
- if (toolCalls.length === 0) {
601
- return null;
627
+ const finalState = await persistence.getFlowFile(historicalFlowId, TRIAGE_PERSISTENCE_FILE_IDS.finalStateScreenshot);
628
+ if (finalState) {
629
+ return finalState;
602
630
  }
631
+ // Fallback for runs predating final-state capture: last tool-call image.
632
+ const toolCalls = await persistence.getToolCalls(historicalFlowId);
603
633
  for (let i = toolCalls.length - 1; i >= 0; i--) {
604
634
  const screenshotId = toolCalls[i].postCallImageId;
605
635
  if (screenshotId) {
@@ -1601,7 +1631,7 @@ async function gatherTestFailureEvidence(testInfo, page, options = {}) {
1601
1631
  // Capture screenshots for visual triage: current failure + baseline from last success
1602
1632
  const lastSuccessfulRunId = failureContext.flowHistory?.lastSuccessfulRunId ?? null;
1603
1633
  const [screenshotBuffer, baselineBuffer] = await Promise.all([
1604
- fetchLastToolCallScreenshot(page),
1634
+ fetchCurrentRunFinalStateScreenshot(page),
1605
1635
  lastSuccessfulRunId
1606
1636
  ? fetchBaselineScreenshot(page, lastSuccessfulRunId)
1607
1637
  : Promise.resolve(null),
@@ -1805,10 +1835,20 @@ passed to each tool invocation. Use these to improve diagnosis:
1805
1835
 
1806
1836
  SCREENSHOT EVIDENCE:
1807
1837
  You may receive one or two screenshots:
1808
- 1. "FAILURE SCREENSHOT" — the state of the page at or near the point of failure in the current run.
1838
+ 1. "FAILURE SCREENSHOT" — a live screenshot captured at triage time, immediately after the test
1839
+ failed and while the page was still open. It reflects the true FINAL visual state of the page.
1809
1840
  2. "BASELINE SCREENSHOT" — the state of the page at the end of the most recent successful run of
1810
1841
  this same flow. This serves as a visual reference for what the page *should* look like.
1811
1842
 
1843
+ IMPORTANT — a screenshot is a single moment in time, not a recording:
1844
+ - Describe only what the frame shows. Do NOT assert that a state persisted for a duration — e.g.
1845
+ "stuck on a loading spinner THROUGHOUT the test", "the page never loaded", "remained on X the
1846
+ whole time". A single frame cannot establish how long anything lasted.
1847
+ - Only claim a persistent or temporal condition when it is corroborated by NON-visual evidence:
1848
+ tool-call outcomes/durations, error messages, or timeouts in failureContext. Absent that, state
1849
+ the end condition factually (e.g. "the final screenshot shows a loading spinner") and let the
1850
+ other evidence determine duration and cause.
1851
+
1812
1852
  When both screenshots are provided, compare them to:
1813
1853
  - Detect UI changes (redesigns, layout shifts, new modals) that would explain selector or cache failures.
1814
1854
  - Identify whether the failure screenshot shows a fundamentally different page state (error page, login wall)
@@ -1837,7 +1877,10 @@ When only the failure screenshot is provided (no baseline available), use it to:
1837
1877
  if (evidence.failureScreenshotPath) {
1838
1878
  try {
1839
1879
  const failureBytes = await fs.readFile(evidence.failureScreenshotPath);
1840
- userItems.push({ type: 'text', text: 'FAILURE SCREENSHOT (current run):' }, { type: 'png', bytes: new Uint8Array(failureBytes) });
1880
+ userItems.push({
1881
+ type: 'text',
1882
+ text: 'FAILURE SCREENSHOT (live capture at triage time — true final state of the page):',
1883
+ }, { type: 'png', bytes: new Uint8Array(failureBytes) });
1841
1884
  }
1842
1885
  catch (screenshotError) {
1843
1886
  Logger_1.appLogger.debug('Failed to load failure screenshot for GPT triage, proceeding with text only.', screenshotError);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "donobu",
3
- "version": "5.46.0",
3
+ "version": "5.47.0",
4
4
  "description": "Create browser automations with an LLM agent and replay them as Playwright scripts.",
5
5
  "main": "dist/main.js",
6
6
  "module": "dist/esm/main.js",