autokap 1.8.5 → 1.8.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,12 @@ export interface ActionVerification {
12
12
  changes: ActionChange[];
13
13
  /** Summary for logging */
14
14
  summary: string;
15
+ /**
16
+ * AUT-240 (decision 2): the after-state was unreadable even after a settle +
17
+ * retry, so the effect was assumed as a last resort. The capture this taints
18
+ * is flagged low-confidence rather than failed.
19
+ */
20
+ lowConfidence?: boolean;
15
21
  }
16
22
  export type ActionChangeKind = 'url_changed' | 'tree_structure_changed' | 'node_appeared' | 'node_disappeared' | 'node_state_changed' | 'scroll_changed' | 'overlay_changed' | 'no_change';
17
23
  export interface ActionChange {
@@ -32,23 +32,36 @@ export class ActionVerifier {
32
32
  summary: 'no before state captured, assuming action had effect',
33
33
  };
34
34
  }
35
- let afterUrl;
36
- let afterTree;
35
+ // AUT-240 (decision 2): "assume OK, but smart". A first `page.evaluate`
36
+ // hiccup (typically mid-navigation) is no longer assumed-OK immediately —
37
+ // settle the page and retry once. Only if it still throws do we assume the
38
+ // effect, and then we flag the result low-confidence for downstream scrutiny.
39
+ let after = null;
37
40
  try {
38
- [afterUrl, afterTree] = await Promise.all([
39
- adapter.getCurrentUrl(),
40
- adapter.getAKTree(),
41
- ]);
41
+ const [url, tree] = await Promise.all([adapter.getCurrentUrl(), adapter.getAKTree()]);
42
+ after = { url, tree };
42
43
  }
43
44
  catch {
44
- this.beforeTree = null;
45
- this.beforeUrl = null;
46
- return {
47
- hadEffect: true,
48
- changes: [],
49
- summary: 'after state unavailable, skipping AKTree effect verification',
50
- };
45
+ try {
46
+ if (adapter.waitForVisuallyStable) {
47
+ await adapter.waitForVisuallyStable({ maxWaitMs: 2000 });
48
+ }
49
+ const [url, tree] = await Promise.all([adapter.getCurrentUrl(), adapter.getAKTree()]);
50
+ after = { url, tree };
51
+ }
52
+ catch {
53
+ this.beforeTree = null;
54
+ this.beforeUrl = null;
55
+ return {
56
+ hadEffect: true,
57
+ changes: [],
58
+ summary: 'after state unavailable after settle, assuming effect (low-confidence)',
59
+ lowConfidence: true,
60
+ };
61
+ }
51
62
  }
63
+ const afterUrl = after.url;
64
+ const afterTree = after.tree;
52
65
  const changes = [];
53
66
  // 1. URL change
54
67
  if (afterUrl !== this.beforeUrl) {
@@ -105,11 +118,11 @@ export class ActionVerifier {
105
118
  const beforeState = collectVisibleNodeState(this.beforeTree.root);
106
119
  const afterState = collectVisibleNodeState(afterTree.root);
107
120
  const changedStates = [];
108
- for (const [key, before] of beforeState) {
109
- const after = afterState.get(key);
110
- if (!after)
121
+ for (const [key, beforeNodeState] of beforeState) {
122
+ const afterNodeState = afterState.get(key);
123
+ if (!afterNodeState)
111
124
  continue;
112
- if (before !== after) {
125
+ if (beforeNodeState !== afterNodeState) {
113
126
  changedStates.push(key);
114
127
  if (changedStates.length >= 5)
115
128
  break;
package/dist/browser.d.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  import { type BrowserContext, type Page } from 'playwright';
2
2
  import type { AKNode, AKNodeRuntimeIndexEntry, AKTree, BrowserOptions, BrowserSessionStorageState, BrowserStorageState, InteractiveElement, ObservedRequest, OutscaleConfig, PageState, PageStateLite, ResolvedMock, SelectorValidationResult, VideoPageSignals } from './types.js';
3
+ import type { ProgressSnapshot, VisualStabilityResult } from './execution-types.js';
3
4
  export interface SelectorCaptureError {
4
5
  error: 'no_match' | 'ambiguous' | 'invisible' | 'zero_size';
5
6
  errorMessage: string;
@@ -103,6 +104,18 @@ export declare class Browser {
103
104
  private poolContext;
104
105
  private persistentContext;
105
106
  private ownedChromiumProfileDir;
107
+ /**
108
+ * Network-progress counters (AUT-240, Layer C). Maintained by request
109
+ * lifecycle listeners attached lazily and idempotently per page via
110
+ * `ensureProgressListeners()`, so every page-creation path is covered without
111
+ * touching each one. Count FIRST-PARTY traffic only (same site as the live
112
+ * main-frame origin) so background third-party telemetry does not read as
113
+ * progress. Read cheaply by `getProgressSnapshot()`.
114
+ */
115
+ private progressListenersPage;
116
+ private inflightRequests;
117
+ private networkEventCount;
118
+ private lastNetworkActivityAtMs;
106
119
  /**
107
120
  * Xvfb instance backing the headed Chromium used by clip capture on Cloud
108
121
  * Run with NVIDIA L4. Set when forClipCapture spawns Xvfb; null otherwise
@@ -168,6 +181,52 @@ export declare class Browser {
168
181
  */
169
182
  private waitForDomStability;
170
183
  private waitForFontsBeforeScreenshot;
184
+ /**
185
+ * Attach network-progress listeners to `page` once. Idempotent per page: a
186
+ * no-op if already attached, and resets counters only when the `Page` object
187
+ * itself changes (recreated context, pooled re-acquire). A `page.reload()`
188
+ * keeps the same object, so counters stay monotonic across it — harmless, as
189
+ * the watchdog only ever compares deltas via `hasProgress`. Covers every
190
+ * page-creation path without editing each one.
191
+ */
192
+ private ensureProgressListeners;
193
+ /**
194
+ * Cheap snapshot of page activity for the runner's progress watchdog
195
+ * (AUT-240, Layer C). Network counters are free (maintained by listeners);
196
+ * `readyState`/`domNodeCount` come from one light `evaluate`. Never rejects:
197
+ * an `evaluate` failure (navigation in flight) is itself progress, surfaced
198
+ * as `navigating: true`.
199
+ */
200
+ getProgressSnapshot(): Promise<ProgressSnapshot>;
201
+ /**
202
+ * Wait until the page is visually stable enough to screenshot (AUT-240,
203
+ * Layer B). Best-effort and non-blocking: composes light semantic signals —
204
+ * fonts ready, images loaded, no visible `[aria-busy]`/`[role=progressbar]`,
205
+ * DOM quiet — and only falls back to a bounded pixel-convergence check when
206
+ * loaders never clear. Returns `stable: false` with a reason instead of
207
+ * throwing or blocking; the caller captures anyway (a perpetual animation is
208
+ * cosmetic, not a reason to fail the capture — decision 1).
209
+ */
210
+ waitForVisuallyStable(options?: {
211
+ maxWaitMs?: number;
212
+ }): Promise<VisualStabilityResult>;
213
+ private remainingMs;
214
+ /** Poll until in-DOM images have finished loading, or the budget elapses. */
215
+ private waitForImagesSettled;
216
+ /**
217
+ * Poll until no semantically-marked loader is visible, or the deadline hits.
218
+ * Condition: zero visible elements matching
219
+ * `[aria-busy="true"], [role="progressbar"]:not([hidden])` — covers shadcn,
220
+ * Radix, MUI, etc. without a hardcoded class list. Returns true if cleared.
221
+ */
222
+ private waitForNoVisibleLoaders;
223
+ /**
224
+ * Bounded pixel-convergence fallback (AUT-240 decision 1): take up to
225
+ * `PIXEL_FALLBACK_MAX_PASSES` raw frames; if two consecutive frames are within
226
+ * `PIXEL_FALLBACK_DIFF_THRESHOLD`, the page is settled. Never blocks past the
227
+ * deadline; returns false (capture anyway) for perpetual animations.
228
+ */
229
+ private pixelConvergenceFallback;
171
230
  takeScreenshot(): Promise<Buffer>;
172
231
  takeScreenshotForAI(options?: {
173
232
  timeoutMs?: number;
package/dist/browser.js CHANGED
@@ -3,7 +3,9 @@ import sharp from 'sharp';
3
3
  import { createHash } from 'crypto';
4
4
  import { cp, mkdir, readFile, rm, writeFile } from 'fs/promises';
5
5
  import { join } from 'path';
6
+ import { DOM_QUIET_WINDOW_MS, GLOBAL_WAIT_CAP_MS, PIXEL_FALLBACK_DIFF_THRESHOLD, PIXEL_FALLBACK_MAX_PASSES, } from './wait-contract.js';
6
7
  import { buildAKNodeRuntimeIndex, deriveInteractiveElementsFromAKTree, disambiguateFingerprint, focusAKTree, fingerprintAKNode, serializeAKTree, } from './ak-tree.js';
8
+ import { isFirstPartyUrl } from './security.js';
7
9
  /**
8
10
  * Set-of-Marks (SoM) annotation: overlays colored [N] badges on each visible
9
11
  * interactive element so the vision model can reference elements by their badge index.
@@ -876,6 +878,29 @@ export function describeObservationChange(before, after) {
876
878
  : 'No visible state change detected after the action.',
877
879
  };
878
880
  }
881
/**
 * Resolve after `ms` milliseconds.
 *
 * @param ms Delay handed straight to `setTimeout`.
 * @returns A promise that settles when the timer fires.
 */
function delayMs(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
884
/**
 * Approximate how different two screenshot buffers are by comparing bytes at
 * evenly spaced offsets (AUT-240, Layer B pixel fallback). The buffers are
 * compared as raw bytes — nothing is decoded — so the result is only a cheap
 * "did anything move" signal.
 *
 * Offsets run over the LONGER buffer; an offset past the end of the shorter
 * one reads `undefined` and therefore counts as a difference.
 *
 * @param a First buffer (any indexable byte sequence with `length`).
 * @param b Second buffer.
 * @returns Fraction of sampled offsets whose bytes differ, in [0, 1]; 0 when
 *          both buffers are empty.
 */
function sampledBufferDiffRatio(a, b) {
    const span = Math.max(a.length, b.length);
    if (span === 0) {
        return 0;
    }
    // Aim for at most ~10k probes; the stride is floored, so slightly more
    // than 10k offsets may be visited on large buffers.
    const probeTarget = Math.min(10000, span);
    const stride = Math.max(1, Math.floor(span / probeTarget));
    let visited = 0;
    let differing = 0;
    let offset = 0;
    while (offset < span) {
        visited += 1;
        if (a[offset] !== b[offset]) {
            differing += 1;
        }
        offset += stride;
    }
    if (visited === 0) {
        return 0;
    }
    return differing / visited;
}
879
904
  export class Browser {
880
905
  options;
881
906
  browser = null;
@@ -886,6 +911,18 @@ export class Browser {
886
911
  poolContext = false;
887
912
  persistentContext = false;
888
913
  ownedChromiumProfileDir = null;
914
+ /**
915
+ * Network-progress counters (AUT-240, Layer C). Maintained by request
916
+ * lifecycle listeners attached lazily and idempotently per page via
917
+ * `ensureProgressListeners()`, so every page-creation path is covered without
918
+ * touching each one. Count FIRST-PARTY traffic only (same site as the live
919
+ * main-frame origin) so background third-party telemetry does not read as
920
+ * progress. Read cheaply by `getProgressSnapshot()`.
921
+ */
922
+ progressListenersPage = null;
923
+ inflightRequests = 0;
924
+ networkEventCount = 0;
925
+ lastNetworkActivityAtMs = 0;
889
926
  /**
890
927
  * Xvfb instance backing the headed Chromium used by clip capture on Cloud
891
928
  * Run with NVIDIA L4. Set when forClipCapture spawns Xvfb; null otherwise
@@ -1554,6 +1591,228 @@ export class Browser {
1554
1591
  logger.debug(`[capture] font paint gate resolved via=${gate.via} elapsedMs=${gate.elapsedMs} totalMs=${Date.now() - startedAt}`);
1555
1592
  }
1556
1593
  }
1594
+ /**
1595
+ * Attach network-progress listeners to `page` once. Idempotent per page: a
1596
+ * no-op if already attached, and resets counters only when the `Page` object
1597
+ * itself changes (recreated context, pooled re-acquire). A `page.reload()`
1598
+ * keeps the same object, so counters stay monotonic across it — harmless, as
1599
+ * the watchdog only ever compares deltas via `hasProgress`. Covers every
1600
+ * page-creation path without editing each one.
1601
+ */
1602
+ ensureProgressListeners(page) {
1603
+ if (this.progressListenersPage === page)
1604
+ return;
1605
+ this.progressListenersPage = page;
1606
+ this.inflightRequests = 0;
1607
+ this.networkEventCount = 0;
1608
+ this.lastNetworkActivityAtMs = Date.now();
1609
+ const bump = () => {
1610
+ this.networkEventCount++;
1611
+ this.lastNetworkActivityAtMs = Date.now();
1612
+ };
1613
+ // Only the app's OWN traffic counts as progress (AUT-240). Third-party
1614
+ // telemetry (PostHog beacons, analytics/ad pixels, Sentry, polling/ws-fallback
1615
+ // XHR to other origins) chatters indefinitely on a real app and would
1616
+ // otherwise keep `hasProgress` true forever, so the stuck watchdog could never
1617
+ // cut a wait whose condition will never be met. First-party-ness is decided
1618
+ // once, at request time, against the live main-frame origin; the same request
1619
+ // object is remembered so finished/failed decrement symmetrically even if the
1620
+ // page navigates mid-flight (a foreign decrement is simply skipped).
1621
+ const firstPartyInflight = new WeakSet();
1622
+ page.on('request', (request) => {
1623
+ if (!isFirstPartyUrl(page.url(), request.url()))
1624
+ return;
1625
+ firstPartyInflight.add(request);
1626
+ this.inflightRequests++;
1627
+ bump();
1628
+ });
1629
+ page.on('requestfinished', (request) => {
1630
+ if (!firstPartyInflight.has(request))
1631
+ return;
1632
+ this.inflightRequests = Math.max(0, this.inflightRequests - 1);
1633
+ bump();
1634
+ });
1635
+ page.on('requestfailed', (request) => {
1636
+ if (!firstPartyInflight.has(request))
1637
+ return;
1638
+ this.inflightRequests = Math.max(0, this.inflightRequests - 1);
1639
+ bump();
1640
+ });
1641
+ }
1642
+ /**
1643
+ * Cheap snapshot of page activity for the runner's progress watchdog
1644
+ * (AUT-240, Layer C). Network counters are free (maintained by listeners);
1645
+ * `readyState`/`domNodeCount` come from one light `evaluate`. Never rejects:
1646
+ * an `evaluate` failure (navigation in flight) is itself progress, surfaced
1647
+ * as `navigating: true`.
1648
+ */
1649
+ async getProgressSnapshot() {
1650
+ const page = this.ensurePage();
1651
+ this.ensureProgressListeners(page);
1652
+ let readyState = 'unknown';
1653
+ let domNodeCount = -1;
1654
+ let navigating = false;
1655
+ try {
1656
+ const probe = await page.evaluate(() => ({
1657
+ rs: document.readyState,
1658
+ n: document.getElementsByTagName('*').length,
1659
+ }));
1660
+ readyState = probe.rs;
1661
+ domNodeCount = probe.n;
1662
+ }
1663
+ catch {
1664
+ navigating = true;
1665
+ }
1666
+ return {
1667
+ networkEventCount: this.networkEventCount,
1668
+ inflightRequests: this.inflightRequests,
1669
+ lastNetworkActivityAtMs: this.lastNetworkActivityAtMs,
1670
+ readyState,
1671
+ domNodeCount,
1672
+ navigating,
1673
+ };
1674
+ }
1675
+ /**
1676
+ * Wait until the page is visually stable enough to screenshot (AUT-240,
1677
+ * Layer B). Best-effort and non-blocking: composes light semantic signals —
1678
+ * fonts ready, images loaded, no visible `[aria-busy]`/`[role=progressbar]`,
1679
+ * DOM quiet — and only falls back to a bounded pixel-convergence check when
1680
+ * loaders never clear. Returns `stable: false` with a reason instead of
1681
+ * throwing or blocking; the caller captures anyway (a perpetual animation is
1682
+ * cosmetic, not a reason to fail the capture — decision 1).
1683
+ */
1684
+ async waitForVisuallyStable(options) {
1685
+ const page = this.ensurePage();
1686
+ const startedAt = Date.now();
1687
+ const maxWait = Math.max(1000, options?.maxWaitMs ?? GLOBAL_WAIT_CAP_MS.screenshot);
1688
+ const deadline = startedAt + maxWait;
1689
+ // 1. Fonts + images painted (avoid FOUT / broken images).
1690
+ await this.waitForFontsBeforeScreenshot(page).catch(() => { });
1691
+ await this.waitForImagesSettled(page, Math.min(3000, this.remainingMs(deadline)));
1692
+ // 2. Semantic loaders cleared (replaces the legacy 8 hardcoded CSS classes).
1693
+ const loadersCleared = await this.waitForNoVisibleLoaders(page, deadline);
1694
+ // 3. DOM quiet.
1695
+ const domBudget = Math.min(3000, this.remainingMs(deadline));
1696
+ if (domBudget > 0) {
1697
+ await this.waitForDomStability(page, DOM_QUIET_WINDOW_MS, domBudget);
1698
+ }
1699
+ if (loadersCleared) {
1700
+ return { stable: true, reason: 'page visually stable', waitedMs: Date.now() - startedAt };
1701
+ }
1702
+ // 4. Loaders persisted ⇒ bounded pixel-convergence fallback (decision 1).
1703
+ // Two converging frames ⇒ the page is effectively static (e.g. a non-ARIA
1704
+ // spinner that finished); otherwise it is a perpetual animation — capture
1705
+ // anyway rather than block.
1706
+ const converged = await this.pixelConvergenceFallback(page, deadline);
1707
+ return converged
1708
+ ? {
1709
+ stable: true,
1710
+ reason: 'stable via pixel convergence (loaders not ARIA-marked)',
1711
+ waitedMs: Date.now() - startedAt,
1712
+ }
1713
+ : {
1714
+ stable: false,
1715
+ reason: 'loaders or animation still active at deadline; capturing anyway',
1716
+ waitedMs: Date.now() - startedAt,
1717
+ };
1718
+ }
1719
+ remainingMs(deadline) {
1720
+ return Math.max(0, deadline - Date.now());
1721
+ }
1722
+ /** Poll until in-DOM images have finished loading, or the budget elapses. */
1723
+ async waitForImagesSettled(page, budgetMs) {
1724
+ if (budgetMs <= 0)
1725
+ return;
1726
+ try {
1727
+ await page.evaluate((maxWait) => {
1728
+ return new Promise((resolve) => {
1729
+ const deadline = Date.now() + maxWait;
1730
+ const check = () => {
1731
+ const imgs = document.querySelectorAll('img');
1732
+ for (const img of imgs) {
1733
+ if (!img.src || img.src.startsWith('data:'))
1734
+ continue;
1735
+ if (img.width <= 1 && img.height <= 1)
1736
+ continue;
1737
+ if (!img.complete || img.naturalWidth === 0)
1738
+ return false;
1739
+ }
1740
+ return true;
1741
+ };
1742
+ if (check()) {
1743
+ resolve();
1744
+ return;
1745
+ }
1746
+ const iv = setInterval(() => {
1747
+ if (check() || Date.now() >= deadline) {
1748
+ clearInterval(iv);
1749
+ resolve();
1750
+ }
1751
+ }, 100);
1752
+ });
1753
+ }, budgetMs);
1754
+ }
1755
+ catch {
1756
+ // Page navigated during the wait — not stable yet, but best-effort.
1757
+ }
1758
+ }
1759
+ /**
1760
+ * Poll until no semantically-marked loader is visible, or the deadline hits.
1761
+ * Condition: zero visible elements matching
1762
+ * `[aria-busy="true"], [role="progressbar"]:not([hidden])` — covers shadcn,
1763
+ * Radix, MUI, etc. without a hardcoded class list. Returns true if cleared.
1764
+ */
1765
+ async waitForNoVisibleLoaders(page, deadline) {
1766
+ const selector = '[aria-busy="true"], [role="progressbar"]:not([hidden])';
1767
+ while (Date.now() < deadline) {
1768
+ let count = -1;
1769
+ try {
1770
+ count = await page.evaluate((sel) => {
1771
+ const isVisible = (el) => {
1772
+ const rect = el.getBoundingClientRect();
1773
+ if (rect.width === 0 || rect.height === 0)
1774
+ return false;
1775
+ const style = getComputedStyle(el);
1776
+ return style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0';
1777
+ };
1778
+ return Array.from(document.querySelectorAll(sel)).filter(isVisible).length;
1779
+ }, selector);
1780
+ }
1781
+ catch {
1782
+ count = -1; // navigation in flight — not settled, keep waiting
1783
+ }
1784
+ if (count === 0)
1785
+ return true;
1786
+ await delayMs(Math.min(150, this.remainingMs(deadline)));
1787
+ }
1788
+ return false;
1789
+ }
1790
+ /**
1791
+ * Bounded pixel-convergence fallback (AUT-240 decision 1): take up to
1792
+ * `PIXEL_FALLBACK_MAX_PASSES` raw frames; if two consecutive frames are within
1793
+ * `PIXEL_FALLBACK_DIFF_THRESHOLD`, the page is settled. Never blocks past the
1794
+ * deadline; returns false (capture anyway) for perpetual animations.
1795
+ */
1796
+ async pixelConvergenceFallback(page, deadline) {
1797
+ let prev = null;
1798
+ for (let pass = 0; pass < PIXEL_FALLBACK_MAX_PASSES; pass++) {
1799
+ if (Date.now() >= deadline)
1800
+ break;
1801
+ let frame;
1802
+ try {
1803
+ frame = await page.screenshot();
1804
+ }
1805
+ catch {
1806
+ return false;
1807
+ }
1808
+ if (prev && sampledBufferDiffRatio(prev, frame) <= PIXEL_FALLBACK_DIFF_THRESHOLD) {
1809
+ return true;
1810
+ }
1811
+ prev = frame;
1812
+ await delayMs(Math.min(250, this.remainingMs(deadline)));
1813
+ }
1814
+ return false;
1815
+ }
1557
1816
  async takeScreenshot() {
1558
1817
  const page = this.ensurePage();
1559
1818
  // Move cursor off-screen to avoid hover effects in screenshots
@@ -131,6 +131,11 @@ export interface ArtifactUploadMetadata {
131
131
  targetLabel?: string | null;
132
132
  programVersion?: number | null;
133
133
  compileFingerprint?: string | null;
134
+ engineVersion?: number | null;
135
+ programSchemaVersion?: number | null;
136
+ programSchemaVersionOrigin?: number | null;
137
+ cliVersion?: string | null;
138
+ programHash?: string | null;
134
139
  mediaMode: "screenshot" | "clip" | "video";
135
140
  mimeType: string;
136
141
  captureType: "fullpage" | "element";
@@ -19,7 +19,6 @@ export interface RecordableBrowserSettings {
19
19
  runtimeDeviceScaleFactor: number;
20
20
  }
21
21
  export declare function resolveRecordableBrowserSettings(program: ExecutionProgram, variant: VariantSpec): RecordableBrowserSettings;
22
- export declare function normalizeVideoCaptureProgram(program: ExecutionProgram): ExecutionProgram;
23
22
  export interface CLIRunnerOptions {
24
23
  /** Preset ID to run */
25
24
  presetId: string;