autokap 1.9.3 → 1.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,5 +54,15 @@ export interface OpcodeActionResult {
54
54
  * click instead of when the cursor was still travelling.
55
55
  */
56
56
  clickTimestampsMs?: number[];
57
+ /**
58
+ * For CAPTURE_SCREENSHOT: index in the variant's `artifacts` array of the
59
+ * screenshot this action just delivered. The deterministic capture pushes the
60
+ * artifact immediately; LLM enrichment (quality verification + alt text) runs
61
+ * afterwards in `executeOpcode` and mutates this artifact in place, so it can
62
+ * never void an already-captured screenshot or trip the action timeout.
63
+ */
64
+ captureArtifactIndex?: number;
65
+ /** For CAPTURE_SCREENSHOT: page URL at capture time, reused by enrichment. */
66
+ captureUrl?: string;
57
67
  }
58
68
  export declare function executeOpcodeCoreAction(opcode: ExecutionOpcode, adapter: RuntimeAdapter, context?: OpcodeActionContext): Promise<OpcodeActionResult>;
@@ -55,6 +55,15 @@ export class NoOpRecoveryChain {
55
55
  }
56
56
  const MIN_CLIP_FINALIZATION_TIMEOUT_MS = 30000;
57
57
  const DEFAULT_VIDEO_RECORDING_RESOLUTION = { width: 1920, height: 1080 };
58
+ /**
59
+ * The compiled per-opcode action budget. For CAPTURE_SCREENSHOT this governs
60
+ * ONLY the deterministic capture (visual stabilize + screenshot + favicon/title
61
+ * + artifact push); LLM enrichment (quality verification + alt text) runs AFTER
62
+ * the action under the global wait deadline (see `enrichCaptureArtifact`) and
63
+ * must never be folded back under this timeout — that is the regression this
64
+ * separation prevents. END_CLIP finalization gets a floor since muxing a
65
+ * recording is inherently slow.
66
+ */
58
67
  function resolveOpcodeTimeoutMs(opcode) {
59
68
  if (opcode.kind === 'END_CLIP') {
60
69
  return Math.max(opcode.timeoutMs, MIN_CLIP_FINALIZATION_TIMEOUT_MS);
@@ -543,6 +552,18 @@ async function executeOpcode(opcode, index, adapter, verifier, breaker, recovery
543
552
  await sleep(VIDEO_POST_ACTION_SETTLE_MS);
544
553
  }
545
554
  breaker.recordSuccess(index);
555
+ // Enrich the just-captured screenshot OFF the timed action path: quality
556
+ // verification + alt text run here, under the global wait deadline, and
557
+ // mutate the artifact in place — so slow LLM calls can never void a
558
+ // delivered screenshot (the failure this fixes). Recaptured screenshots from
559
+ // the recovery path are intentionally left un-enriched (recovery stays
560
+ // deterministic and fast).
561
+ if (opcode.kind === 'CAPTURE_SCREENSHOT'
562
+ && options.llmConfig
563
+ && result.captureArtifactIndex !== undefined
564
+ && result.captureUrl !== undefined) {
565
+ await enrichCaptureArtifact(artifacts[result.captureArtifactIndex], opcode, adapter, result.captureUrl, currentVariant, options, telemetry, globalDeadlineMs);
566
+ }
546
567
  return {
547
568
  opcodeIndex: index,
548
569
  kind: opcode.kind,
@@ -760,13 +781,7 @@ async function executeOpcodeAction(opcode, opcodeIndex, adapter, artifacts, tele
760
781
  // semantic loaders/DOM-quiet + bounded pixel fallback); fall back to the
761
782
  // legacy smart-wait for adapters that don't implement it. A page that
762
783
  // never fully settles (e.g. a perpetual animation) is captured anyway.
763
- const stabilize = (maxWaitMs) => adapter.waitForVisuallyStable
764
- ? adapter.waitForVisuallyStable({ maxWaitMs })
765
- : smartWaitForStability(adapter, { maxWaitMs }).then((r) => ({
766
- stable: r.stable,
767
- reason: r.waitedFor.join(', ') || 'unknown',
768
- }));
769
- const stability = await stabilize(5000);
784
+ const stability = await stabilizeForCapture(adapter, 5000);
770
785
  if (!stability.stable) {
771
786
  logger.debug(`[opcode ${opcodeIndex}] capturing despite unstable page: ${stability.reason}`);
772
787
  }
@@ -784,85 +799,7 @@ async function executeOpcodeAction(opcode, opcodeIndex, adapter, artifacts, tele
784
799
  }
785
800
  const captureLowConfidenceReason = lowConfidenceReasons.join('; ') || undefined;
786
801
  const captureUrl = await adapter.getCurrentUrl();
787
- const takeBuffer = async () => {
788
- if (opcode.elementSelector && adapter.takeElementScreenshot) {
789
- return adapter.takeElementScreenshot(opcode.elementSelector, opcode.outscale);
790
- }
791
- if (opcode.elementSelector) {
792
- throw new Error(`element capture requires adapter support for selector "${opcode.elementSelector}"`);
793
- }
794
- return adapter.takeScreenshot();
795
- };
796
- let buffer = await takeBuffer();
797
- if (runOptions?.llmConfig) {
798
- const verification = await verifyCaptureQuality(buffer, {
799
- expectedDescription: opcode.description,
800
- url: captureUrl,
801
- locale: currentVariant?.locale,
802
- theme: currentVariant?.theme,
803
- }, runOptions.llmConfig);
804
- if (verification.llmResult) {
805
- telemetry.llmCallCount++;
806
- telemetry.llmCostEur += verification.llmResult.costEur;
807
- telemetry.llmStepUsages.push({
808
- stepType: 'capture_verification',
809
- generationId: verification.llmResult.generationId,
810
- model: verification.llmResult.model,
811
- promptTokens: verification.llmResult.promptTokens,
812
- completionTokens: verification.llmResult.completionTokens,
813
- });
814
- }
815
- if (!verification.passed) {
816
- await stabilize(8000);
817
- const retryBuffer = await takeBuffer();
818
- const retryVerification = await verifyCaptureQuality(retryBuffer, {
819
- expectedDescription: opcode.description,
820
- url: captureUrl,
821
- locale: currentVariant?.locale,
822
- theme: currentVariant?.theme,
823
- }, runOptions.llmConfig);
824
- if (retryVerification.llmResult) {
825
- telemetry.llmCallCount++;
826
- telemetry.llmCostEur += retryVerification.llmResult.costEur;
827
- telemetry.llmStepUsages.push({
828
- stepType: 'capture_verification',
829
- generationId: retryVerification.llmResult.generationId,
830
- model: retryVerification.llmResult.model,
831
- promptTokens: retryVerification.llmResult.promptTokens,
832
- completionTokens: retryVerification.llmResult.completionTokens,
833
- });
834
- }
835
- if (retryVerification.passed) {
836
- buffer = retryBuffer;
837
- }
838
- }
839
- }
840
- let altText;
841
- if (runOptions?.llmConfig) {
842
- try {
843
- const altResult = await generateAltText(buffer, {
844
- description: opcode.description,
845
- url: captureUrl,
846
- locale: currentVariant?.locale,
847
- presetName: runOptions.presetName,
848
- }, runOptions.llmConfig);
849
- altText = altResult.altText;
850
- if (altResult.llmResult) {
851
- telemetry.llmCallCount++;
852
- telemetry.llmCostEur += altResult.llmResult.costEur;
853
- telemetry.llmStepUsages.push({
854
- stepType: 'alt_text_generation',
855
- generationId: altResult.llmResult.generationId,
856
- model: altResult.llmResult.model,
857
- promptTokens: altResult.llmResult.promptTokens,
858
- completionTokens: altResult.llmResult.completionTokens,
859
- });
860
- }
861
- }
862
- catch {
863
- // Alt text generation failed — non-fatal
864
- }
865
- }
802
+ const buffer = await takeCaptureBuffer(adapter, opcode);
866
803
  // Extract page favicon for browser bar mockup
867
804
  let tabIconData;
868
805
  let tabIconMimeType;
@@ -892,7 +829,7 @@ async function executeOpcodeAction(opcode, opcodeIndex, adapter, artifacts, tele
892
829
  captureId: opcode.captureId,
893
830
  captureName: opcode.captureName ?? opcode.description,
894
831
  elementSelector: opcode.elementSelector,
895
- altText,
832
+ altText: undefined,
896
833
  stepDescription: opcode.description,
897
834
  stepIndex: opcodeIndex,
898
835
  variantId: currentVariant?.id,
@@ -901,7 +838,11 @@ async function executeOpcodeAction(opcode, opcodeIndex, adapter, artifacts, tele
901
838
  lowConfidence: captureLowConfidence || undefined,
902
839
  lowConfidenceReason: captureLowConfidence ? captureLowConfidenceReason : undefined,
903
840
  });
904
- break;
841
+ // Deliver the screenshot artifact NOW. LLM enrichment (quality
842
+ // verification + alt text) runs afterwards in `executeOpcode`, off this
843
+ // timed action path, and mutates the artifact in place — it can never
844
+ // void a captured screenshot or push the action past its timeout.
845
+ return { success: true, captureArtifactIndex: artifacts.length - 1, captureUrl };
905
846
  }
906
847
  case 'BEGIN_CLIP': {
907
848
  if (executionState.activeClip) {
@@ -967,6 +908,127 @@ async function executeOpcodeAction(opcode, opcodeIndex, adapter, artifacts, tele
967
908
  };
968
909
  }
969
910
  }
911
+ // ── Capture: deterministic capture + best-effort enrichment ─────────
/**
 * Upper bound, in milliseconds, on the visual re-stabilization wait that
 * precedes the single re-shoot after a failed capture quality verification.
 * The call site additionally caps the wait to whatever remains of the global
 * deadline, so this is a ceiling, not a guaranteed wait.
 *
 * Sized for the post-action enrichment budget (under the ~30s global deadline),
 * not the tight per-opcode action timeout — it only needs to clear transient
 * skeletons/spinners before a second look. Was 8000ms when this ran inside the
 * 10s action; that starved the capture and is exactly what this separation
 * removes.
 */
const VERIFY_RETRY_STABILIZE_MS = 3000;
/**
 * Wait for the page to look visually stable before a screenshot is taken.
 * Shared by the capture action and the verification retry.
 *
 * Uses the adapter's own `waitForVisuallyStable` when it provides one;
 * otherwise falls back to the legacy smart-wait and reshapes its result into
 * the same `{ stable, reason }` form. An unstable verdict is reported to the
 * caller, not raised — a page that never settles is captured anyway.
 *
 * @param adapter Runtime adapter driving the page.
 * @param maxWaitMs Maximum time to wait for stability, in milliseconds.
 * @returns A promise of `{ stable, reason }` on the fallback path; on the
 *   adapter path, whatever `waitForVisuallyStable` returns (presumably the
 *   same shape — confirm against the adapter contract).
 */
function stabilizeForCapture(adapter, maxWaitMs) {
    if (adapter.waitForVisuallyStable) {
        return adapter.waitForVisuallyStable({ maxWaitMs });
    }
    // Legacy path: adapt the smart-wait outcome to the verdict shape above.
    const toStabilityVerdict = (outcome) => {
        const stable = outcome.stable;
        const reason = outcome.waitedFor.join(', ') || 'unknown';
        return { stable, reason };
    };
    return smartWaitForStability(adapter, { maxWaitMs }).then(toStabilityVerdict);
}
930
+ /** Take the screenshot buffer for a CAPTURE_SCREENSHOT opcode (element clip or
931
+ * full page). Shared by the capture action and the verify retry. */
932
+ function takeCaptureBuffer(adapter, opcode) {
933
+ if (opcode.elementSelector && adapter.takeElementScreenshot) {
934
+ return adapter.takeElementScreenshot(opcode.elementSelector, opcode.outscale);
935
+ }
936
+ if (opcode.elementSelector) {
937
+ throw new Error(`element capture requires adapter support for selector "${opcode.elementSelector}"`);
938
+ }
939
+ return adapter.takeScreenshot();
940
+ }
941
+ function recordCaptureVerificationTelemetry(telemetry, llmResult) {
942
+ if (!llmResult)
943
+ return;
944
+ telemetry.llmCallCount++;
945
+ telemetry.llmCostEur += llmResult.costEur;
946
+ telemetry.llmStepUsages.push({
947
+ stepType: 'capture_verification',
948
+ generationId: llmResult.generationId,
949
+ model: llmResult.model,
950
+ promptTokens: llmResult.promptTokens,
951
+ completionTokens: llmResult.completionTokens,
952
+ });
953
+ }
/**
 * Best-effort LLM enrichment of an already-delivered screenshot artifact:
 * quality verification (with one budget-bounded re-stabilize + re-shoot) and
 * alt text. Runs AFTER the deterministic capture action, off its timed path,
 * and mutates the artifact in place. By construction it never changes
 * `artifacts.length`, so it can never turn a captured screenshot into a
 * "no artifact" failure.
 *
 * Bounded by the remaining global wait budget. If the budget elapses mid-call
 * the in-flight work is orphaned; the `settled` flag then
 *   - turns any late in-place write into a no-op, and
 *   - stops the orphaned work from starting further adapter operations
 *     (re-stabilize, re-shoot) or further LLM calls, so it cannot interfere
 *     with whatever the runner does next.
 * Usage of LLM calls that do complete is still recorded in telemetry, even
 * when they complete late.
 *
 * Verification and alt text are independent: a failure while verifying or
 * re-shooting is logged and does not prevent alt-text generation.
 *
 * Never throws; every failure is logged at debug level or swallowed.
 *
 * @param artifact The delivered artifact; `buffer` and `altText` may be
 *   replaced in place. A missing artifact is skipped.
 * @param opcode The CAPTURE_SCREENSHOT opcode (`description` is sent to the
 *   LLM; the opcode is reused for the re-shoot).
 * @param adapter Runtime adapter used for the re-stabilize and re-shoot.
 * @param captureUrl Page URL recorded at capture time.
 * @param currentVariant Optional variant; `locale` and `theme` are read.
 * @param runOptions Run options; enrichment is skipped without `llmConfig`.
 * @param telemetry Mutable telemetry accumulator for LLM usage.
 * @param globalDeadlineMs Deadline value handed to `getRemainingTimeMs`
 *   (presumably an absolute timestamp — confirm against that helper).
 */
async function enrichCaptureArtifact(artifact, opcode, adapter, captureUrl, currentVariant, runOptions, telemetry, globalDeadlineMs) {
    const llmConfig = runOptions.llmConfig;
    if (!llmConfig)
        return;
    if (!artifact) {
        // Defensive: the reported artifact index did not resolve to an artifact.
        logger.debug('[opcode CAPTURE_SCREENSHOT] enrichment skipped: no artifact to enrich');
        return;
    }
    const budgetMs = getRemainingTimeMs(globalDeadlineMs);
    if (budgetMs <= 0)
        return;
    // Flipped in `finally`; once true, orphaned work must neither write to the
    // artifact nor start new adapter/LLM operations.
    let settled = false;
    const verificationContext = {
        expectedDescription: opcode.description,
        url: captureUrl,
        locale: currentVariant?.locale,
        theme: currentVariant?.theme,
    };
    // Verify the delivered buffer; on a failed verdict give the page a brief
    // settle and re-shoot once. The buffer is swapped in place only if the
    // second shot verifies clean.
    const verifyAndMaybeReshoot = async () => {
        const verification = await verifyCaptureQuality(artifact.buffer, verificationContext, llmConfig);
        recordCaptureVerificationTelemetry(telemetry, verification.llmResult);
        if (verification.passed || settled)
            return;
        const retryStabilizeMs = Math.min(VERIFY_RETRY_STABILIZE_MS, getRemainingTimeMs(globalDeadlineMs));
        if (retryStabilizeMs <= 0)
            return;
        await stabilizeForCapture(adapter, retryStabilizeMs);
        // Budget may have elapsed while settling: do not touch the adapter again.
        if (settled)
            return;
        const retryBuffer = await takeCaptureBuffer(adapter, opcode);
        if (settled)
            return;
        const retryVerification = await verifyCaptureQuality(retryBuffer, verificationContext, llmConfig);
        recordCaptureVerificationTelemetry(telemetry, retryVerification.llmResult);
        if (retryVerification.passed && !settled) {
            artifact.buffer = retryBuffer;
        }
    };
    // Describe whichever buffer the artifact holds after verification.
    const generateAndStoreAltText = async () => {
        const altResult = await generateAltText(artifact.buffer, {
            description: opcode.description,
            url: captureUrl,
            locale: currentVariant?.locale,
            presetName: runOptions.presetName,
        }, llmConfig);
        if (!settled)
            artifact.altText = altResult.altText;
        // The call was made and billed, so its usage is recorded even if late.
        if (altResult.llmResult) {
            telemetry.llmCallCount++;
            telemetry.llmCostEur += altResult.llmResult.costEur;
            telemetry.llmStepUsages.push({
                stepType: 'alt_text_generation',
                generationId: altResult.llmResult.generationId,
                model: altResult.llmResult.model,
                promptTokens: altResult.llmResult.promptTokens,
                completionTokens: altResult.llmResult.completionTokens,
            });
        }
    };
    try {
        await withTimeout(async () => {
            try {
                await verifyAndMaybeReshoot();
            }
            catch (err) {
                // Verification is best-effort and independent of alt text.
                logger.debug(`[opcode CAPTURE_SCREENSHOT] verification skipped: ${err instanceof Error ? err.message : String(err)}`);
            }
            // Do not start another LLM call once the budget has elapsed.
            if (settled)
                return;
            try {
                await generateAndStoreAltText();
            }
            catch {
                // Alt text generation failed — non-fatal.
            }
        }, budgetMs);
    }
    catch (err) {
        logger.debug(`[opcode CAPTURE_SCREENSHOT] enrichment skipped: ${err instanceof Error ? err.message : String(err)}`);
    }
    finally {
        settled = true;
    }
}
970
1032
  /**
971
1033
  * Snapshot per-opcode timing + element bbox before the action runs. Returns
972
1034
  * null when no timing should be emitted (mediaMode != video, or no active
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "autokap",
3
- "version": "1.9.3",
3
+ "version": "1.9.4",
4
4
  "description": "AI-powered CLI tool for capturing clean screenshots of websites",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",