@predicatelabs/sdk 0.99.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +24 -0
- package/README.md +252 -0
- package/dist/actions.d.ts +185 -0
- package/dist/actions.d.ts.map +1 -0
- package/dist/actions.js +1120 -0
- package/dist/actions.js.map +1 -0
- package/dist/agent-runtime.d.ts +352 -0
- package/dist/agent-runtime.d.ts.map +1 -0
- package/dist/agent-runtime.js +1170 -0
- package/dist/agent-runtime.js.map +1 -0
- package/dist/agent.d.ts +164 -0
- package/dist/agent.d.ts.map +1 -0
- package/dist/agent.js +408 -0
- package/dist/agent.js.map +1 -0
- package/dist/asserts/expect.d.ts +159 -0
- package/dist/asserts/expect.d.ts.map +1 -0
- package/dist/asserts/expect.js +547 -0
- package/dist/asserts/expect.js.map +1 -0
- package/dist/asserts/index.d.ts +58 -0
- package/dist/asserts/index.d.ts.map +1 -0
- package/dist/asserts/index.js +70 -0
- package/dist/asserts/index.js.map +1 -0
- package/dist/asserts/query.d.ts +199 -0
- package/dist/asserts/query.d.ts.map +1 -0
- package/dist/asserts/query.js +288 -0
- package/dist/asserts/query.js.map +1 -0
- package/dist/backends/actions.d.ts +119 -0
- package/dist/backends/actions.d.ts.map +1 -0
- package/dist/backends/actions.js +291 -0
- package/dist/backends/actions.js.map +1 -0
- package/dist/backends/browser-use-adapter.d.ts +131 -0
- package/dist/backends/browser-use-adapter.d.ts.map +1 -0
- package/dist/backends/browser-use-adapter.js +219 -0
- package/dist/backends/browser-use-adapter.js.map +1 -0
- package/dist/backends/cdp-backend.d.ts +66 -0
- package/dist/backends/cdp-backend.d.ts.map +1 -0
- package/dist/backends/cdp-backend.js +273 -0
- package/dist/backends/cdp-backend.js.map +1 -0
- package/dist/backends/index.d.ts +80 -0
- package/dist/backends/index.d.ts.map +1 -0
- package/dist/backends/index.js +101 -0
- package/dist/backends/index.js.map +1 -0
- package/dist/backends/protocol.d.ts +156 -0
- package/dist/backends/protocol.d.ts.map +1 -0
- package/dist/backends/protocol.js +16 -0
- package/dist/backends/protocol.js.map +1 -0
- package/dist/backends/sentience-context.d.ts +143 -0
- package/dist/backends/sentience-context.d.ts.map +1 -0
- package/dist/backends/sentience-context.js +359 -0
- package/dist/backends/sentience-context.js.map +1 -0
- package/dist/backends/snapshot.d.ts +188 -0
- package/dist/backends/snapshot.d.ts.map +1 -0
- package/dist/backends/snapshot.js +360 -0
- package/dist/backends/snapshot.js.map +1 -0
- package/dist/browser.d.ts +154 -0
- package/dist/browser.d.ts.map +1 -0
- package/dist/browser.js +920 -0
- package/dist/browser.js.map +1 -0
- package/dist/canonicalization.d.ts +126 -0
- package/dist/canonicalization.d.ts.map +1 -0
- package/dist/canonicalization.js +161 -0
- package/dist/canonicalization.js.map +1 -0
- package/dist/captcha/strategies.d.ts +12 -0
- package/dist/captcha/strategies.d.ts.map +1 -0
- package/dist/captcha/strategies.js +43 -0
- package/dist/captcha/strategies.js.map +1 -0
- package/dist/captcha/types.d.ts +45 -0
- package/dist/captcha/types.d.ts.map +1 -0
- package/dist/captcha/types.js +12 -0
- package/dist/captcha/types.js.map +1 -0
- package/dist/cli.d.ts +5 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +422 -0
- package/dist/cli.js.map +1 -0
- package/dist/conversational-agent.d.ts +123 -0
- package/dist/conversational-agent.d.ts.map +1 -0
- package/dist/conversational-agent.js +341 -0
- package/dist/conversational-agent.js.map +1 -0
- package/dist/cursor-policy.d.ts +41 -0
- package/dist/cursor-policy.d.ts.map +1 -0
- package/dist/cursor-policy.js +81 -0
- package/dist/cursor-policy.js.map +1 -0
- package/dist/debugger.d.ts +28 -0
- package/dist/debugger.d.ts.map +1 -0
- package/dist/debugger.js +107 -0
- package/dist/debugger.js.map +1 -0
- package/dist/expect.d.ts +16 -0
- package/dist/expect.d.ts.map +1 -0
- package/dist/expect.js +67 -0
- package/dist/expect.js.map +1 -0
- package/dist/failure-artifacts.d.ts +95 -0
- package/dist/failure-artifacts.d.ts.map +1 -0
- package/dist/failure-artifacts.js +805 -0
- package/dist/failure-artifacts.js.map +1 -0
- package/dist/generator.d.ts +16 -0
- package/dist/generator.d.ts.map +1 -0
- package/dist/generator.js +205 -0
- package/dist/generator.js.map +1 -0
- package/dist/index.d.ts +37 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +160 -0
- package/dist/index.js.map +1 -0
- package/dist/inspector.d.ts +13 -0
- package/dist/inspector.d.ts.map +1 -0
- package/dist/inspector.js +153 -0
- package/dist/inspector.js.map +1 -0
- package/dist/llm-provider.d.ts +144 -0
- package/dist/llm-provider.d.ts.map +1 -0
- package/dist/llm-provider.js +460 -0
- package/dist/llm-provider.js.map +1 -0
- package/dist/ordinal.d.ts +90 -0
- package/dist/ordinal.d.ts.map +1 -0
- package/dist/ordinal.js +249 -0
- package/dist/ordinal.js.map +1 -0
- package/dist/overlay.d.ts +63 -0
- package/dist/overlay.d.ts.map +1 -0
- package/dist/overlay.js +102 -0
- package/dist/overlay.js.map +1 -0
- package/dist/protocols/browser-protocol.d.ts +79 -0
- package/dist/protocols/browser-protocol.d.ts.map +1 -0
- package/dist/protocols/browser-protocol.js +9 -0
- package/dist/protocols/browser-protocol.js.map +1 -0
- package/dist/query.d.ts +66 -0
- package/dist/query.d.ts.map +1 -0
- package/dist/query.js +482 -0
- package/dist/query.js.map +1 -0
- package/dist/read.d.ts +47 -0
- package/dist/read.d.ts.map +1 -0
- package/dist/read.js +128 -0
- package/dist/read.js.map +1 -0
- package/dist/recorder.d.ts +44 -0
- package/dist/recorder.d.ts.map +1 -0
- package/dist/recorder.js +262 -0
- package/dist/recorder.js.map +1 -0
- package/dist/runtime-agent.d.ts +72 -0
- package/dist/runtime-agent.d.ts.map +1 -0
- package/dist/runtime-agent.js +357 -0
- package/dist/runtime-agent.js.map +1 -0
- package/dist/screenshot.d.ts +17 -0
- package/dist/screenshot.d.ts.map +1 -0
- package/dist/screenshot.js +40 -0
- package/dist/screenshot.js.map +1 -0
- package/dist/snapshot-diff.d.ts +23 -0
- package/dist/snapshot-diff.d.ts.map +1 -0
- package/dist/snapshot-diff.js +119 -0
- package/dist/snapshot-diff.js.map +1 -0
- package/dist/snapshot.d.ts +47 -0
- package/dist/snapshot.d.ts.map +1 -0
- package/dist/snapshot.js +358 -0
- package/dist/snapshot.js.map +1 -0
- package/dist/textSearch.d.ts +64 -0
- package/dist/textSearch.d.ts.map +1 -0
- package/dist/textSearch.js +113 -0
- package/dist/textSearch.js.map +1 -0
- package/dist/tools/context.d.ts +18 -0
- package/dist/tools/context.d.ts.map +1 -0
- package/dist/tools/context.js +40 -0
- package/dist/tools/context.js.map +1 -0
- package/dist/tools/defaults.d.ts +5 -0
- package/dist/tools/defaults.d.ts.map +1 -0
- package/dist/tools/defaults.js +368 -0
- package/dist/tools/defaults.js.map +1 -0
- package/dist/tools/filesystem.d.ts +12 -0
- package/dist/tools/filesystem.d.ts.map +1 -0
- package/dist/tools/filesystem.js +137 -0
- package/dist/tools/filesystem.js.map +1 -0
- package/dist/tools/index.d.ts +5 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +15 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/tools/registry.d.ts +38 -0
- package/dist/tools/registry.d.ts.map +1 -0
- package/dist/tools/registry.js +100 -0
- package/dist/tools/registry.js.map +1 -0
- package/dist/tracing/cloud-sink.d.ts +189 -0
- package/dist/tracing/cloud-sink.d.ts.map +1 -0
- package/dist/tracing/cloud-sink.js +1067 -0
- package/dist/tracing/cloud-sink.js.map +1 -0
- package/dist/tracing/index-schema.d.ts +231 -0
- package/dist/tracing/index-schema.d.ts.map +1 -0
- package/dist/tracing/index-schema.js +235 -0
- package/dist/tracing/index-schema.js.map +1 -0
- package/dist/tracing/index.d.ts +12 -0
- package/dist/tracing/index.d.ts.map +1 -0
- package/dist/tracing/index.js +28 -0
- package/dist/tracing/index.js.map +1 -0
- package/dist/tracing/indexer.d.ts +20 -0
- package/dist/tracing/indexer.d.ts.map +1 -0
- package/dist/tracing/indexer.js +347 -0
- package/dist/tracing/indexer.js.map +1 -0
- package/dist/tracing/jsonl-sink.d.ts +51 -0
- package/dist/tracing/jsonl-sink.d.ts.map +1 -0
- package/dist/tracing/jsonl-sink.js +329 -0
- package/dist/tracing/jsonl-sink.js.map +1 -0
- package/dist/tracing/sink.d.ts +25 -0
- package/dist/tracing/sink.d.ts.map +1 -0
- package/dist/tracing/sink.js +15 -0
- package/dist/tracing/sink.js.map +1 -0
- package/dist/tracing/tracer-factory.d.ts +102 -0
- package/dist/tracing/tracer-factory.d.ts.map +1 -0
- package/dist/tracing/tracer-factory.js +375 -0
- package/dist/tracing/tracer-factory.js.map +1 -0
- package/dist/tracing/tracer.d.ts +140 -0
- package/dist/tracing/tracer.d.ts.map +1 -0
- package/dist/tracing/tracer.js +336 -0
- package/dist/tracing/tracer.js.map +1 -0
- package/dist/tracing/types.d.ts +203 -0
- package/dist/tracing/types.d.ts.map +1 -0
- package/dist/tracing/types.js +8 -0
- package/dist/tracing/types.js.map +1 -0
- package/dist/types.d.ts +422 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/action-executor.d.ts +25 -0
- package/dist/utils/action-executor.d.ts.map +1 -0
- package/dist/utils/action-executor.js +121 -0
- package/dist/utils/action-executor.js.map +1 -0
- package/dist/utils/browser-evaluator.d.ts +76 -0
- package/dist/utils/browser-evaluator.d.ts.map +1 -0
- package/dist/utils/browser-evaluator.js +130 -0
- package/dist/utils/browser-evaluator.js.map +1 -0
- package/dist/utils/browser.d.ts +30 -0
- package/dist/utils/browser.d.ts.map +1 -0
- package/dist/utils/browser.js +75 -0
- package/dist/utils/browser.js.map +1 -0
- package/dist/utils/element-filter.d.ts +76 -0
- package/dist/utils/element-filter.d.ts.map +1 -0
- package/dist/utils/element-filter.js +195 -0
- package/dist/utils/element-filter.js.map +1 -0
- package/dist/utils/grid-utils.d.ts +37 -0
- package/dist/utils/grid-utils.d.ts.map +1 -0
- package/dist/utils/grid-utils.js +283 -0
- package/dist/utils/grid-utils.js.map +1 -0
- package/dist/utils/llm-interaction-handler.d.ts +41 -0
- package/dist/utils/llm-interaction-handler.d.ts.map +1 -0
- package/dist/utils/llm-interaction-handler.js +171 -0
- package/dist/utils/llm-interaction-handler.js.map +1 -0
- package/dist/utils/llm-response-builder.d.ts +56 -0
- package/dist/utils/llm-response-builder.d.ts.map +1 -0
- package/dist/utils/llm-response-builder.js +130 -0
- package/dist/utils/llm-response-builder.js.map +1 -0
- package/dist/utils/selector-utils.d.ts +12 -0
- package/dist/utils/selector-utils.d.ts.map +1 -0
- package/dist/utils/selector-utils.js +32 -0
- package/dist/utils/selector-utils.js.map +1 -0
- package/dist/utils/snapshot-event-builder.d.ts +28 -0
- package/dist/utils/snapshot-event-builder.d.ts.map +1 -0
- package/dist/utils/snapshot-event-builder.js +88 -0
- package/dist/utils/snapshot-event-builder.js.map +1 -0
- package/dist/utils/snapshot-processor.d.ts +27 -0
- package/dist/utils/snapshot-processor.d.ts.map +1 -0
- package/dist/utils/snapshot-processor.js +47 -0
- package/dist/utils/snapshot-processor.js.map +1 -0
- package/dist/utils/trace-event-builder.d.ts +122 -0
- package/dist/utils/trace-event-builder.d.ts.map +1 -0
- package/dist/utils/trace-event-builder.js +365 -0
- package/dist/utils/trace-event-builder.js.map +1 -0
- package/dist/utils/trace-file-manager.d.ts +70 -0
- package/dist/utils/trace-file-manager.d.ts.map +1 -0
- package/dist/utils/trace-file-manager.js +194 -0
- package/dist/utils/trace-file-manager.js.map +1 -0
- package/dist/utils/zod.d.ts +5 -0
- package/dist/utils/zod.d.ts.map +1 -0
- package/dist/utils/zod.js +80 -0
- package/dist/utils/zod.js.map +1 -0
- package/dist/utils.d.ts +8 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +13 -0
- package/dist/utils.js.map +1 -0
- package/dist/verification.d.ts +194 -0
- package/dist/verification.d.ts.map +1 -0
- package/dist/verification.js +530 -0
- package/dist/verification.js.map +1 -0
- package/dist/vision-executor.d.ts +18 -0
- package/dist/vision-executor.d.ts.map +1 -0
- package/dist/vision-executor.js +60 -0
- package/dist/vision-executor.js.map +1 -0
- package/dist/visual-agent.d.ts +120 -0
- package/dist/visual-agent.d.ts.map +1 -0
- package/dist/visual-agent.js +796 -0
- package/dist/visual-agent.js.map +1 -0
- package/dist/wait.d.ts +35 -0
- package/dist/wait.d.ts.map +1 -0
- package/dist/wait.js +76 -0
- package/dist/wait.js.map +1 -0
- package/package.json +94 -0
- package/spec/README.md +72 -0
- package/spec/SNAPSHOT_V1.md +208 -0
- package/spec/sdk-types.md +259 -0
- package/spec/snapshot.schema.json +148 -0
- package/src/extension/background.js +104 -0
- package/src/extension/content.js +162 -0
- package/src/extension/injected_api.js +1399 -0
- package/src/extension/manifest.json +36 -0
- package/src/extension/pkg/README.md +1340 -0
- package/src/extension/pkg/package.json +15 -0
- package/src/extension/pkg/sentience_core.d.ts +51 -0
- package/src/extension/pkg/sentience_core.js +371 -0
- package/src/extension/pkg/sentience_core_bg.wasm +0 -0
- package/src/extension/pkg/sentience_core_bg.wasm.d.ts +10 -0
- package/src/extension/release.json +116 -0
|
@@ -0,0 +1,796 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Visual Agent - Uses labeled screenshots with vision-capable LLMs
|
|
4
|
+
*
|
|
5
|
+
* This agent extends SentienceAgent to use visual prompts:
|
|
6
|
+
* 1. Takes snapshot with screenshot enabled
|
|
7
|
+
* 2. Draws bounding boxes and labels element IDs on the screenshot
|
|
8
|
+
* 3. Uses anti-collision algorithm to position labels (4 sides + 4 corners)
|
|
9
|
+
* 4. Sends labeled screenshot to vision-capable LLM
|
|
10
|
+
* 5. Extracts element ID from LLM response
|
|
11
|
+
* 6. Clicks the element using click()
|
|
12
|
+
*
|
|
13
|
+
* Dependencies:
|
|
14
|
+
* - sharp: Required for image processing
|
|
15
|
+
* Install with: npm install sharp
|
|
16
|
+
* - canvas: Required for drawing on images
|
|
17
|
+
* Install with: npm install canvas
|
|
18
|
+
* - Vision-capable LLM: Requires an LLM provider that supports vision (e.g., GPT-4o, Claude 3)
|
|
19
|
+
*/
|
|
20
|
+
/**
 * TypeScript interop helper: exposes property `k` of module `m` on object `o`
 * under the name `k2` (defaults to `k`).
 *
 * When property descriptors are supported, the binding is installed with
 * Object.defineProperty. A plain data property that is writable or configurable
 * (or an accessor on a non-ES module) is replaced by an enumerable getter so the
 * binding stays live; otherwise the source descriptor is reused as-is.
 * Without Object.create the value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy engines: copy the current value, no live binding.
        return function (o, m, k, k2) {
            var legacyName = k2 === undefined ? k : k2;
            o[legacyName] = m[k];
        };
    }
    return function (o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        var descriptor = Object.getOwnPropertyDescriptor(m, k);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            // Accessor property: only trusted as-is when it comes from an ES module.
            needsGetter = !m.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, exportName, descriptor);
    };
})();
|
|
31
|
+
/**
 * TypeScript interop helper: attaches `v` to `o` as its "default" export.
 *
 * Uses Object.defineProperty (enumerable, read-only) when Object.create is
 * available, and a plain assignment otherwise.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (o, v) {
            Object.defineProperty(o, "default", { enumerable: true, value: v });
        };
    }
    return function (o, v) {
        o["default"] = v;
    };
})();
|
|
36
|
+
/**
 * TypeScript interop helper backing `import * as ns from "..."`.
 *
 * ES modules (flagged with `__esModule`) are returned untouched. Anything else
 * is wrapped in a fresh namespace object: every own key except "default" is
 * re-exported through __createBinding, and the original module itself becomes
 * the namespace's "default" via __setModuleDefault.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use, then cached: native getOwnPropertyNames when present,
    // otherwise a for-in scan restricted to own properties.
    var listOwnKeys = null;
    var ownKeys = function (target) {
        if (listOwnKeys === null) {
            listOwnKeys = Object.getOwnPropertyNames || function (obj) {
                var names = [];
                for (var name in obj) {
                    if (Object.prototype.hasOwnProperty.call(obj, name)) {
                        names[names.length] = name;
                    }
                }
                return names;
            };
        }
        return listOwnKeys(target);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = ownKeys(mod);
            for (var index = 0; index < keys.length; index++) {
                var key = keys[index];
                if (key === "default") {
                    continue;
                }
                __createBinding(namespace, mod, key);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
53
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
54
|
+
exports.PredicateVisualAgent = exports.SentienceVisualAgent = void 0;
|
|
55
|
+
const snapshot_1 = require("./snapshot");
|
|
56
|
+
const crypto_1 = require("crypto");
|
|
57
|
+
const trace_event_builder_1 = require("./utils/trace-event-builder");
|
|
58
|
+
const snapshot_event_builder_1 = require("./utils/snapshot-event-builder");
|
|
59
|
+
const snapshot_processor_1 = require("./utils/snapshot-processor");
|
|
60
|
+
const actions_1 = require("./actions");
|
|
61
|
+
const agent_1 = require("./agent");
|
|
62
|
+
const path = __importStar(require("path"));
|
|
63
|
+
const fs = __importStar(require("fs"));
|
|
64
|
+
const uuid_1 = require("uuid");
|
|
65
|
+
// Check for required image processing libraries
|
|
66
|
+
/**
 * Runs `load` (a thunk around a static `require`) and reports the outcome.
 * Any failure is treated as "dependency missing": `missingWarning` is printed
 * and the module is reported as unavailable.
 *
 * @param load - Function that requires the optional module
 * @param missingWarning - Message printed with console.warn when loading fails
 * @returns `{ module, available }` - the loaded module (or undefined) and a success flag
 */
function probeOptionalDependency(load, missingWarning) {
    try {
        return { module: load(), available: true };
    }
    catch {
        console.warn(missingWarning);
        return { module: undefined, available: false };
    }
}
// sharp is probed first, then canvas, so warnings appear in that order.
// eslint-disable-next-line @typescript-eslint/no-require-imports
const sharpProbe = probeOptionalDependency(() => require('sharp'), '⚠️ Warning: sharp not available. Install with: npm install sharp');
// eslint-disable-next-line @typescript-eslint/no-require-imports
const canvasProbe = probeOptionalDependency(() => require('canvas'), '⚠️ Warning: canvas not available. Install with: npm install canvas');
let sharp = sharpProbe.module;
let canvas = canvasProbe.module;
let CANVAS_AVAILABLE = canvasProbe.available;
let SHARP_AVAILABLE = sharpProbe.available;
|
|
88
|
+
/**
|
|
89
|
+
* Visual agent that uses labeled screenshots with vision-capable LLMs.
|
|
90
|
+
*
|
|
91
|
+
* Extends SentienceAgent to override act() method with visual prompting.
|
|
92
|
+
*
|
|
93
|
+
* Requirements:
|
|
94
|
+
* - sharp: Required for image processing
|
|
95
|
+
* Install with: npm install sharp
|
|
96
|
+
* - canvas: Required for drawing on images
|
|
97
|
+
* Install with: npm install canvas
|
|
98
|
+
* - Vision-capable LLM: Requires an LLM provider that supports vision (e.g., GPT-4o, Claude 3)
|
|
99
|
+
*/
|
|
100
|
+
class SentienceVisualAgent extends agent_1.SentienceAgent {
|
|
101
|
+
/**
|
|
102
|
+
* Initialize Visual Agent
|
|
103
|
+
*
|
|
104
|
+
* @param browser - SentienceBrowser instance
|
|
105
|
+
* @param llm - LLM provider (must support vision, e.g., GPT-4o, Claude 3)
|
|
106
|
+
* @param snapshotLimit - Default maximum elements to include
|
|
107
|
+
* @param verbose - Print execution logs
|
|
108
|
+
* @param tracer - Optional Tracer instance
|
|
109
|
+
* @param showOverlay - Show green bbox overlay in browser
|
|
110
|
+
*/
|
|
111
|
+
constructor(browser, llm, snapshotLimit = 50, verbose = true, tracer, showOverlay = false) {
|
|
112
|
+
super(browser, llm, snapshotLimit, verbose, tracer, showOverlay);
|
|
113
|
+
if (!SHARP_AVAILABLE || !CANVAS_AVAILABLE) {
|
|
114
|
+
throw new Error('sharp and canvas are required for SentienceVisualAgent. ' +
|
|
115
|
+
'Install with: npm install sharp canvas');
|
|
116
|
+
}
|
|
117
|
+
// Track previous snapshot for diff computation (stored in base class)
|
|
118
|
+
}
|
|
119
|
+
/**
|
|
120
|
+
* Decode base64 screenshot data URL to image buffer
|
|
121
|
+
*
|
|
122
|
+
* @param screenshotDataUrl - Base64-encoded data URL (e.g., "data:image/png;base64,...")
|
|
123
|
+
* @returns Image buffer
|
|
124
|
+
*/
|
|
125
|
+
decodeScreenshot(screenshotDataUrl) {
|
|
126
|
+
// Extract base64 data from data URL
|
|
127
|
+
if (screenshotDataUrl.startsWith('data:image/')) {
|
|
128
|
+
// Format: "data:image/png;base64,<base64_data>"
|
|
129
|
+
const base64Data = screenshotDataUrl.split(',', 2)[1];
|
|
130
|
+
return Buffer.from(base64Data, 'base64');
|
|
131
|
+
}
|
|
132
|
+
else {
|
|
133
|
+
// Assume it's already base64
|
|
134
|
+
return Buffer.from(screenshotDataUrl, 'base64');
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
/**
|
|
138
|
+
* Find best position for label using anti-collision algorithm.
|
|
139
|
+
*
|
|
140
|
+
* Tries 8 positions: 4 sides (top, bottom, left, right) + 4 corners.
|
|
141
|
+
* Returns the first position that doesn't collide with existing labels.
|
|
142
|
+
*
|
|
143
|
+
* @param elementBbox - Element bounding box {x, y, width, height}
|
|
144
|
+
* @param existingLabels - List of existing label bounding boxes
|
|
145
|
+
* @param imageWidth - Image width in pixels
|
|
146
|
+
* @param imageHeight - Image height in pixels
|
|
147
|
+
* @param labelWidth - Label width in pixels
|
|
148
|
+
* @param labelHeight - Label height in pixels
|
|
149
|
+
* @returns (x, y) position for label
|
|
150
|
+
*/
|
|
151
|
+
findLabelPosition(elementBbox, existingLabels, imageWidth, imageHeight, labelWidth, labelHeight) {
|
|
152
|
+
const { x, y, width, height } = elementBbox;
|
|
153
|
+
// Offset from element edge
|
|
154
|
+
const labelOffset = 15; // Increased from 5px for better separation
|
|
155
|
+
// Try 8 positions: top, bottom, left, right, top-left, top-right, bottom-left, bottom-right
|
|
156
|
+
const positions = [
|
|
157
|
+
[Math.floor(x + width / 2 - labelWidth / 2), Math.floor(y - labelHeight - labelOffset)], // Top
|
|
158
|
+
[Math.floor(x + width / 2 - labelWidth / 2), Math.floor(y + height + labelOffset)], // Bottom
|
|
159
|
+
[Math.floor(x - labelWidth - labelOffset), Math.floor(y + height / 2 - labelHeight / 2)], // Left
|
|
160
|
+
[Math.floor(x + width + labelOffset), Math.floor(y + height / 2 - labelHeight / 2)], // Right
|
|
161
|
+
[Math.floor(x - labelWidth - labelOffset), Math.floor(y - labelHeight - labelOffset)], // Top-left
|
|
162
|
+
[Math.floor(x + width + labelOffset), Math.floor(y - labelHeight - labelOffset)], // Top-right
|
|
163
|
+
[Math.floor(x - labelWidth - labelOffset), Math.floor(y + height + labelOffset)], // Bottom-left
|
|
164
|
+
[Math.floor(x + width + labelOffset), Math.floor(y + height + labelOffset)], // Bottom-right
|
|
165
|
+
];
|
|
166
|
+
// Check each position for collisions
|
|
167
|
+
for (const [posX, posY] of positions) {
|
|
168
|
+
// Check bounds
|
|
169
|
+
if (posX < 0 ||
|
|
170
|
+
posY < 0 ||
|
|
171
|
+
posX + labelWidth > imageWidth ||
|
|
172
|
+
posY + labelHeight > imageHeight) {
|
|
173
|
+
continue;
|
|
174
|
+
}
|
|
175
|
+
// Check collision with existing labels
|
|
176
|
+
const labelBbox = {
|
|
177
|
+
x: posX,
|
|
178
|
+
y: posY,
|
|
179
|
+
width: labelWidth,
|
|
180
|
+
height: labelHeight,
|
|
181
|
+
};
|
|
182
|
+
let collision = false;
|
|
183
|
+
for (const existing of existingLabels) {
|
|
184
|
+
// Simple AABB collision detection
|
|
185
|
+
if (!(labelBbox.x + labelBbox.width < existing.x ||
|
|
186
|
+
labelBbox.x > existing.x + existing.width ||
|
|
187
|
+
labelBbox.y + labelBbox.height < existing.y ||
|
|
188
|
+
labelBbox.y > existing.y + existing.height)) {
|
|
189
|
+
collision = true;
|
|
190
|
+
break;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
if (!collision) {
|
|
194
|
+
return [posX, posY];
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
// If all positions collide, use top position with increased offset
|
|
198
|
+
return [
|
|
199
|
+
Math.floor(x + width / 2 - labelWidth / 2),
|
|
200
|
+
Math.floor(y - labelHeight - labelOffset * 2),
|
|
201
|
+
];
|
|
202
|
+
}
|
|
203
|
+
/**
|
|
204
|
+
* Draw labeled screenshot with bounding boxes and element IDs.
|
|
205
|
+
*
|
|
206
|
+
* @param snapshot - Snapshot with screenshot data
|
|
207
|
+
* @param elements - List of elements to label
|
|
208
|
+
* @returns Image buffer with labels drawn
|
|
209
|
+
*/
|
|
210
|
+
async drawLabeledScreenshot(snapshot, elements) {
|
|
211
|
+
if (!snapshot.screenshot) {
|
|
212
|
+
throw new Error('Screenshot not available in snapshot');
|
|
213
|
+
}
|
|
214
|
+
// Decode screenshot
|
|
215
|
+
const imageBuffer = this.decodeScreenshot(snapshot.screenshot);
|
|
216
|
+
if (!sharp) {
|
|
217
|
+
throw new Error('sharp is not available. Install with: npm install sharp');
|
|
218
|
+
}
|
|
219
|
+
const img = await sharp(imageBuffer);
|
|
220
|
+
const metadata = await img.metadata();
|
|
221
|
+
const imageWidth = metadata.width || 0;
|
|
222
|
+
const imageHeight = metadata.height || 0;
|
|
223
|
+
// Create canvas for drawing
|
|
224
|
+
if (!canvas) {
|
|
225
|
+
throw new Error('canvas is not available. Install with: npm install canvas');
|
|
226
|
+
}
|
|
227
|
+
const { createCanvas, loadImage } = canvas;
|
|
228
|
+
const canvasElement = createCanvas(imageWidth, imageHeight);
|
|
229
|
+
const ctx = canvasElement.getContext('2d');
|
|
230
|
+
// Draw original image on canvas
|
|
231
|
+
const image = await loadImage(imageBuffer);
|
|
232
|
+
ctx.drawImage(image, 0, 0);
|
|
233
|
+
// Load font (fallback to default if not available)
|
|
234
|
+
let font = '16px Arial';
|
|
235
|
+
try {
|
|
236
|
+
// Try to use system font
|
|
237
|
+
font = '16px Helvetica';
|
|
238
|
+
}
|
|
239
|
+
catch {
|
|
240
|
+
// Use default
|
|
241
|
+
font = '16px Arial';
|
|
242
|
+
}
|
|
243
|
+
const existingLabels = [];
|
|
244
|
+
// Neon green color: #39FF14 (bright, vibrant green)
|
|
245
|
+
const neonGreen = '#39FF14';
|
|
246
|
+
// Draw bounding boxes and labels for each element
|
|
247
|
+
for (const element of elements) {
|
|
248
|
+
const bbox = element.bbox;
|
|
249
|
+
const x = bbox.x;
|
|
250
|
+
const y = bbox.y;
|
|
251
|
+
const width = bbox.width;
|
|
252
|
+
const height = bbox.height;
|
|
253
|
+
// Draw bounding box rectangle (neon green with 2px width)
|
|
254
|
+
ctx.strokeStyle = neonGreen;
|
|
255
|
+
ctx.lineWidth = 2;
|
|
256
|
+
ctx.strokeRect(x, y, width, height);
|
|
257
|
+
// Prepare label text (just the number - keep it simple and compact)
|
|
258
|
+
const labelText = String(element.id);
|
|
259
|
+
// Measure label text size
|
|
260
|
+
ctx.font = font;
|
|
261
|
+
const textMetrics = ctx.measureText(labelText);
|
|
262
|
+
const labelWidth = textMetrics.width;
|
|
263
|
+
const labelHeight = 16; // Approximate height for 16px font
|
|
264
|
+
// Find best position for label (anti-collision)
|
|
265
|
+
const [labelX, labelY] = this.findLabelPosition({ x, y, width, height }, existingLabels, imageWidth, imageHeight, labelWidth + 8, // Add padding
|
|
266
|
+
labelHeight + 4 // Add padding
|
|
267
|
+
);
|
|
268
|
+
// Calculate connection points for a clearer visual link
|
|
269
|
+
const elementCenterX = x + width / 2;
|
|
270
|
+
const elementCenterY = y + height / 2;
|
|
271
|
+
const labelCenterX = labelX + labelWidth / 2;
|
|
272
|
+
const labelCenterY = labelY + labelHeight / 2;
|
|
273
|
+
// Determine which edge of the element is closest to the label
|
|
274
|
+
const distTop = Math.abs(labelCenterY - y);
|
|
275
|
+
const distBottom = Math.abs(labelCenterY - (y + height));
|
|
276
|
+
const distLeft = Math.abs(labelCenterX - x);
|
|
277
|
+
const distRight = Math.abs(labelCenterX - (x + width));
|
|
278
|
+
const minDist = Math.min(distTop, distBottom, distLeft, distRight);
|
|
279
|
+
let lineStart;
|
|
280
|
+
if (minDist === distTop) {
|
|
281
|
+
lineStart = [elementCenterX, y];
|
|
282
|
+
}
|
|
283
|
+
else if (minDist === distBottom) {
|
|
284
|
+
lineStart = [elementCenterX, y + height];
|
|
285
|
+
}
|
|
286
|
+
else if (minDist === distLeft) {
|
|
287
|
+
lineStart = [x, elementCenterY];
|
|
288
|
+
}
|
|
289
|
+
else {
|
|
290
|
+
lineStart = [x + width, elementCenterY];
|
|
291
|
+
}
|
|
292
|
+
// Draw connecting line from element edge to label
|
|
293
|
+
ctx.strokeStyle = neonGreen;
|
|
294
|
+
ctx.lineWidth = 2;
|
|
295
|
+
ctx.beginPath();
|
|
296
|
+
ctx.moveTo(lineStart[0], lineStart[1]);
|
|
297
|
+
ctx.lineTo(labelCenterX, labelCenterY);
|
|
298
|
+
ctx.stroke();
|
|
299
|
+
// Draw label background (white with neon green border)
|
|
300
|
+
const labelBgX1 = labelX - 4;
|
|
301
|
+
const labelBgY1 = labelY - 2;
|
|
302
|
+
const labelBgX2 = labelX + labelWidth + 4;
|
|
303
|
+
const labelBgY2 = labelY + labelHeight + 2;
|
|
304
|
+
// Draw white background
|
|
305
|
+
ctx.fillStyle = 'white';
|
|
306
|
+
ctx.fillRect(labelBgX1, labelBgY1, labelBgX2 - labelBgX1, labelBgY2 - labelBgY1);
|
|
307
|
+
// Draw neon green border
|
|
308
|
+
ctx.strokeStyle = neonGreen;
|
|
309
|
+
ctx.lineWidth = 2;
|
|
310
|
+
ctx.strokeRect(labelBgX1, labelBgY1, labelBgX2 - labelBgX1, labelBgY2 - labelBgY1);
|
|
311
|
+
// Draw label text (black for high contrast)
|
|
312
|
+
ctx.fillStyle = 'black';
|
|
313
|
+
ctx.font = font;
|
|
314
|
+
ctx.fillText(labelText, labelX, labelY + labelHeight);
|
|
315
|
+
// Record label position for collision detection
|
|
316
|
+
existingLabels.push({
|
|
317
|
+
x: labelBgX1,
|
|
318
|
+
y: labelBgY1,
|
|
319
|
+
width: labelBgX2 - labelBgX1,
|
|
320
|
+
height: labelBgY2 - labelBgY1,
|
|
321
|
+
});
|
|
322
|
+
}
|
|
323
|
+
// Convert canvas to buffer
|
|
324
|
+
return canvasElement.toBuffer('image/png');
|
|
325
|
+
}
|
|
326
|
+
/**
|
|
327
|
+
* Encode image buffer to base64 data URL with size optimization.
|
|
328
|
+
*
|
|
329
|
+
* Vision LLM APIs typically have size limits (e.g., 20MB for OpenAI).
|
|
330
|
+
* This function automatically compresses images if they're too large.
|
|
331
|
+
*
|
|
332
|
+
* @param imageBuffer - Image buffer
|
|
333
|
+
* @param format - Image format ('PNG' or 'JPEG')
|
|
334
|
+
* @param maxSizeMb - Maximum size in MB before compression (default: 20MB)
|
|
335
|
+
* @returns Base64-encoded data URL
|
|
336
|
+
*/
|
|
337
|
+
async encodeImageToBase64(imageBuffer, format = 'PNG', maxSizeMb = 20.0) {
|
|
338
|
+
if (!sharp) {
|
|
339
|
+
throw new Error('sharp is not available. Install with: npm install sharp');
|
|
340
|
+
}
|
|
341
|
+
let quality = 95; // Start with high quality
|
|
342
|
+
let outputBuffer = imageBuffer;
|
|
343
|
+
// Try to fit within size limit
|
|
344
|
+
for (let attempt = 0; attempt < 3; attempt++) {
|
|
345
|
+
if (format === 'JPEG') {
|
|
346
|
+
outputBuffer = await sharp(imageBuffer).jpeg({ quality, mozjpeg: true }).toBuffer();
|
|
347
|
+
}
|
|
348
|
+
else {
|
|
349
|
+
outputBuffer = await sharp(imageBuffer).png({ compressionLevel: 9 }).toBuffer();
|
|
350
|
+
}
|
|
351
|
+
const sizeMb = outputBuffer.length / (1024 * 1024);
|
|
352
|
+
if (sizeMb <= maxSizeMb) {
|
|
353
|
+
break;
|
|
354
|
+
}
|
|
355
|
+
// Reduce quality for next attempt
|
|
356
|
+
quality = Math.max(70, quality - 15);
|
|
357
|
+
if (this.verbose && attempt === 0) {
|
|
358
|
+
console.log(` ⚠️ Image size ${sizeMb.toFixed(2)}MB exceeds limit, compressing...`);
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
const finalSizeMb = outputBuffer.length / (1024 * 1024);
|
|
362
|
+
if (this.verbose) {
|
|
363
|
+
console.log(` 📸 Image encoded: ${finalSizeMb.toFixed(2)}MB (${outputBuffer.length} bytes)`);
|
|
364
|
+
}
|
|
365
|
+
const base64Data = outputBuffer.toString('base64');
|
|
366
|
+
const mimeType = format === 'PNG' ? 'image/png' : 'image/jpeg';
|
|
367
|
+
return `data:${mimeType};base64,${base64Data}`;
|
|
368
|
+
}
|
|
369
|
+
/**
|
|
370
|
+
* Query LLM with vision (labeled screenshot).
|
|
371
|
+
*
|
|
372
|
+
* @param imageDataUrl - Base64-encoded image data URL
|
|
373
|
+
* @param goal - User's goal/task
|
|
374
|
+
* @returns LLMResponse with element ID
|
|
375
|
+
*/
|
|
376
|
+
async queryLLMWithVision(imageDataUrl, goal) {
|
|
377
|
+
const systemPrompt = `You are a web automation assistant. You will see a screenshot of a web page with labeled element IDs.
|
|
378
|
+
Each clickable element has:
|
|
379
|
+
- A bright neon green (#39FF14) bounding box around the element
|
|
380
|
+
- A white label box with a number (the element ID) connected by a green line
|
|
381
|
+
- The label is clearly separate from the element (not part of the UI)
|
|
382
|
+
|
|
383
|
+
CRITICAL INSTRUCTIONS:
|
|
384
|
+
1. Look at the screenshot carefully
|
|
385
|
+
2. Find the element that matches the user's goal (ignore the white label boxes - they are annotations, not UI elements)
|
|
386
|
+
3. Follow the green line from that element to find its label box with the ID number
|
|
387
|
+
4. Respond with ONLY that integer ID number (e.g., "42" or "1567")
|
|
388
|
+
5. Do NOT include any explanation, reasoning, or other text
|
|
389
|
+
6. Do NOT say "element 1" or "the first element" - just return the number
|
|
390
|
+
7. Do NOT confuse the white label box with an interactive element - labels are annotations connected by green lines
|
|
391
|
+
|
|
392
|
+
Example responses:
|
|
393
|
+
- Correct: "42"
|
|
394
|
+
- Correct: "1567"
|
|
395
|
+
- Wrong: "I see element 42"
|
|
396
|
+
- Wrong: "The element ID is 42"
|
|
397
|
+
- Wrong: "42 (the search box)"`;
|
|
398
|
+
const userPrompt = `Goal: ${goal}
|
|
399
|
+
|
|
400
|
+
Look at the screenshot. Each element has a neon green bounding box with a white label showing its ID number.
|
|
401
|
+
Find the element that should be clicked to accomplish this goal.
|
|
402
|
+
Return ONLY the integer ID number from the label, nothing else.`;
|
|
403
|
+
// Check if LLM provider supports vision (OpenAI GPT-4o, Claude, etc.)
|
|
404
|
+
// For now, we'll use a fallback approach - try to pass image via the generate method
|
|
405
|
+
// Individual LLM providers should implement vision support in their generate methods
|
|
406
|
+
try {
|
|
407
|
+
// Try to use vision API if available
|
|
408
|
+
// This is a placeholder - actual implementation depends on LLM provider
|
|
409
|
+
const response = await this.llm.generate(systemPrompt, userPrompt, {
|
|
410
|
+
image: imageDataUrl,
|
|
411
|
+
temperature: 0.0,
|
|
412
|
+
});
|
|
413
|
+
return response;
|
|
414
|
+
}
|
|
415
|
+
catch {
|
|
416
|
+
// Fallback: Try to pass image via text description
|
|
417
|
+
const fallbackPrompt = `${userPrompt}\n\n[Image data: ${imageDataUrl.substring(0, 200)}...]`;
|
|
418
|
+
const fallbackResponse = await this.llm.generate(systemPrompt, fallbackPrompt, {
|
|
419
|
+
temperature: 0.0,
|
|
420
|
+
});
|
|
421
|
+
if (this.verbose) {
|
|
422
|
+
console.log(' ⚠️ Using fallback method (may not support vision)');
|
|
423
|
+
}
|
|
424
|
+
return fallbackResponse;
|
|
425
|
+
}
|
|
426
|
+
}
|
|
427
|
+
/**
|
|
428
|
+
* Extract element ID integer from LLM response.
|
|
429
|
+
*
|
|
430
|
+
* @param llmResponse - LLM response text
|
|
431
|
+
* @returns Element ID as integer, or undefined if not found
|
|
432
|
+
*/
|
|
433
|
+
extractElementId(llmResponse) {
|
|
434
|
+
if (this.verbose) {
|
|
435
|
+
console.log(`🔍 Raw LLM response: ${JSON.stringify(llmResponse)}`);
|
|
436
|
+
}
|
|
437
|
+
// Clean the response - remove leading/trailing whitespace
|
|
438
|
+
let cleaned = llmResponse.trim();
|
|
439
|
+
if (this.verbose) {
|
|
440
|
+
console.log(` 🧹 After strip: ${JSON.stringify(cleaned)}`);
|
|
441
|
+
}
|
|
442
|
+
// Remove common prefixes that LLMs might add
|
|
443
|
+
const prefixesToRemove = [
|
|
444
|
+
'element',
|
|
445
|
+
'id',
|
|
446
|
+
'the element',
|
|
447
|
+
'element id',
|
|
448
|
+
'the id',
|
|
449
|
+
'click',
|
|
450
|
+
'click on',
|
|
451
|
+
'select',
|
|
452
|
+
'choose',
|
|
453
|
+
];
|
|
454
|
+
for (const prefix of prefixesToRemove) {
|
|
455
|
+
if (cleaned.toLowerCase().startsWith(prefix)) {
|
|
456
|
+
cleaned = cleaned.substring(prefix.length).trim();
|
|
457
|
+
// Remove any remaining punctuation
|
|
458
|
+
cleaned = cleaned.replace(/^[:.,;!?()[\]{}]+/, '').trim();
|
|
459
|
+
if (this.verbose) {
|
|
460
|
+
console.log(` 🧹 After removing prefix '${prefix}': ${JSON.stringify(cleaned)}`);
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
// Try to find all integers in the cleaned response
|
|
465
|
+
const numbers = cleaned.match(/\d+/g);
|
|
466
|
+
if (this.verbose) {
|
|
467
|
+
console.log(` 🔢 Numbers found: ${numbers}`);
|
|
468
|
+
}
|
|
469
|
+
if (numbers && numbers.length > 0) {
|
|
470
|
+
// If multiple numbers found, prefer the largest one (likely the actual element ID)
|
|
471
|
+
// Element IDs are typically larger numbers, not small ones like "1"
|
|
472
|
+
try {
|
|
473
|
+
const intNumbers = numbers.map(n => parseInt(n, 10));
|
|
474
|
+
if (this.verbose) {
|
|
475
|
+
console.log(` 🔢 As integers: ${intNumbers}`);
|
|
476
|
+
}
|
|
477
|
+
// Prefer larger numbers (element IDs are usually > 10)
|
|
478
|
+
// But if only small numbers exist, use the first one
|
|
479
|
+
const largeNumbers = intNumbers.filter(n => n > 10);
|
|
480
|
+
let elementId;
|
|
481
|
+
if (largeNumbers.length > 0) {
|
|
482
|
+
elementId = Math.max(...largeNumbers); // Take the largest
|
|
483
|
+
if (this.verbose) {
|
|
484
|
+
console.log(` ✅ Selected largest number > 10: ${elementId}`);
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
else {
|
|
488
|
+
elementId = intNumbers[0]; // Fallback to first if all are small
|
|
489
|
+
if (this.verbose) {
|
|
490
|
+
console.log(` ⚠️ All numbers ≤ 10, using first: ${elementId}`);
|
|
491
|
+
}
|
|
492
|
+
}
|
|
493
|
+
if (this.verbose) {
|
|
494
|
+
console.log(`✅ Extracted element ID: ${elementId} (from ${numbers})`);
|
|
495
|
+
}
|
|
496
|
+
return elementId;
|
|
497
|
+
}
|
|
498
|
+
catch {
|
|
499
|
+
if (this.verbose) {
|
|
500
|
+
console.log(' ❌ Failed to convert numbers to integers');
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
if (this.verbose) {
|
|
505
|
+
console.log(`⚠️ Could not extract element ID from response: ${llmResponse}`);
|
|
506
|
+
}
|
|
507
|
+
return undefined;
|
|
508
|
+
}
|
|
509
|
+
/**
|
|
510
|
+
* Override act() method to use visual prompting with full tracing support.
|
|
511
|
+
*
|
|
512
|
+
* @param goal - User's goal/task
|
|
513
|
+
* @param maxRetries - Maximum retry attempts
|
|
514
|
+
* @param snapshotOptions - Optional snapshot options (screenshot will be enabled)
|
|
515
|
+
* @returns AgentActResult
|
|
516
|
+
*/
|
|
517
|
+
async act(goal, _maxRetries = 2, snapshotOptions) {
|
|
518
|
+
if (this.verbose) {
|
|
519
|
+
console.log('\n' + '='.repeat(70));
|
|
520
|
+
console.log(`🤖 Visual Agent Goal: ${goal}`);
|
|
521
|
+
console.log('='.repeat(70));
|
|
522
|
+
}
|
|
523
|
+
// Increment step counter and generate step ID
|
|
524
|
+
const stepCount = this.stepCount + 1;
|
|
525
|
+
this.stepCount = stepCount;
|
|
526
|
+
const stepId = (0, crypto_1.randomUUID)();
|
|
527
|
+
// Emit step_start event
|
|
528
|
+
const tracer = this.tracer;
|
|
529
|
+
if (tracer) {
|
|
530
|
+
const page = this.browser.getPage();
|
|
531
|
+
const currentUrl = page ? page.url() : 'unknown';
|
|
532
|
+
tracer.emitStepStart(stepId, stepCount, goal, 0, currentUrl);
|
|
533
|
+
}
|
|
534
|
+
const startTime = Date.now();
|
|
535
|
+
// Track data collected during step execution for step_end emission on failure
|
|
536
|
+
let stepSnapWithDiff = null;
|
|
537
|
+
let stepPreUrl = null;
|
|
538
|
+
let stepLlmResponse = null;
|
|
539
|
+
try {
|
|
540
|
+
// Ensure screenshot is enabled
|
|
541
|
+
const snapOpts = {
|
|
542
|
+
...snapshotOptions,
|
|
543
|
+
screenshot: snapshotOptions?.screenshot ?? true,
|
|
544
|
+
goal: snapshotOptions?.goal ?? goal,
|
|
545
|
+
limit: snapshotOptions?.limit || this.snapshotLimit,
|
|
546
|
+
};
|
|
547
|
+
if (this.verbose) {
|
|
548
|
+
console.log(`🎯 Goal: ${goal}`);
|
|
549
|
+
console.log('📸 Taking snapshot with screenshot...');
|
|
550
|
+
}
|
|
551
|
+
// 1. Take snapshot with screenshot
|
|
552
|
+
const snap = await (0, snapshot_1.snapshot)(this.browser, snapOpts);
|
|
553
|
+
if (snap.status !== 'success') {
|
|
554
|
+
throw new Error(`Snapshot failed: ${snap.error}`);
|
|
555
|
+
}
|
|
556
|
+
if (!snap.screenshot) {
|
|
557
|
+
throw new Error('Screenshot not available in snapshot');
|
|
558
|
+
}
|
|
559
|
+
// Process snapshot: compute diff status and filter elements
|
|
560
|
+
const processed = snapshot_processor_1.SnapshotProcessor.process(snap, this.previousSnapshot, goal, this.snapshotLimit);
|
|
561
|
+
// Update previous snapshot for next comparison
|
|
562
|
+
this.previousSnapshot = snap;
|
|
563
|
+
const snapWithDiff = processed.withDiff;
|
|
564
|
+
// Track for step_end emission on failure
|
|
565
|
+
stepSnapWithDiff = snapWithDiff;
|
|
566
|
+
stepPreUrl = snap.url;
|
|
567
|
+
// Emit snapshot event
|
|
568
|
+
if (tracer) {
|
|
569
|
+
const snapshotData = snapshot_event_builder_1.SnapshotEventBuilder.buildSnapshotEventData(snapWithDiff, stepId);
|
|
570
|
+
tracer.emit('snapshot', snapshotData, stepId);
|
|
571
|
+
}
|
|
572
|
+
if (this.verbose) {
|
|
573
|
+
console.log(`✅ Snapshot taken: ${snap.elements.length} elements`);
|
|
574
|
+
}
|
|
575
|
+
// 2. Draw labeled screenshot
|
|
576
|
+
if (this.verbose) {
|
|
577
|
+
console.log('🎨 Drawing bounding boxes and labels...');
|
|
578
|
+
console.log(` Elements to label: ${snap.elements.length}`);
|
|
579
|
+
if (snap.elements.length > 0) {
|
|
580
|
+
const elementIds = snap.elements.slice(0, 10).map(el => el.id); // Show first 10
|
|
581
|
+
console.log(` Sample element IDs: ${elementIds}`);
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
const labeledImageBuffer = await this.drawLabeledScreenshot(snap, snap.elements);
|
|
585
|
+
// Save labeled image to disk for debugging
|
|
586
|
+
try {
|
|
587
|
+
const cwd = process.cwd();
|
|
588
|
+
let playgroundPath;
|
|
589
|
+
// Check if current working directory contains playground
|
|
590
|
+
if (fs.existsSync(path.join(cwd, 'playground'))) {
|
|
591
|
+
playgroundPath = path.join(cwd, 'playground', 'images');
|
|
592
|
+
}
|
|
593
|
+
else {
|
|
594
|
+
// Check if we're in a playground context via module path
|
|
595
|
+
const modulePaths = require.resolve.paths('@predicatelabs/sdk') || [];
|
|
596
|
+
for (const modulePath of modulePaths) {
|
|
597
|
+
const potentialPlayground = path.join(modulePath, '..', 'playground', 'images');
|
|
598
|
+
if (fs.existsSync(path.dirname(potentialPlayground))) {
|
|
599
|
+
playgroundPath = potentialPlayground;
|
|
600
|
+
break;
|
|
601
|
+
}
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
if (!playgroundPath) {
|
|
605
|
+
// Fallback: use current working directory
|
|
606
|
+
playgroundPath = path.join(cwd, 'playground', 'images');
|
|
607
|
+
}
|
|
608
|
+
const imagesDir = playgroundPath;
|
|
609
|
+
if (!fs.existsSync(imagesDir)) {
|
|
610
|
+
fs.mkdirSync(imagesDir, { recursive: true });
|
|
611
|
+
}
|
|
612
|
+
const imageUuid = (0, uuid_1.v4)();
|
|
613
|
+
const imageFilename = `labeled_screenshot_${imageUuid}.png`;
|
|
614
|
+
const imagePath = path.join(imagesDir, imageFilename);
|
|
615
|
+
fs.writeFileSync(imagePath, labeledImageBuffer);
|
|
616
|
+
if (this.verbose) {
|
|
617
|
+
console.log(` 💾 Saved labeled screenshot: ${path.resolve(imagePath)}`);
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
catch (saveError) {
|
|
621
|
+
// Don't fail if image save fails - it's just for debugging
|
|
622
|
+
if (this.verbose) {
|
|
623
|
+
console.log(` ⚠️ Could not save labeled screenshot: ${saveError.message}`);
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
// Use JPEG for better compression (smaller file size for vision APIs)
|
|
627
|
+
const labeledImageDataUrl = await this.encodeImageToBase64(labeledImageBuffer, 'JPEG', 20.0);
|
|
628
|
+
// 3. Query LLM with vision
|
|
629
|
+
if (this.verbose) {
|
|
630
|
+
console.log('🧠 Querying LLM with labeled screenshot...');
|
|
631
|
+
}
|
|
632
|
+
const llmResponse = await this.queryLLMWithVision(labeledImageDataUrl, goal);
|
|
633
|
+
// Track for step_end emission on failure
|
|
634
|
+
stepLlmResponse = llmResponse;
|
|
635
|
+
// Emit LLM query event
|
|
636
|
+
if (tracer) {
|
|
637
|
+
tracer.emit('llm_query', {
|
|
638
|
+
prompt_tokens: llmResponse.promptTokens,
|
|
639
|
+
completion_tokens: llmResponse.completionTokens,
|
|
640
|
+
model: llmResponse.modelName,
|
|
641
|
+
response_text: llmResponse.content.substring(0, 200), // Truncate for brevity
|
|
642
|
+
}, stepId);
|
|
643
|
+
}
|
|
644
|
+
if (this.verbose) {
|
|
645
|
+
console.log(`💭 LLM Response: ${llmResponse.content}`);
|
|
646
|
+
}
|
|
647
|
+
// Track token usage
|
|
648
|
+
this.trackTokens(goal, llmResponse);
|
|
649
|
+
// 4. Extract element ID
|
|
650
|
+
const elementId = this.extractElementId(llmResponse.content);
|
|
651
|
+
if (elementId === undefined) {
|
|
652
|
+
throw new Error(`Could not extract element ID from LLM response: ${llmResponse.content}`);
|
|
653
|
+
}
|
|
654
|
+
if (this.verbose) {
|
|
655
|
+
console.log(`🎯 Extracted Element ID: ${elementId}`);
|
|
656
|
+
}
|
|
657
|
+
// 5. Click the element
|
|
658
|
+
if (this.verbose) {
|
|
659
|
+
console.log(`🖱️ Clicking element ${elementId}...`);
|
|
660
|
+
}
|
|
661
|
+
const clickResult = await (0, actions_1.click)(this.browser, elementId);
|
|
662
|
+
const durationMs = Date.now() - startTime;
|
|
663
|
+
// Create AgentActResult from click result
|
|
664
|
+
const result = {
|
|
665
|
+
success: clickResult.success,
|
|
666
|
+
action: 'click',
|
|
667
|
+
goal,
|
|
668
|
+
durationMs,
|
|
669
|
+
attempt: 0,
|
|
670
|
+
elementId,
|
|
671
|
+
outcome: clickResult.outcome,
|
|
672
|
+
urlChanged: clickResult.url_changed || false,
|
|
673
|
+
error: clickResult.error?.reason,
|
|
674
|
+
};
|
|
675
|
+
// Emit action execution event
|
|
676
|
+
if (tracer) {
|
|
677
|
+
const page = this.browser.getPage();
|
|
678
|
+
const postUrl = page ? page.url() : null;
|
|
679
|
+
// Include element data for live overlay visualization
|
|
680
|
+
const elementsData = snap.elements.slice(0, 50).map(el => ({
|
|
681
|
+
id: el.id,
|
|
682
|
+
bbox: {
|
|
683
|
+
x: el.bbox.x,
|
|
684
|
+
y: el.bbox.y,
|
|
685
|
+
width: el.bbox.width,
|
|
686
|
+
height: el.bbox.height,
|
|
687
|
+
},
|
|
688
|
+
role: el.role,
|
|
689
|
+
text: el.text ? el.text.substring(0, 50) : '',
|
|
690
|
+
}));
|
|
691
|
+
tracer.emit('action', {
|
|
692
|
+
action_type: result.action,
|
|
693
|
+
action: result.action,
|
|
694
|
+
element_id: result.elementId,
|
|
695
|
+
success: result.success,
|
|
696
|
+
// Note: duration_ms and other custom fields are not in TraceEventData type
|
|
697
|
+
// but are accepted at runtime for custom visualization
|
|
698
|
+
post_url: postUrl,
|
|
699
|
+
elements: elementsData, // Add element data for overlay
|
|
700
|
+
target_element_id: result.elementId, // Highlight target in red
|
|
701
|
+
}, // Type assertion needed for custom visualization fields
|
|
702
|
+
stepId);
|
|
703
|
+
}
|
|
704
|
+
// Record history
|
|
705
|
+
const history = this.history;
|
|
706
|
+
history.push({
|
|
707
|
+
goal,
|
|
708
|
+
action: `CLICK(${elementId})`,
|
|
709
|
+
result,
|
|
710
|
+
success: result.success,
|
|
711
|
+
attempt: 0,
|
|
712
|
+
durationMs,
|
|
713
|
+
});
|
|
714
|
+
if (this.verbose) {
|
|
715
|
+
const status = result.success ? '✅' : '❌';
|
|
716
|
+
console.log(`${status} Completed in ${durationMs}ms`);
|
|
717
|
+
}
|
|
718
|
+
// Emit step completion event
|
|
719
|
+
if (tracer) {
|
|
720
|
+
const preUrl = snap.url;
|
|
721
|
+
const page = this.browser.getPage();
|
|
722
|
+
const postUrl = page ? page.url() || null : null;
|
|
723
|
+
let postSnapshotDigest;
|
|
724
|
+
try {
|
|
725
|
+
const postSnap = await (0, snapshot_1.snapshot)(this.browser, {
|
|
726
|
+
goal: `${goal} (post)`,
|
|
727
|
+
limit: Math.min(this.snapshotLimit, 10),
|
|
728
|
+
show_overlay: this.showOverlay,
|
|
729
|
+
});
|
|
730
|
+
if (postSnap.status === 'success') {
|
|
731
|
+
postSnapshotDigest = trace_event_builder_1.TraceEventBuilder.buildSnapshotDigest(postSnap);
|
|
732
|
+
}
|
|
733
|
+
}
|
|
734
|
+
catch {
|
|
735
|
+
postSnapshotDigest = undefined;
|
|
736
|
+
}
|
|
737
|
+
// Build complete step_end event
|
|
738
|
+
// Note: snapshotDigest, llmResponseText, execData, and verifyData are computed
|
|
739
|
+
// inside TraceEventBuilder.buildStepEndData, so we don't need them here
|
|
740
|
+
// Build complete step_end event
|
|
741
|
+
const stepEndData = trace_event_builder_1.TraceEventBuilder.buildStepEndData({
|
|
742
|
+
stepId,
|
|
743
|
+
stepIndex: stepCount,
|
|
744
|
+
goal,
|
|
745
|
+
attempt: 0,
|
|
746
|
+
preUrl,
|
|
747
|
+
postUrl: postUrl || preUrl,
|
|
748
|
+
postSnapshotDigest,
|
|
749
|
+
snapshot: snapWithDiff,
|
|
750
|
+
llmResponse,
|
|
751
|
+
result,
|
|
752
|
+
});
|
|
753
|
+
tracer.emit('step_end', stepEndData, stepId);
|
|
754
|
+
}
|
|
755
|
+
return result;
|
|
756
|
+
}
|
|
757
|
+
catch (error) {
|
|
758
|
+
// Emit error event
|
|
759
|
+
if (tracer) {
|
|
760
|
+
tracer.emitError(stepId, error.message, 0);
|
|
761
|
+
}
|
|
762
|
+
// Emit step_end with whatever data we collected before failure
|
|
763
|
+
// This ensures diff_status and other fields are preserved in traces
|
|
764
|
+
if (tracer && stepSnapWithDiff) {
|
|
765
|
+
const page = this.browser.getPage();
|
|
766
|
+
const postUrl = page ? page.url() || null : null;
|
|
767
|
+
const durationMs = Date.now() - startTime;
|
|
768
|
+
const stepEndData = trace_event_builder_1.TraceEventBuilder.buildPartialStepEndData({
|
|
769
|
+
stepId,
|
|
770
|
+
stepIndex: stepCount,
|
|
771
|
+
goal,
|
|
772
|
+
attempt: 0,
|
|
773
|
+
preUrl: stepPreUrl,
|
|
774
|
+
postUrl,
|
|
775
|
+
snapshot: stepSnapWithDiff,
|
|
776
|
+
llmResponse: stepLlmResponse,
|
|
777
|
+
error: error.message,
|
|
778
|
+
durationMs,
|
|
779
|
+
});
|
|
780
|
+
tracer.emit('step_end', stepEndData, stepId);
|
|
781
|
+
}
|
|
782
|
+
if (this.verbose) {
|
|
783
|
+
console.log(`❌ Error: ${error.message}`);
|
|
784
|
+
}
|
|
785
|
+
// Re-raise the exception
|
|
786
|
+
throw error;
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
// Public export of the visual agent class.
exports.SentienceVisualAgent = SentienceVisualAgent;
/**
 * Predicate-branded name for the same class.
 * Both export names refer to one class object, so code written against either
 * name keeps working.
 */
exports.PredicateVisualAgent = exports.SentienceVisualAgent;
|
|
796
|
+
//# sourceMappingURL=visual-agent.js.map
|