@projectservan8n/cnapse 0.8.2 → 0.10.0

@@ -0,0 +1,307 @@
+ import {
+   getApiKey,
+   getConfig
+ } from "./chunk-COKO6V5J.js";
+
+ // src/lib/vision.ts
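+ // Take a screenshot and have the configured vision provider describe it;
+ // returns both the text description and the base64 screenshot.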
+ async function describeScreen() {
+   const screenshot = await captureScreenshot();
+   if (!screenshot) {
+     throw new Error("Failed to capture screenshot");
+   }
+   const config = getConfig();
+   const description = await analyzeWithVision(screenshot, config.provider);
+   return { description, screenshot };
+ }
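+ // Try the screenshot-desktop package first; it is imported dynamically, so a
+ // missing or failing install simply drops through to the OS-level fallback.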
+ async function captureScreenshot() {
+   try {
+     const screenshotDesktop = await import("screenshot-desktop");
+     const buffer = await screenshotDesktop.default({ format: "png" });
+     return buffer.toString("base64");
+   } catch {
+     return captureScreenFallback();
+   }
+ }
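+ // Fallback capture: shell out to platform tools (PowerShell + System.Drawing on
+ // Windows, screencapture on macOS, gnome-screenshot/scrot/ImageMagick on Linux)
+ // and round-trip the image through a temp file.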
+ async function captureScreenFallback() {
+   const { exec } = await import("child_process");
+   const { promisify } = await import("util");
+   const { tmpdir } = await import("os");
+   const { join } = await import("path");
+   const { readFile, unlink } = await import("fs/promises");
+   const execAsync = promisify(exec);
+   const tempFile = join(tmpdir(), `cnapse-screen-${Date.now()}.png`);
+   try {
+     const platform = process.platform;
+     if (platform === "win32") {
+       await execAsync(`
+         Add-Type -AssemblyName System.Windows.Forms
+         $screen = [System.Windows.Forms.Screen]::PrimaryScreen.Bounds
+         $bitmap = New-Object System.Drawing.Bitmap($screen.Width, $screen.Height)
+         $graphics = [System.Drawing.Graphics]::FromImage($bitmap)
+         $graphics.CopyFromScreen($screen.Location, [System.Drawing.Point]::Empty, $screen.Size)
+         $bitmap.Save("${tempFile.replace(/\\/g, "\\\\")}")
+         $graphics.Dispose()
+         $bitmap.Dispose()
+       `, { shell: "powershell.exe" });
+     } else if (platform === "darwin") {
+       await execAsync(`screencapture -x "${tempFile}"`);
+     } else {
+       await execAsync(`gnome-screenshot -f "${tempFile}" 2>/dev/null || scrot "${tempFile}" 2>/dev/null || import -window root "${tempFile}"`);
+     }
+     const imageBuffer = await readFile(tempFile);
+     await unlink(tempFile).catch(() => {});
+     return imageBuffer.toString("base64");
+   } catch {
+     return null;
+   }
+ }
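+ // Dispatch the screenshot to the configured provider with a generic
+ // "describe the screen" prompt.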
+ async function analyzeWithVision(base64Image, provider) {
+   const prompt = `Look at this screenshot and describe:
+ 1. What application or window is visible
+ 2. Key UI elements you can see (buttons, text fields, menus)
+ 3. What the user appears to be doing or could do next
+ 4. Any notable content or state
+
+ Be concise but helpful.`;
+   switch (provider) {
+     case "ollama":
+       return analyzeWithOllama(base64Image, prompt);
+     case "openrouter":
+       return analyzeWithOpenRouter(base64Image, prompt);
+     case "anthropic":
+       return analyzeWithAnthropic(base64Image, prompt);
+     case "openai":
+       return analyzeWithOpenAI(base64Image, prompt);
+     default:
+       throw new Error(`Vision not supported for provider: ${provider}`);
+   }
+ }
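+ // Local inference via Ollama's /api/generate endpoint; picks the first known
+ // vision model matching the configured model name, defaulting to llava.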
+ async function analyzeWithOllama(base64Image, prompt) {
+   const config = getConfig();
+   const ollamaHost = config.ollamaHost || "http://localhost:11434";
+   const visionModels = ["llava", "llama3.2-vision", "bakllava", "llava-llama3"];
+   const model = visionModels.find((m) => config.model.includes(m)) || "llava";
+   const response = await fetch(`${ollamaHost}/api/generate`, {
+     method: "POST",
+     headers: { "Content-Type": "application/json" },
+     body: JSON.stringify({
+       model,
+       prompt,
+       images: [base64Image],
+       stream: false
+     })
+   });
+   if (!response.ok) {
+     const text = await response.text();
+     throw new Error(`Ollama vision error: ${text}`);
+   }
+   const data = await response.json();
+   return data.response || "Unable to analyze image";
+ }
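+ // OpenRouter: OpenAI-compatible chat completions, with the screenshot inlined
+ // as a base64 data URL.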
+ async function analyzeWithOpenRouter(base64Image, prompt) {
+   const apiKey = getApiKey("openrouter");
+   if (!apiKey) throw new Error("OpenRouter API key not configured");
+   const model = "anthropic/claude-3-5-sonnet";
+   const response = await fetch("https://openrouter.ai/api/v1/chat/completions", {
+     method: "POST",
+     headers: {
+       "Authorization": `Bearer ${apiKey}`,
+       "Content-Type": "application/json",
+       "HTTP-Referer": "https://c-napse.up.railway.app",
+       "X-Title": "C-napse"
+     },
+     body: JSON.stringify({
+       model,
+       messages: [
+         {
+           role: "user",
+           content: [
+             { type: "text", text: prompt },
+             {
+               type: "image_url",
+               image_url: { url: `data:image/png;base64,${base64Image}` }
+             }
+           ]
+         }
+       ],
+       max_tokens: 1e3
+     })
+   });
+   if (!response.ok) {
+     const text = await response.text();
+     throw new Error(`OpenRouter vision error: ${text}`);
+   }
+   const data = await response.json();
+   return data.choices?.[0]?.message?.content || "Unable to analyze image";
+ }
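+ // Anthropic Messages API: the image is sent as a base64 content block rather
+ // than a data URL.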
+ async function analyzeWithAnthropic(base64Image, prompt) {
+   const apiKey = getApiKey("anthropic");
+   if (!apiKey) throw new Error("Anthropic API key not configured");
+   const response = await fetch("https://api.anthropic.com/v1/messages", {
+     method: "POST",
+     headers: {
+       "x-api-key": apiKey,
+       "anthropic-version": "2023-06-01",
+       "Content-Type": "application/json"
+     },
+     body: JSON.stringify({
+       model: "claude-3-5-sonnet-20241022",
+       max_tokens: 1e3,
+       messages: [
+         {
+           role: "user",
+           content: [
+             {
+               type: "image",
+               source: {
+                 type: "base64",
+                 media_type: "image/png",
+                 data: base64Image
+               }
+             },
+             { type: "text", text: prompt }
+           ]
+         }
+       ]
+     })
+   });
+   if (!response.ok) {
+     const text = await response.text();
+     throw new Error(`Anthropic vision error: ${text}`);
+   }
+   const data = await response.json();
+   return data.content?.[0]?.text || "Unable to analyze image";
+ }
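+ // OpenAI chat completions with image_url content; the model is hard-coded to
+ // gpt-4-vision-preview.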
+ async function analyzeWithOpenAI(base64Image, prompt) {
+   const apiKey = getApiKey("openai");
+   if (!apiKey) throw new Error("OpenAI API key not configured");
+   const response = await fetch("https://api.openai.com/v1/chat/completions", {
+     method: "POST",
+     headers: {
+       "Authorization": `Bearer ${apiKey}`,
+       "Content-Type": "application/json"
+     },
+     body: JSON.stringify({
+       model: "gpt-4-vision-preview",
+       messages: [
+         {
+           role: "user",
+           content: [
+             { type: "text", text: prompt },
+             {
+               type: "image_url",
+               image_url: { url: `data:image/png;base64,${base64Image}` }
+             }
+           ]
+         }
+       ],
+       max_tokens: 1e3
+     })
+   });
+   if (!response.ok) {
+     const text = await response.text();
+     throw new Error(`OpenAI vision error: ${text}`);
+   }
+   const data = await response.json();
+   return data.choices?.[0]?.message?.content || "Unable to analyze image";
+ }
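+ // Ask the vision model to locate a UI element, then parse "X: <n>" / "Y: <n>"
+ // out of its reply; returns null when parsing fails or the model says NOT_FOUND.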
+ async function findElementCoordinates(screenshot, description) {
+   const config = getConfig();
+   const prompt = `Look at this screenshot carefully. Find the UI element described as: "${description}"
+
+ Your task is to estimate the CENTER coordinates (x, y) of this element.
+
+ IMPORTANT:
+ - Assume the screen is approximately 1920x1080 pixels (adjust if you see indicators of different resolution)
+ - Give coordinates as integers
+ - If the element is clearly visible, give your best estimate
+ - If you absolutely cannot find it, respond with NOT_FOUND
+
+ Respond in EXACTLY this format (numbers only, no units):
+ X: <number>
+ Y: <number>
+
+ Or if not found:
+ NOT_FOUND`;
+   try {
+     const response = await analyzeWithVisionCustom(screenshot, config.provider, prompt);
+     const xMatch = response.match(/X:\s*(\d+)/i);
+     const yMatch = response.match(/Y:\s*(\d+)/i);
+     if (xMatch && yMatch) {
+       return {
+         x: parseInt(xMatch[1]),
+         y: parseInt(yMatch[1])
+       };
+     }
+     return null;
+   } catch (error) {
+     return null;
+   }
+ }
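+ // Same provider dispatch as analyzeWithVision, but with a caller-supplied prompt.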
+ async function analyzeWithVisionCustom(base64Image, provider, prompt) {
+   switch (provider) {
+     case "ollama":
+       return analyzeWithOllama(base64Image, prompt);
+     case "openrouter":
+       return analyzeWithOpenRouter(base64Image, prompt);
+     case "anthropic":
+       return analyzeWithAnthropic(base64Image, prompt);
+     case "openai":
+       return analyzeWithOpenAI(base64Image, prompt);
+     default:
+       throw new Error(`Vision not supported for provider: ${provider}`);
+   }
+ }
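+ // Cheap change-detection hash: sample every 1000th character of the base64
+ // string, then fold the sample with a 32-bit shift-add hash. Fast, but
+ // collisions are possible.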
+ function getScreenHash(base64Screenshot) {
+   let sample = "";
+   for (let i = 0; i < base64Screenshot.length; i += 1e3) {
+     sample += base64Screenshot[i];
+   }
+   let hash = 0;
+   for (let i = 0; i < sample.length; i++) {
+     const char = sample.charCodeAt(i);
+     hash = (hash << 5) - hash + char;
+     hash = hash & hash;
+   }
+   return hash.toString(16);
+ }
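+ // Screens count as changed when either input is missing, lengths differ, or
+ // the sampled hashes differ.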
+ function screensChanged(screenshotA, screenshotB) {
+   if (!screenshotA || !screenshotB) return true;
+   if (screenshotA.length !== screenshotB.length) return true;
+   const hashA = getScreenHash(screenshotA);
+   const hashB = getScreenHash(screenshotB);
+   return hashA !== hashB;
+ }
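+ // Ask a question about one region of the screen; the region is only described
+ // in the prompt, the image itself is not cropped.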
+ async function analyzeScreenRegion(screenshot, region, question) {
+   const config = getConfig();
+   const prompt = `Look at this screenshot. Focus on the region approximately at:
+ - Position: (${region.x}, ${region.y})
+ - Size: ${region.width}x${region.height} pixels
+
+ Question: ${question}
+
+ Be specific and concise in your answer.`;
+   return analyzeWithVisionCustom(screenshot, config.provider, prompt);
+ }
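+ // Cache the latest description for 2 seconds so rapid repeat calls skip a
+ // fresh capture-and-analyze round trip.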
+ var lastDescription = null;
+ var DESCRIPTION_CACHE_MS = 2e3;
+ async function getCurrentDescription() {
+   const now = Date.now();
+   if (lastDescription && now - lastDescription.timestamp < DESCRIPTION_CACHE_MS) {
+     return lastDescription.text;
+   }
+   const result = await describeScreen();
+   lastDescription = { text: result.description, timestamp: now };
+   return result.description;
+ }
+
+ export {
+   describeScreen,
+   captureScreenshot,
+   findElementCoordinates,
+   getScreenHash,
+   screensChanged,
+   analyzeScreenRegion,
+   getCurrentDescription
+ };