@projectservan8n/cnapse 0.8.2 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ProviderSelector-GZYF26LL.js +7 -0
- package/dist/autonomous-VGEVIXXQ.js +419 -0
- package/dist/browser-YLFWQXIY.js +87 -0
- package/dist/{chunk-OPX7FFL6.js → chunk-7SDY7OPA.js} +14 -55
- package/dist/chunk-COKO6V5J.js +50 -0
- package/dist/chunk-GP73OJCZ.js +377 -0
- package/dist/chunk-MOKGR7WE.js +344 -0
- package/dist/chunk-OIVTPXE4.js +307 -0
- package/dist/chunk-TFHK5CYF.js +650 -0
- package/dist/chunk-WSBJFRQH.js +366 -0
- package/dist/index.js +579 -1733
- package/dist/learner-KH3TFTD7.js +14 -0
- package/dist/vision-S57PWSCU.js +19 -0
- package/package.json +1 -2
- package/src/agents/autonomous.ts +515 -0
- package/src/agents/learner.ts +489 -0
- package/src/lib/tasks.ts +217 -153
- package/src/lib/vision.ts +139 -0
- package/src/services/browser.ts +336 -584
- package/src/services/screen-monitor.ts +288 -0
- package/src/services/telegram.ts +312 -5
- package/src/tools/computer.ts +226 -0
- package/dist/ProviderSelector-MXRZFAOB.js +0 -6
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
import {
|
|
2
|
+
getApiKey,
|
|
3
|
+
getConfig
|
|
4
|
+
} from "./chunk-COKO6V5J.js";
|
|
5
|
+
|
|
6
|
+
// src/lib/vision.ts

/**
 * Capture the current screen and produce an AI-generated description of it.
 * @returns {Promise<{description: string, screenshot: string}>} the description
 *   plus the base64 PNG it was derived from
 * @throws {Error} when no screenshot could be captured
 */
async function describeScreen() {
  const screenshot = await captureScreenshot();
  if (!screenshot) {
    throw new Error("Failed to capture screenshot");
  }
  const { provider } = getConfig();
  const description = await analyzeWithVision(screenshot, provider);
  return { description, screenshot };
}
|
|
16
|
+
/**
 * Grab a full-screen screenshot as a base64-encoded PNG.
 * Tries the `screenshot-desktop` package first and falls back to
 * platform-native CLI tools when it is unavailable or fails.
 * @returns {Promise<string|null>} base64 PNG data, or null when every strategy failed
 */
async function captureScreenshot() {
  try {
    const { default: capture } = await import("screenshot-desktop");
    const pngBuffer = await capture({ format: "png" });
    return pngBuffer.toString("base64");
  } catch {
    // screenshot-desktop missing or broken — try OS-level tools instead.
    return captureScreenFallback();
  }
}
|
|
25
|
+
/**
 * Fallback screenshot capture using platform-native CLI tools
 * (PowerShell on Windows, `screencapture` on macOS, gnome-screenshot /
 * scrot / ImageMagick `import` on Linux).
 * Writes a PNG to a temp file, reads it back, and best-effort deletes it.
 * @returns {Promise<string|null>} base64 PNG data, or null on any failure
 *   (best-effort by design — callers treat null as "no screenshot")
 */
async function captureScreenFallback() {
  const { exec } = await import("child_process");
  const { promisify } = await import("util");
  const { tmpdir } = await import("os");
  const { join } = await import("path");
  const { readFile, unlink } = await import("fs/promises");
  const execAsync = promisify(exec);
  const tempFile = join(tmpdir(), `cnapse-screen-${Date.now()}.png`);
  try {
    const platform = process.platform;
    if (platform === "win32") {
      // FIX: load System.Drawing explicitly. The script uses
      // System.Drawing.Bitmap/Graphics but previously only added
      // System.Windows.Forms, relying on a transitive assembly load
      // that is not guaranteed across PowerShell versions.
      await execAsync(`
        Add-Type -AssemblyName System.Windows.Forms
        Add-Type -AssemblyName System.Drawing
        $screen = [System.Windows.Forms.Screen]::PrimaryScreen.Bounds
        $bitmap = New-Object System.Drawing.Bitmap($screen.Width, $screen.Height)
        $graphics = [System.Drawing.Graphics]::FromImage($bitmap)
        $graphics.CopyFromScreen($screen.Location, [System.Drawing.Point]::Empty, $screen.Size)
        $bitmap.Save("${tempFile.replace(/\\/g, "\\\\")}")
        $graphics.Dispose()
        $bitmap.Dispose()
      `, { shell: "powershell.exe" });
    } else if (platform === "darwin") {
      await execAsync(`screencapture -x "${tempFile}"`);
    } else {
      // Try each common Linux screenshot tool until one succeeds.
      await execAsync(`gnome-screenshot -f "${tempFile}" 2>/dev/null || scrot "${tempFile}" 2>/dev/null || import -window root "${tempFile}"`);
    }
    const imageBuffer = await readFile(tempFile);
    // Cleanup failure is non-fatal; the screenshot was already read.
    await unlink(tempFile).catch(() => {
    });
    return imageBuffer.toString("base64");
  } catch {
    // Deliberate best-effort: any capture failure yields null, not a throw.
    return null;
  }
}
|
|
59
|
+
/**
 * Describe a screenshot using the configured provider's vision model.
 * @param {string} base64Image - base64-encoded PNG screenshot
 * @param {string} provider - "ollama" | "openrouter" | "anthropic" | "openai"
 * @returns {Promise<string>} natural-language description of the screen
 * @throws {Error} for providers without vision support
 */
async function analyzeWithVision(base64Image, provider) {
  const prompt = `Look at this screenshot and describe:
1. What application or window is visible
2. Key UI elements you can see (buttons, text fields, menus)
3. What the user appears to be doing or could do next
4. Any notable content or state

Be concise but helpful.`;
  // Delegate to the shared dispatcher so the provider switch lives in
  // exactly one place (previously duplicated in analyzeWithVisionCustom).
  return analyzeWithVisionCustom(base64Image, provider, prompt);
}
|
|
80
|
+
/**
 * Run a vision prompt through a local Ollama instance.
 * Uses the configured model only when it matches a known vision-capable
 * model, otherwise falls back to "llava".
 * @param {string} base64Image - base64-encoded PNG screenshot
 * @param {string} prompt - instruction for the vision model
 * @returns {Promise<string>} model output text
 * @throws {Error} when the Ollama request fails
 */
async function analyzeWithOllama(base64Image, prompt) {
  const config = getConfig();
  const host = config.ollamaHost || "http://localhost:11434";
  const knownVisionModels = ["llava", "llama3.2-vision", "bakllava", "llava-llama3"];
  const model = knownVisionModels.find((name) => config.model.includes(name)) || "llava";
  const payload = { model, prompt, images: [base64Image], stream: false };
  const response = await fetch(`${host}/api/generate`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload)
  });
  if (!response.ok) {
    const text = await response.text();
    throw new Error(`Ollama vision error: ${text}`);
  }
  const data = await response.json();
  return data.response || "Unable to analyze image";
}
|
|
102
|
+
/**
 * Run a vision prompt through OpenRouter (Claude 3.5 Sonnet).
 * @param {string} base64Image - base64-encoded PNG screenshot
 * @param {string} prompt - instruction for the vision model
 * @returns {Promise<string>} model output text
 * @throws {Error} when no API key is configured or the request fails
 */
async function analyzeWithOpenRouter(base64Image, prompt) {
  const apiKey = getApiKey("openrouter");
  if (!apiKey) throw new Error("OpenRouter API key not configured");
  const requestBody = {
    model: "anthropic/claude-3-5-sonnet",
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          {
            type: "image_url",
            image_url: { url: `data:image/png;base64,${base64Image}` }
          }
        ]
      }
    ],
    max_tokens: 1000
  };
  const response = await fetch("https://openrouter.ai/api/v1/chat/completions", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${apiKey}`,
      "Content-Type": "application/json",
      "HTTP-Referer": "https://c-napse.up.railway.app",
      "X-Title": "C-napse"
    },
    body: JSON.stringify(requestBody)
  });
  if (!response.ok) {
    const text = await response.text();
    throw new Error(`OpenRouter vision error: ${text}`);
  }
  const data = await response.json();
  return data.choices?.[0]?.message?.content || "Unable to analyze image";
}
|
|
138
|
+
/**
 * Run a vision prompt through the Anthropic Messages API
 * (Claude 3.5 Sonnet), sending the image as inline base64 PNG.
 * @param {string} base64Image - base64-encoded PNG screenshot
 * @param {string} prompt - instruction for the vision model
 * @returns {Promise<string>} model output text
 * @throws {Error} when no API key is configured or the request fails
 */
async function analyzeWithAnthropic(base64Image, prompt) {
  const apiKey = getApiKey("anthropic");
  if (!apiKey) throw new Error("Anthropic API key not configured");
  const imageBlock = {
    type: "image",
    source: {
      type: "base64",
      media_type: "image/png",
      data: base64Image
    }
  };
  const response = await fetch("https://api.anthropic.com/v1/messages", {
    method: "POST",
    headers: {
      "x-api-key": apiKey,
      "anthropic-version": "2023-06-01",
      "Content-Type": "application/json"
    },
    body: JSON.stringify({
      model: "claude-3-5-sonnet-20241022",
      max_tokens: 1000,
      messages: [
        {
          role: "user",
          content: [imageBlock, { type: "text", text: prompt }]
        }
      ]
    })
  });
  if (!response.ok) {
    const text = await response.text();
    throw new Error(`Anthropic vision error: ${text}`);
  }
  const data = await response.json();
  return data.content?.[0]?.text || "Unable to analyze image";
}
|
|
176
|
+
/**
 * Run a vision prompt through the OpenAI Chat Completions API.
 * @param {string} base64Image - base64-encoded PNG screenshot
 * @param {string} prompt - instruction for the vision model
 * @returns {Promise<string>} model output text
 * @throws {Error} when no API key is configured or the request fails
 */
async function analyzeWithOpenAI(base64Image, prompt) {
  const apiKey = getApiKey("openai");
  if (!apiKey) throw new Error("OpenAI API key not configured");
  const response = await fetch("https://api.openai.com/v1/chat/completions", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${apiKey}`,
      "Content-Type": "application/json"
    },
    body: JSON.stringify({
      // FIX: "gpt-4-vision-preview" has been retired by OpenAI and now
      // returns an error; gpt-4o is the current vision-capable chat model
      // and accepts the same image_url content format.
      model: "gpt-4o",
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: prompt },
            {
              type: "image_url",
              image_url: { url: `data:image/png;base64,${base64Image}` }
            }
          ]
        }
      ],
      max_tokens: 1000
    })
  });
  if (!response.ok) {
    const text = await response.text();
    throw new Error(`OpenAI vision error: ${text}`);
  }
  const data = await response.json();
  return data.choices?.[0]?.message?.content || "Unable to analyze image";
}
|
|
209
|
+
/**
 * Ask the vision model to locate a UI element in a screenshot and return
 * its estimated center coordinates.
 * @param {string} screenshot - base64-encoded PNG screenshot
 * @param {string} description - natural-language description of the element
 * @returns {Promise<{x: number, y: number}|null>} coordinates, or null when
 *   the element was not found or the vision call failed (best-effort)
 */
async function findElementCoordinates(screenshot, description) {
  const config = getConfig();
  const prompt = `Look at this screenshot carefully. Find the UI element described as: "${description}"

Your task is to estimate the CENTER coordinates (x, y) of this element.

IMPORTANT:
- Assume the screen is approximately 1920x1080 pixels (adjust if you see indicators of different resolution)
- Give coordinates as integers
- If the element is clearly visible, give your best estimate
- If you absolutely cannot find it, respond with NOT_FOUND

Respond in EXACTLY this format (numbers only, no units):
X: <number>
Y: <number>

Or if not found:
NOT_FOUND`;
  try {
    const response = await analyzeWithVisionCustom(screenshot, config.provider, prompt);
    const xMatch = response.match(/X:\s*(\d+)/i);
    const yMatch = response.match(/Y:\s*(\d+)/i);
    if (xMatch && yMatch) {
      return {
        // FIX: always pass an explicit radix to parseInt.
        x: Number.parseInt(xMatch[1], 10),
        y: Number.parseInt(yMatch[1], 10)
      };
    }
    // Model answered NOT_FOUND or in an unexpected format.
    return null;
  } catch {
    // Best-effort lookup: provider failures are treated as "not found".
    return null;
  }
}
|
|
242
|
+
/**
 * Dispatch a custom vision prompt to the handler for the given provider.
 * @param {string} base64Image - base64-encoded PNG screenshot
 * @param {string} provider - "ollama" | "openrouter" | "anthropic" | "openai"
 * @param {string} prompt - instruction for the vision model
 * @returns {Promise<string>} model output text
 * @throws {Error} for providers without vision support
 */
async function analyzeWithVisionCustom(base64Image, provider, prompt) {
  const handlers = new Map([
    ["ollama", analyzeWithOllama],
    ["openrouter", analyzeWithOpenRouter],
    ["anthropic", analyzeWithAnthropic],
    ["openai", analyzeWithOpenAI]
  ]);
  const handler = handlers.get(provider);
  if (!handler) {
    throw new Error(`Vision not supported for provider: ${provider}`);
  }
  return handler(base64Image, prompt);
}
|
|
256
|
+
/**
 * Cheap fingerprint of a base64 screenshot: sample every 1000th character,
 * then fold the samples through a 31-style string hash kept in 32-bit range.
 * @param {string} base64Screenshot - base64-encoded PNG data
 * @returns {string} hash as a hex string (may be negative, e.g. "-1a2b")
 */
function getScreenHash(base64Screenshot) {
  // Sampling keeps hashing O(len/1000) on multi-megabyte screenshots.
  let sampled = "";
  for (let pos = 0; pos < base64Screenshot.length; pos += 1000) {
    sampled += base64Screenshot[pos];
  }
  let acc = 0;
  for (let idx = 0; idx < sampled.length; idx++) {
    acc = (acc << 5) - acc + sampled.charCodeAt(idx);
    acc |= 0; // clamp to a 32-bit integer each step
  }
  return acc.toString(16);
}

/**
 * Heuristic change detector for two base64 screenshots.
 * Compares lengths first, then sampled hashes; inputs that collide on both
 * checks are reported as unchanged (acceptable for a cheap heuristic).
 * @param {string|null} screenshotA - previous frame (base64), or null
 * @param {string|null} screenshotB - current frame (base64), or null
 * @returns {boolean} true when the screens are considered different
 */
function screensChanged(screenshotA, screenshotB) {
  // A missing frame on either side always counts as a change.
  if (!screenshotA || !screenshotB) return true;
  if (screenshotA.length !== screenshotB.length) return true;
  return getScreenHash(screenshotA) !== getScreenHash(screenshotB);
}
|
|
276
|
+
/**
 * Ask the vision model a question about a specific rectangular region of
 * a screenshot. The region is described in the prompt only — the image is
 * not cropped before being sent.
 * @param {string} screenshot - base64-encoded PNG screenshot
 * @param {{x: number, y: number, width: number, height: number}} region
 * @param {string} question - what to ask about that region
 * @returns {Promise<string>} model answer text
 */
async function analyzeScreenRegion(screenshot, region, question) {
  const { provider } = getConfig();
  const prompt = `Look at this screenshot. Focus on the region approximately at:
- Position: (${region.x}, ${region.y})
- Size: ${region.width}x${region.height} pixels

Question: ${question}

Be specific and concise in your answer.`;
  return analyzeWithVisionCustom(screenshot, provider, prompt);
}
|
|
287
|
+
// Module-level cache of the most recent screen description, so rapid
// successive calls do not re-run the (slow, paid) vision pipeline.
var lastDescription = null;
var DESCRIPTION_CACHE_MS = 2e3;

/**
 * Return a description of the current screen, reusing the cached result
 * when it is fresher than DESCRIPTION_CACHE_MS.
 * @returns {Promise<string>} description text
 */
async function getCurrentDescription() {
  const now = Date.now();
  const cached = lastDescription;
  if (cached && now - cached.timestamp < DESCRIPTION_CACHE_MS) {
    return cached.text;
  }
  const { description } = await describeScreen();
  lastDescription = { text: description, timestamp: now };
  return description;
}
|
|
298
|
+
|
|
299
|
+
// Public surface of the vision module; the analyzeWith* provider helpers
// stay internal.
export {
  describeScreen,
  captureScreenshot,
  findElementCoordinates,
  getScreenHash,
  screensChanged,
  analyzeScreenRegion,
  getCurrentDescription
};
|