@projectservan8n/cnapse 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ProviderSelector-GZYF26LL.js +7 -0
- package/dist/autonomous-VGEVIXXQ.js +419 -0
- package/dist/browser-YLFWQXIY.js +87 -0
- package/dist/{chunk-OPX7FFL6.js → chunk-7SDY7OPA.js} +14 -55
- package/dist/chunk-COKO6V5J.js +50 -0
- package/dist/chunk-GP73OJCZ.js +377 -0
- package/dist/chunk-MOKGR7WE.js +344 -0
- package/dist/chunk-OIVTPXE4.js +307 -0
- package/dist/chunk-TFHK5CYF.js +650 -0
- package/dist/chunk-WSBJFRQH.js +366 -0
- package/dist/index.js +495 -1391
- package/dist/learner-KH3TFTD7.js +14 -0
- package/dist/vision-S57PWSCU.js +19 -0
- package/package.json +1 -1
- package/src/agents/autonomous.ts +515 -0
- package/src/agents/learner.ts +489 -0
- package/src/lib/tasks.ts +179 -54
- package/src/lib/vision.ts +139 -0
- package/src/services/screen-monitor.ts +288 -0
- package/src/services/telegram.ts +312 -5
- package/src/tools/computer.ts +226 -0
- package/dist/ProviderSelector-MXRZFAOB.js +0 -6
package/src/lib/tasks.ts
CHANGED
|
@@ -860,40 +860,74 @@ Be specific about locations (top-left, center, etc.) and what each element does.
|
|
|
860
860
|
}
|
|
861
861
|
|
|
862
862
|
case 'adaptive_do': {
|
|
863
|
-
//
|
|
863
|
+
// Enhanced adaptive agent with learning, more attempts, and verification
|
|
864
864
|
const goal = params;
|
|
865
|
-
const maxAttempts = 5
|
|
865
|
+
const maxAttempts = 25; // Increased from 5
|
|
866
866
|
const actionHistory: string[] = [];
|
|
867
867
|
let accomplished = false;
|
|
868
|
+
let stuckCount = 0;
|
|
869
|
+
const stuckThreshold = 3;
|
|
870
|
+
let lastScreenHash = '';
|
|
871
|
+
|
|
872
|
+
// Import learner for self-learning capabilities
|
|
873
|
+
const { getLearner } = await import('../agents/learner.js');
|
|
874
|
+
const learner = getLearner();
|
|
875
|
+
await learner.load();
|
|
876
|
+
|
|
877
|
+
// Check if we've solved something similar before
|
|
878
|
+
const initialScreen = await describeScreen();
|
|
879
|
+
const remembered = await learner.recall(goal, initialScreen.description);
|
|
880
|
+
if (remembered && remembered.successCount > remembered.failCount) {
|
|
881
|
+
actionHistory.push(`📚 Found remembered solution from ${remembered.source}`);
|
|
882
|
+
}
|
|
868
883
|
|
|
869
884
|
for (let attempt = 0; attempt < maxAttempts && !accomplished; attempt++) {
|
|
870
885
|
// Take screenshot and analyze current state using vision
|
|
871
886
|
const currentScreen = await describeScreen();
|
|
887
|
+
const currentHash = currentScreen.screenshot.slice(0, 1000);
|
|
888
|
+
|
|
889
|
+
// Check if screen changed
|
|
890
|
+
const screenChanged = currentHash !== lastScreenHash;
|
|
891
|
+
if (!screenChanged && attempt > 0) {
|
|
892
|
+
stuckCount++;
|
|
893
|
+
} else {
|
|
894
|
+
stuckCount = Math.max(0, stuckCount - 1);
|
|
895
|
+
}
|
|
896
|
+
lastScreenHash = currentHash;
|
|
872
897
|
|
|
873
|
-
// Ask our AI what to do next
|
|
898
|
+
// Ask our AI what to do next (enhanced prompt)
|
|
874
899
|
const nextAction = await chat([{
|
|
875
900
|
role: 'user',
|
|
876
901
|
content: `GOAL: ${goal}
|
|
877
902
|
|
|
878
903
|
CURRENT SCREEN: ${currentScreen.description}
|
|
879
904
|
|
|
880
|
-
PREVIOUS ACTIONS
|
|
881
|
-
${actionHistory.
|
|
905
|
+
PREVIOUS ACTIONS:
|
|
906
|
+
${actionHistory.slice(-5).join('\n') || 'None yet'}
|
|
907
|
+
|
|
908
|
+
ATTEMPT: ${attempt + 1}/${maxAttempts}
|
|
909
|
+
STUCK COUNT: ${stuckCount} (will ask for help at ${stuckThreshold})
|
|
882
910
|
|
|
883
911
|
Based on what you see, what's the SINGLE next action to take?
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
-
|
|
887
|
-
-
|
|
888
|
-
-
|
|
889
|
-
-
|
|
912
|
+
|
|
913
|
+
Available actions:
|
|
914
|
+
- click: Click at current mouse position
|
|
915
|
+
- clickAt: Click at coordinates (VALUE: x,y)
|
|
916
|
+
- moveTo: Move mouse to coordinates (VALUE: x,y)
|
|
917
|
+
- type: Type text (VALUE: text to type)
|
|
918
|
+
- press: Press a key (VALUE: Enter, Tab, Escape, etc.)
|
|
919
|
+
- keyCombo: Key combination (VALUE: command+s, control+c, etc.)
|
|
920
|
+
- scroll: Scroll (VALUE: up or down)
|
|
921
|
+
- navigate: Open URL (VALUE: full URL)
|
|
922
|
+
- wait: Wait for something to load (VALUE: seconds)
|
|
923
|
+
- findClick: Find element and click it (VALUE: description of element)
|
|
890
924
|
- done: Goal is accomplished
|
|
891
925
|
- stuck: Can't figure out what to do
|
|
892
926
|
|
|
893
|
-
Respond in format:
|
|
927
|
+
Respond EXACTLY in this format:
|
|
894
928
|
ACTION: <action_type>
|
|
895
|
-
VALUE: <
|
|
896
|
-
REASONING: <why>`
|
|
929
|
+
VALUE: <parameter>
|
|
930
|
+
REASONING: <brief why>`
|
|
897
931
|
}]);
|
|
898
932
|
|
|
899
933
|
const actionContent = nextAction.content;
|
|
@@ -903,7 +937,7 @@ REASONING: <why>`
|
|
|
903
937
|
const valueMatch = actionContent.match(/VALUE:\s*(.+?)(?:\n|$)/i);
|
|
904
938
|
|
|
905
939
|
if (!actionMatch) {
|
|
906
|
-
actionHistory.push(`
|
|
940
|
+
actionHistory.push(`[${attempt + 1}] ⚠️ Couldn't parse action`);
|
|
907
941
|
continue;
|
|
908
942
|
}
|
|
909
943
|
|
|
@@ -912,58 +946,149 @@ REASONING: <why>`
|
|
|
912
946
|
|
|
913
947
|
if (action === 'done') {
|
|
914
948
|
accomplished = true;
|
|
915
|
-
actionHistory.push(`
|
|
949
|
+
actionHistory.push(`[${attempt + 1}] ✅ Goal accomplished!`);
|
|
950
|
+
|
|
951
|
+
// Learn from success
|
|
952
|
+
if (actionHistory.length > 1) {
|
|
953
|
+
const lastSuccessfulAction = actionHistory[actionHistory.length - 2];
|
|
954
|
+
const actionParts = lastSuccessfulAction.match(/→ (\w+)(?:\s*"(.+)")?/);
|
|
955
|
+
if (actionParts) {
|
|
956
|
+
await learner.learn(
|
|
957
|
+
currentScreen.description.slice(0, 300),
|
|
958
|
+
goal,
|
|
959
|
+
actionParts[1],
|
|
960
|
+
actionParts[2] || '',
|
|
961
|
+
'self'
|
|
962
|
+
);
|
|
963
|
+
}
|
|
964
|
+
}
|
|
916
965
|
break;
|
|
917
966
|
}
|
|
918
967
|
|
|
919
|
-
if (action === 'stuck') {
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
const
|
|
924
|
-
|
|
925
|
-
|
|
968
|
+
if (action === 'stuck' || stuckCount >= stuckThreshold) {
|
|
969
|
+
actionHistory.push(`[${attempt + 1}] 🆘 Asking for help...`);
|
|
970
|
+
|
|
971
|
+
// Get help from multiple sources
|
|
972
|
+
const suggestions = await learner.getHelp(
|
|
973
|
+
goal,
|
|
974
|
+
currentScreen.description,
|
|
975
|
+
actionHistory.slice(-3)
|
|
976
|
+
);
|
|
977
|
+
|
|
978
|
+
if (suggestions.length > 0) {
|
|
979
|
+
const suggestion = suggestions[0];
|
|
980
|
+
actionHistory.push(`💡 Got suggestion from ${suggestion.source}: ${suggestion.value.slice(0, 100)}`);
|
|
981
|
+
|
|
982
|
+
// Try to parse and execute the suggestion
|
|
983
|
+
if (suggestion.action && suggestion.action !== 'suggested') {
|
|
984
|
+
try {
|
|
985
|
+
await executeAdaptiveAction(suggestion.action, suggestion.value);
|
|
986
|
+
actionHistory.push(`[${attempt + 1}] → ${suggestion.action} "${suggestion.value.slice(0, 30)}"`);
|
|
987
|
+
|
|
988
|
+
// Learn from successful suggestion
|
|
989
|
+
await learner.learn(
|
|
990
|
+
currentScreen.description.slice(0, 300),
|
|
991
|
+
goal,
|
|
992
|
+
suggestion.action,
|
|
993
|
+
suggestion.value,
|
|
994
|
+
suggestion.source
|
|
995
|
+
);
|
|
996
|
+
stuckCount = 0;
|
|
997
|
+
} catch (e) {
|
|
998
|
+
actionHistory.push(`[${attempt + 1}] ❌ Suggestion failed`);
|
|
999
|
+
}
|
|
1000
|
+
}
|
|
1001
|
+
} else {
|
|
1002
|
+
actionHistory.push(`[${attempt + 1}] 😕 No helpful suggestions found`);
|
|
1003
|
+
}
|
|
926
1004
|
continue;
|
|
927
1005
|
}
|
|
928
1006
|
|
|
929
|
-
// Execute the action
|
|
1007
|
+
// Execute the action
|
|
930
1008
|
try {
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
await computer.clickMouse('left');
|
|
934
|
-
actionHistory.push(`Attempt ${attempt + 1}: Clicked`);
|
|
935
|
-
break;
|
|
936
|
-
case 'type':
|
|
937
|
-
if (value) {
|
|
938
|
-
await computer.typeText(value);
|
|
939
|
-
}
|
|
940
|
-
actionHistory.push(`Attempt ${attempt + 1}: Typed "${value}"`);
|
|
941
|
-
break;
|
|
942
|
-
case 'press':
|
|
943
|
-
await computer.pressKey(value || 'Return');
|
|
944
|
-
actionHistory.push(`Attempt ${attempt + 1}: Pressed ${value || 'Enter'}`);
|
|
945
|
-
break;
|
|
946
|
-
case 'scroll':
|
|
947
|
-
await browser.scroll(value.toLowerCase().includes('up') ? 'up' : 'down');
|
|
948
|
-
actionHistory.push(`Attempt ${attempt + 1}: Scrolled ${value || 'down'}`);
|
|
949
|
-
break;
|
|
950
|
-
case 'navigate':
|
|
951
|
-
const url = value.startsWith('http') ? value : `https://${value}`;
|
|
952
|
-
await browser.openUrl(url);
|
|
953
|
-
actionHistory.push(`Attempt ${attempt + 1}: Opened ${url}`);
|
|
954
|
-
break;
|
|
955
|
-
default:
|
|
956
|
-
actionHistory.push(`Attempt ${attempt + 1}: Unknown action ${action}`);
|
|
957
|
-
}
|
|
1009
|
+
await executeAdaptiveAction(action, value);
|
|
1010
|
+
actionHistory.push(`[${attempt + 1}] → ${action}${value ? ` "${value.slice(0, 40)}"` : ''}`);
|
|
958
1011
|
} catch (e) {
|
|
959
|
-
actionHistory.push(`
|
|
1012
|
+
actionHistory.push(`[${attempt + 1}] ❌ ${action} failed - ${e}`);
|
|
1013
|
+
await learner.recordFailure(goal, action, value);
|
|
960
1014
|
}
|
|
961
1015
|
|
|
962
|
-
|
|
1016
|
+
// Human-like delay between actions (1-2 seconds)
|
|
1017
|
+
await sleep(1000 + Math.random() * 1000);
|
|
963
1018
|
}
|
|
964
1019
|
|
|
965
|
-
step.result = `🎯 Adaptive Agent Result:\n\nGoal: ${goal}\nAccomplished: ${accomplished ? 'Yes ✅' : 'Partial/No ❌'}\n\nAction Log:\n${actionHistory.join('\n')}`;
|
|
1020
|
+
step.result = `🎯 Adaptive Agent Result:\n\nGoal: ${goal}\nAccomplished: ${accomplished ? 'Yes ✅' : 'Partial/No ❌'}\nAttempts: ${Math.min(actionHistory.length, maxAttempts)}/${maxAttempts}\n\nAction Log:\n${actionHistory.join('\n')}`;
|
|
966
1021
|
break;
|
|
1022
|
+
|
|
1023
|
+
// Helper function for executing actions
/**
 * Executes a single adaptive-agent action through the `computer` / `browser`
 * helpers.
 *
 * Action names are matched case-insensitively ("clickAt" and "clickat" are
 * the same action). Actions that need a parameter throw when the parameter
 * is missing or malformed instead of silently doing nothing, so the caller's
 * try/catch can log the step as failed rather than as a success.
 *
 * @param action Action name: click, clickAt, moveTo, type, press, keyCombo,
 *               scroll, navigate, wait or findClick.
 * @param value  Action parameter (coordinates "x,y", text, key name, URL,
 *               seconds, or element description depending on the action).
 * @throws Error when the action is unknown, when a required value is missing
 *               or malformed, or when `computer.findAndClick` is unavailable.
 */
async function executeAdaptiveAction(action: string, value: string): Promise<void> {
  // Defensive: the value is extracted from model output upstream; treat a
  // missing value like an empty one.
  const rawValue = value ?? '';

  // Returns the value, or throws when an action that needs one got none.
  const requireValue = (what: string): string => {
    if (rawValue.trim() === '') {
      throw new Error(`${action} requires a value (${what})`);
    }
    return rawValue;
  };

  // Parses "x,y" into integer coordinates; throws on anything else so that a
  // malformed target is reported as a failed action.
  const parsePoint = (): [number, number] => {
    const [px, py] = rawValue.split(',').map(part => Number.parseInt(part.trim(), 10));
    if (px === undefined || py === undefined || Number.isNaN(px) || Number.isNaN(py)) {
      throw new Error(`Invalid coordinates for ${action}: "${rawValue}" (expected "x,y")`);
    }
    return [px, py];
  };

  switch (action.toLowerCase()) {
    case 'click':
      await computer.clickMouse('left');
      break;
    case 'clickat': {
      const [x, y] = parsePoint();
      await computer.moveMouse(x, y);
      // Short pause between moving and clicking.
      await sleep(100);
      await computer.clickMouse('left');
      break;
    }
    case 'moveto': {
      const [mx, my] = parsePoint();
      await computer.moveMouse(mx, my);
      break;
    }
    case 'type': {
      const text = requireValue('text to type');
      // Use human-like typing if available
      if (computer.typeTextHuman) {
        await computer.typeTextHuman(text, 50);
      } else {
        await computer.typeText(text);
      }
      break;
    }
    case 'press':
      // No key given: default to Return.
      await computer.pressKey(rawValue || 'Return');
      break;
    case 'keycombo': {
      const keys = requireValue('key combination such as control+c')
        .split('+')
        .map(k => k.trim().toLowerCase());
      await computer.keyCombo(keys);
      break;
    }
    case 'scroll':
      // Anything that does not mention "up" scrolls down.
      await browser.scroll(rawValue.toLowerCase().includes('up') ? 'up' : 'down');
      break;
    case 'navigate': {
      const target = requireValue('URL').trim();
      // Only prepend a scheme when none is present; a plain "http" prefix
      // check would mis-handle hosts like "httpbin.org" or "HTTP://...".
      const navUrl = /^https?:\/\//i.test(target) ? target : `https://${target}`;
      await browser.openUrl(navUrl);
      await sleep(2000); // Wait for page load
      break;
    }
    case 'wait': {
      // Unparsable or zero durations fall back to 2 seconds.
      const seconds = parseFloat(rawValue) || 2;
      await sleep(seconds * 1000);
      break;
    }
    case 'findclick': {
      const description = requireValue('description of element');
      if (computer.findAndClick) {
        await computer.findAndClick(description);
      } else {
        throw new Error('findAndClick not available');
      }
      break;
    }
    default:
      throw new Error(`Unknown action: ${action}`);
  }
}
|
|
967
1092
|
}
|
|
968
1093
|
|
|
969
1094
|
case 'chat':
|
package/src/lib/vision.ts
CHANGED
|
@@ -252,3 +252,142 @@ async function analyzeWithOpenAI(base64Image: string, prompt: string): Promise<s
|
|
|
252
252
|
const data = await response.json() as { choices: Array<{ message: { content: string } }> };
|
|
253
253
|
return data.choices?.[0]?.message?.content || 'Unable to analyze image';
|
|
254
254
|
}
|
|
255
|
+
|
|
256
|
+
/**
 * Find element coordinates on screen by description.
 *
 * Sends the screenshot and a fixed prompt to the configured vision provider
 * and parses an "X: <n>" / "Y: <n>" pair out of the reply. The result is the
 * model's estimate of the element's center, not a measured position.
 *
 * @param screenshot  Screenshot passed through to the vision provider
 *                    (presumably base64-encoded — confirm against callers).
 * @param description Natural-language description of the UI element.
 * @returns The estimated center coordinates, or `null` when the model answers
 *          NOT_FOUND, the reply cannot be parsed, or the provider call fails.
 */
export async function findElementCoordinates(
  screenshot: string,
  description: string
): Promise<{ x: number; y: number } | null> {
  const config = getConfig();

  const prompt = `Look at this screenshot carefully. Find the UI element described as: "${description}"

Your task is to estimate the CENTER coordinates (x, y) of this element.

IMPORTANT:
- Assume the screen is approximately 1920x1080 pixels (adjust if you see indicators of different resolution)
- Give coordinates as integers
- If the element is clearly visible, give your best estimate
- If you absolutely cannot find it, respond with NOT_FOUND

Respond in EXACTLY this format (numbers only, no units):
X: <number>
Y: <number>

Or if not found:
NOT_FOUND`;

  try {
    const response = await analyzeWithVisionCustom(screenshot, config.provider, prompt);

    // An explicit NOT_FOUND line wins over any numbers that may appear in
    // accompanying explanation text.
    if (/^\s*NOT_FOUND\s*$/im.test(response)) {
      return null;
    }

    // Word boundaries keep "X:" / "Y:" from matching inside other words
    // (e.g. "Max: 300" or "Display: 2").
    const xMatch = response.match(/\bX:\s*(\d+)/i);
    const yMatch = response.match(/\bY:\s*(\d+)/i);

    if (xMatch && yMatch) {
      return {
        x: Number.parseInt(xMatch[1], 10),
        y: Number.parseInt(yMatch[1], 10),
      };
    }

    return null;
  } catch {
    // Best effort by contract: any provider/network failure is reported to
    // the caller as "not found".
    return null;
  }
}
|
|
302
|
+
|
|
303
|
+
/**
 * Internal helper: routes a custom-prompt vision request to the analyzer
 * that matches the given provider name.
 *
 * @param base64Image Image payload forwarded unchanged to the analyzer.
 * @param provider    Provider name: 'ollama', 'openrouter', 'anthropic' or 'openai'.
 * @param prompt      Prompt forwarded unchanged to the analyzer.
 * @returns The analyzer's text reply.
 * @throws Error when the provider is not one of the supported names.
 */
async function analyzeWithVisionCustom(base64Image: string, provider: string, prompt: string): Promise<string> {
  if (provider === 'ollama') {
    return analyzeWithOllama(base64Image, prompt);
  }
  if (provider === 'openrouter') {
    return analyzeWithOpenRouter(base64Image, prompt);
  }
  if (provider === 'anthropic') {
    return analyzeWithAnthropic(base64Image, prompt);
  }
  if (provider === 'openai') {
    return analyzeWithOpenAI(base64Image, prompt);
  }

  // Anything else has no vision backend.
  throw new Error(`Vision not supported for provider: ${provider}`);
}
|
|
320
|
+
|
|
321
|
+
/**
 * Cheap fingerprint of a screenshot string for change detection.
 *
 * Only every 1000th character (indices 0, 1000, 2000, ...) contributes, so
 * this is a fast heuristic rather than a content hash: edits that fall
 * between sampled positions are not detected.
 *
 * @param base64Screenshot Screenshot string to fingerprint.
 * @returns The signed 32-bit rolling hash (hash * 31 + charCode) of the
 *          sampled characters, formatted in base 16; "0" for an empty string.
 */
export function getScreenHash(base64Screenshot: string): string {
  let fingerprint = 0;

  // Sample and fold in a single pass; Math.imul plus `| 0` keeps the
  // accumulator wrapped to a signed 32-bit integer at every step.
  for (let pos = 0; pos < base64Screenshot.length; pos += 1000) {
    fingerprint = (Math.imul(fingerprint, 31) + base64Screenshot.charCodeAt(pos)) | 0;
  }

  return fingerprint.toString(16);
}
|
|
341
|
+
|
|
342
|
+
/**
 * Decide whether two screenshots differ.
 *
 * A missing/empty screenshot or a length mismatch counts as "changed"
 * without hashing; otherwise the sampled hashes from getScreenHash are
 * compared.
 *
 * @param screenshotA First screenshot string.
 * @param screenshotB Second screenshot string.
 * @returns `true` when the screenshots are considered different.
 */
export function screensChanged(screenshotA: string, screenshotB: string): boolean {
  // Hashes are only worth comparing when both inputs exist and have equal length.
  const comparable =
    Boolean(screenshotA) &&
    Boolean(screenshotB) &&
    screenshotA.length === screenshotB.length;

  if (!comparable) {
    return true;
  }

  return getScreenHash(screenshotA) !== getScreenHash(screenshotB);
}
|
|
355
|
+
|
|
356
|
+
/**
 * Ask the configured vision provider a question about one region of a
 * screenshot.
 *
 * The full screenshot is sent; the region is only described in the prompt
 * text (no cropping happens here).
 *
 * @param screenshot Screenshot forwarded to the vision provider.
 * @param region     Position and size of the area the model should focus on.
 * @param question   Question to answer about that area.
 * @returns The provider's text reply.
 */
export async function analyzeScreenRegion(
  screenshot: string,
  region: { x: number; y: number; width: number; height: number },
  question: string
): Promise<string> {
  const { provider } = getConfig();

  const regionPrompt = `Look at this screenshot. Focus on the region approximately at:
- Position: (${region.x}, ${region.y})
- Size: ${region.width}x${region.height} pixels

Question: ${question}

Be specific and concise in your answer.`;

  return analyzeWithVisionCustom(screenshot, provider, regionPrompt);
}
|
|
376
|
+
|
|
377
|
+
/**
 * Most recent screen description and the time (ms since epoch) at which it
 * became available; `null` until the first successful call.
 */
let lastDescription: { text: string; timestamp: number } | null = null;

/** How long a cached description is served before the screen is re-analyzed. */
const DESCRIPTION_CACHE_MS = 2000;

/**
 * Get current screen description with caching.
 *
 * Returns the cached description when it is younger than
 * DESCRIPTION_CACHE_MS; otherwise calls describeScreen() and caches the
 * result.
 *
 * @returns The screen description text.
 */
export async function getCurrentDescription(): Promise<string> {
  if (lastDescription && (Date.now() - lastDescription.timestamp) < DESCRIPTION_CACHE_MS) {
    return lastDescription.text;
  }

  const result = await describeScreen();

  // Stamp the entry when the description arrives, not when the request
  // started: if the analysis itself takes longer than the TTL, a request-time
  // stamp would make every entry expired on arrival and the cache would
  // never hit.
  lastDescription = { text: result.description, timestamp: Date.now() };
  return result.description;
}
|