@projectservan8n/cnapse 0.9.0 → 0.10.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/src/lib/tasks.ts CHANGED
@@ -860,40 +860,74 @@ Be specific about locations (top-left, center, etc.) and what each element does.
860
860
  }
861
861
 
862
862
  case 'adaptive_do': {
863
- // Adaptive agent using computer control: try to accomplish something, ask LLMs if stuck
863
+ // Enhanced adaptive agent with learning, more attempts, and verification
864
864
  const goal = params;
865
- const maxAttempts = 5;
865
+ const maxAttempts = 25; // Increased from 5
866
866
  const actionHistory: string[] = [];
867
867
  let accomplished = false;
868
+ let stuckCount = 0;
869
+ const stuckThreshold = 3;
870
+ let lastScreenHash = '';
871
+
872
+ // Import learner for self-learning capabilities
873
+ const { getLearner } = await import('../agents/learner.js');
874
+ const learner = getLearner();
875
+ await learner.load();
876
+
877
+ // Check if we've solved something similar before
878
+ const initialScreen = await describeScreen();
879
+ const remembered = await learner.recall(goal, initialScreen.description);
880
+ if (remembered && remembered.successCount > remembered.failCount) {
881
+ actionHistory.push(`📚 Found remembered solution from ${remembered.source}`);
882
+ }
868
883
 
869
884
  for (let attempt = 0; attempt < maxAttempts && !accomplished; attempt++) {
870
885
  // Take screenshot and analyze current state using vision
871
886
  const currentScreen = await describeScreen();
887
+ const currentHash = currentScreen.screenshot.slice(0, 1000);
888
+
889
+ // Check if screen changed
890
+ const screenChanged = currentHash !== lastScreenHash;
891
+ if (!screenChanged && attempt > 0) {
892
+ stuckCount++;
893
+ } else {
894
+ stuckCount = Math.max(0, stuckCount - 1);
895
+ }
896
+ lastScreenHash = currentHash;
872
897
 
873
- // Ask our AI what to do next
898
+ // Ask our AI what to do next (enhanced prompt)
874
899
  const nextAction = await chat([{
875
900
  role: 'user',
876
901
  content: `GOAL: ${goal}
877
902
 
878
903
  CURRENT SCREEN: ${currentScreen.description}
879
904
 
880
- PREVIOUS ACTIONS TAKEN:
881
- ${actionHistory.length > 0 ? actionHistory.join('\n') : 'None yet'}
905
+ PREVIOUS ACTIONS:
906
+ ${actionHistory.slice(-5).join('\n') || 'None yet'}
907
+
908
+ ATTEMPT: ${attempt + 1}/${maxAttempts}
909
+ STUCK COUNT: ${stuckCount} (will ask for help at ${stuckThreshold})
882
910
 
883
911
  Based on what you see, what's the SINGLE next action to take?
884
- Options:
885
- - click: Click (will click at current mouse position)
886
- - type: Type something (specify text)
887
- - press: Press a key (specify key like Enter, Tab, Escape)
888
- - scroll: Scroll up/down
889
- - navigate: Go to URL (opens in browser)
912
+
913
+ Available actions:
914
+ - click: Click at current mouse position
915
+ - clickAt: Click at coordinates (VALUE: x,y)
916
+ - moveTo: Move mouse to coordinates (VALUE: x,y)
917
+ - type: Type text (VALUE: text to type)
918
+ - press: Press a key (VALUE: Enter, Tab, Escape, etc.)
919
+ - keyCombo: Key combination (VALUE: command+s, control+c, etc.)
920
+ - scroll: Scroll (VALUE: up or down)
921
+ - navigate: Open URL (VALUE: full URL)
922
+ - wait: Wait for something to load (VALUE: seconds)
923
+ - findClick: Find element and click it (VALUE: description of element)
890
924
  - done: Goal is accomplished
891
925
  - stuck: Can't figure out what to do
892
926
 
893
- Respond in format:
927
+ Respond EXACTLY in this format:
894
928
  ACTION: <action_type>
895
- VALUE: <text to type, URL to navigate, or key to press>
896
- REASONING: <why>`
929
+ VALUE: <parameter>
930
+ REASONING: <brief why>`
897
931
  }]);
898
932
 
899
933
  const actionContent = nextAction.content;
@@ -903,7 +937,7 @@ REASONING: <why>`
903
937
  const valueMatch = actionContent.match(/VALUE:\s*(.+?)(?:\n|$)/i);
904
938
 
905
939
  if (!actionMatch) {
906
- actionHistory.push(`Attempt ${attempt + 1}: Couldn't parse action`);
940
+ actionHistory.push(`[${attempt + 1}] ⚠️ Couldn't parse action`);
907
941
  continue;
908
942
  }
909
943
 
@@ -912,58 +946,149 @@ REASONING: <why>`
912
946
 
913
947
  if (action === 'done') {
914
948
  accomplished = true;
915
- actionHistory.push(`Attempt ${attempt + 1}: Goal accomplished!`);
949
+ actionHistory.push(`[${attempt + 1}] Goal accomplished!`);
950
+
951
+ // Learn from success
952
+ if (actionHistory.length > 1) {
953
+ const lastSuccessfulAction = actionHistory[actionHistory.length - 2];
954
+ const actionParts = lastSuccessfulAction.match(/→ (\w+)(?:\s*"(.+)")?/);
955
+ if (actionParts) {
956
+ await learner.learn(
957
+ currentScreen.description.slice(0, 300),
958
+ goal,
959
+ actionParts[1],
960
+ actionParts[2] || '',
961
+ 'self'
962
+ );
963
+ }
964
+ }
916
965
  break;
917
966
  }
918
967
 
919
- if (action === 'stuck') {
920
- // Ask Perplexity for help
921
- actionHistory.push(`Attempt ${attempt + 1}: Got stuck, asking Perplexity for help...`);
922
-
923
- const helpRequest = `I'm trying to: ${goal}\n\nI'm stuck. What should I do next? Be specific about what to click or type.`;
924
- const advice = await browser.askAI('perplexity', helpRequest);
925
- actionHistory.push(`Got advice: ${advice.response.slice(0, 200)}...`);
968
+ if (action === 'stuck' || stuckCount >= stuckThreshold) {
969
+ actionHistory.push(`[${attempt + 1}] 🆘 Asking for help...`);
970
+
971
+ // Get help from multiple sources
972
+ const suggestions = await learner.getHelp(
973
+ goal,
974
+ currentScreen.description,
975
+ actionHistory.slice(-3)
976
+ );
977
+
978
+ if (suggestions.length > 0) {
979
+ const suggestion = suggestions[0];
980
+ actionHistory.push(`💡 Got suggestion from ${suggestion.source}: ${suggestion.value.slice(0, 100)}`);
981
+
982
+ // Try to parse and execute the suggestion
983
+ if (suggestion.action && suggestion.action !== 'suggested') {
984
+ try {
985
+ await executeAdaptiveAction(suggestion.action, suggestion.value);
986
+ actionHistory.push(`[${attempt + 1}] → ${suggestion.action} "${suggestion.value.slice(0, 30)}"`);
987
+
988
+ // Learn from successful suggestion
989
+ await learner.learn(
990
+ currentScreen.description.slice(0, 300),
991
+ goal,
992
+ suggestion.action,
993
+ suggestion.value,
994
+ suggestion.source
995
+ );
996
+ stuckCount = 0;
997
+ } catch (e) {
998
+ actionHistory.push(`[${attempt + 1}] ❌ Suggestion failed`);
999
+ }
1000
+ }
1001
+ } else {
1002
+ actionHistory.push(`[${attempt + 1}] 😕 No helpful suggestions found`);
1003
+ }
926
1004
  continue;
927
1005
  }
928
1006
 
929
- // Execute the action using computer control
1007
+ // Execute the action
930
1008
  try {
931
- switch (action) {
932
- case 'click':
933
- await computer.clickMouse('left');
934
- actionHistory.push(`Attempt ${attempt + 1}: Clicked`);
935
- break;
936
- case 'type':
937
- if (value) {
938
- await computer.typeText(value);
939
- }
940
- actionHistory.push(`Attempt ${attempt + 1}: Typed "${value}"`);
941
- break;
942
- case 'press':
943
- await computer.pressKey(value || 'Return');
944
- actionHistory.push(`Attempt ${attempt + 1}: Pressed ${value || 'Enter'}`);
945
- break;
946
- case 'scroll':
947
- await browser.scroll(value.toLowerCase().includes('up') ? 'up' : 'down');
948
- actionHistory.push(`Attempt ${attempt + 1}: Scrolled ${value || 'down'}`);
949
- break;
950
- case 'navigate':
951
- const url = value.startsWith('http') ? value : `https://${value}`;
952
- await browser.openUrl(url);
953
- actionHistory.push(`Attempt ${attempt + 1}: Opened ${url}`);
954
- break;
955
- default:
956
- actionHistory.push(`Attempt ${attempt + 1}: Unknown action ${action}`);
957
- }
1009
+ await executeAdaptiveAction(action, value);
1010
+ actionHistory.push(`[${attempt + 1}] → ${action}${value ? ` "${value.slice(0, 40)}"` : ''}`);
958
1011
  } catch (e) {
959
- actionHistory.push(`Attempt ${attempt + 1}: Action failed - ${e}`);
1012
+ actionHistory.push(`[${attempt + 1}] ${action} failed - ${e}`);
1013
+ await learner.recordFailure(goal, action, value);
960
1014
  }
961
1015
 
962
- await sleep(2000); // Wait for UI to update
1016
+ // Human-like delay between actions (1-2 seconds)
1017
+ await sleep(1000 + Math.random() * 1000);
963
1018
  }
964
1019
 
965
- step.result = `🎯 Adaptive Agent Result:\n\nGoal: ${goal}\nAccomplished: ${accomplished ? 'Yes ✅' : 'Partial/No ❌'}\n\nAction Log:\n${actionHistory.join('\n')}`;
1020
+ step.result = `🎯 Adaptive Agent Result:\n\nGoal: ${goal}\nAccomplished: ${accomplished ? 'Yes ✅' : 'Partial/No ❌'}\nAttempts: ${Math.min(actionHistory.length, maxAttempts)}/${maxAttempts}\n\nAction Log:\n${actionHistory.join('\n')}`;
966
1021
  break;
1022
+
/**
 * Execute one adaptive-agent action through the `computer` / `browser`
 * controllers of the enclosing scope.
 *
 * Declared after the case's `break`; it is still callable from the loop above
 * because function declarations are hoisted within their enclosing block.
 *
 * Both the camelCase and the all-lowercase spelling of multi-word actions are
 * accepted (`clickAt`/`clickat`, `moveTo`/`moveto`, `keyCombo`/`keycombo`,
 * `findClick`/`findclick`).
 *
 * @param action - Action name: click, clickAt, moveTo, type, press, keyCombo,
 *   scroll, navigate, wait or findClick.
 * @param value - Action parameter: `"x,y"` for clickAt/moveTo, text for type,
 *   a key name for press, `+`-separated keys for keyCombo, a direction for
 *   scroll, a URL or host for navigate, seconds for wait, an element
 *   description for findClick.
 * @throws Error when the action is unknown, when clickAt/moveTo coordinates
 *   cannot be parsed, when navigate has no target, or when findClick is
 *   requested but `computer.findAndClick` is not available. Rejections from the
 *   underlying controller calls propagate unchanged.
 */
async function executeAdaptiveAction(action: string, value: string): Promise<void> {
  // Parse "x,y" into integers. Throwing (instead of silently skipping the
  // action) lets the caller's try/catch record the attempt as a failure rather
  // than logging an action that never happened.
  const parsePoint = (raw: string): [number, number] => {
    const [px, py] = raw.split(',').map((part) => Number.parseInt(part.trim(), 10));
    if (px === undefined || py === undefined || Number.isNaN(px) || Number.isNaN(py)) {
      throw new Error(`Invalid coordinates for ${action}: "${raw}" (expected "x,y")`);
    }
    return [px, py];
  };

  switch (action) {
    case 'click':
      await computer.clickMouse('left');
      break;
    case 'clickat':
    case 'clickAt': {
      const [x, y] = parsePoint(value);
      await computer.moveMouse(x, y);
      await sleep(100); // brief pause between moving and clicking
      await computer.clickMouse('left');
      break;
    }
    case 'moveto':
    case 'moveTo': {
      const [mx, my] = parsePoint(value);
      await computer.moveMouse(mx, my);
      break;
    }
    case 'type':
      if (value) {
        // Use human-like typing if available
        if (computer.typeTextHuman) {
          await computer.typeTextHuman(value, 50);
        } else {
          await computer.typeText(value);
        }
      }
      break;
    case 'press':
      // An empty value falls back to the Return key.
      await computer.pressKey(value || 'Return');
      break;
    case 'keycombo':
    case 'keyCombo': {
      const keys = value.split('+').map((k) => k.trim().toLowerCase());
      await computer.keyCombo(keys);
      break;
    }
    case 'scroll':
      // Anything that does not mention "up" scrolls down.
      await browser.scroll(value.toLowerCase().includes('up') ? 'up' : 'down');
      break;
    case 'navigate': {
      const target = value.trim();
      if (!target) {
        throw new Error('navigate requires a URL');
      }
      // Only a real http(s):// scheme counts as "already a URL"; a bare prefix
      // test on "http" would misclassify hosts such as "httpbin.org".
      const navUrl = /^https?:\/\//i.test(target) ? target : `https://${target}`;
      await browser.openUrl(navUrl);
      await sleep(2000); // Wait for page load
      break;
    }
    case 'wait': {
      // Unparseable (or zero) values fall back to a 2 second wait.
      const seconds = parseFloat(value) || 2;
      await sleep(seconds * 1000);
      break;
    }
    case 'findclick':
    case 'findClick':
      if (computer.findAndClick) {
        await computer.findAndClick(value);
      } else {
        throw new Error('findAndClick not available');
      }
      break;
    default:
      throw new Error(`Unknown action: ${action}`);
  }
}
967
1092
  }
968
1093
 
969
1094
  case 'chat':
package/src/lib/vision.ts CHANGED
@@ -252,3 +252,142 @@ async function analyzeWithOpenAI(base64Image: string, prompt: string): Promise<s
252
252
  const data = await response.json() as { choices: Array<{ message: { content: string } }> };
253
253
  return data.choices?.[0]?.message?.content || 'Unable to analyze image';
254
254
  }
255
+
/**
 * Ask the configured vision provider where a described UI element is located
 * in a screenshot.
 *
 * The prompt tells the model to assume a roughly 1920x1080 screen and to
 * answer with `X: <number>` / `Y: <number>` lines, or `NOT_FOUND`.
 *
 * @param screenshot - Screenshot data, forwarded unchanged to the vision
 *   helper (presumably base64-encoded — confirm against callers).
 * @param description - Description of the element to locate; interpolated
 *   into the prompt.
 * @returns The coordinates parsed from the reply, or `null` when the reply
 *   lacks an `X:` or `Y:` number or when the vision call throws.
 */
export async function findElementCoordinates(
  screenshot: string,
  description: string
): Promise<{ x: number; y: number } | null> {
  const config = getConfig();

  const prompt = [
    `Look at this screenshot carefully. Find the UI element described as: "${description}"`,
    '',
    'Your task is to estimate the CENTER coordinates (x, y) of this element.',
    '',
    'IMPORTANT:',
    '- Assume the screen is approximately 1920x1080 pixels (adjust if you see indicators of different resolution)',
    '- Give coordinates as integers',
    '- If the element is clearly visible, give your best estimate',
    '- If you absolutely cannot find it, respond with NOT_FOUND',
    '',
    'Respond in EXACTLY this format (numbers only, no units):',
    'X: <number>',
    'Y: <number>',
    '',
    'Or if not found:',
    'NOT_FOUND',
  ].join('\n');

  try {
    const reply = await analyzeWithVisionCustom(screenshot, config.provider, prompt);

    // Pull the first "X: n" and "Y: n" occurrences out of the reply.
    const xCapture = reply.match(/X:\s*(\d+)/i);
    const yCapture = reply.match(/Y:\s*(\d+)/i);

    if (!xCapture || !yCapture) {
      return null;
    }

    return { x: parseInt(xCapture[1]), y: parseInt(yCapture[1]) };
  } catch (error) {
    // Any failure while querying or parsing is reported as "not found".
    return null;
  }
}
302
+
/**
 * Internal helper: route an image plus a custom prompt to the analyzer that
 * matches the given provider name.
 *
 * @param base64Image - Image payload handed to the provider-specific analyzer.
 * @param provider - One of 'ollama', 'openrouter', 'anthropic' or 'openai'.
 * @param prompt - Prompt text sent along with the image.
 * @returns Whatever the selected analyzer resolves to.
 * @throws Error when the provider is not one of the supported names.
 */
async function analyzeWithVisionCustom(base64Image: string, provider: string, prompt: string): Promise<string> {
  if (provider === 'ollama') {
    return analyzeWithOllama(base64Image, prompt);
  }
  if (provider === 'openrouter') {
    return analyzeWithOpenRouter(base64Image, prompt);
  }
  if (provider === 'anthropic') {
    return analyzeWithAnthropic(base64Image, prompt);
  }
  if (provider === 'openai') {
    return analyzeWithOpenAI(base64Image, prompt);
  }
  throw new Error(`Vision not supported for provider: ${provider}`);
}
320
+
/**
 * Cheap fingerprint of a screenshot string for change detection.
 *
 * Only the characters at indices 0, 1000, 2000, ... are read, and they are
 * folded into a signed 32-bit rolling hash (`hash * 31 + charCode`, wrapped to
 * int32). Inputs that differ only at unsampled positions therefore produce the
 * same fingerprint.
 *
 * @param base64Screenshot - Screenshot string to fingerprint.
 * @returns The hash in base 16; negative hashes carry a leading `-`, and an
 *   empty input yields `"0"`.
 */
export function getScreenHash(base64Screenshot: string): string {
  const SAMPLE_STRIDE = 1000;
  let digest = 0;

  for (let pos = 0; pos < base64Screenshot.length; pos += SAMPLE_STRIDE) {
    // Math.imul keeps the multiplication in 32-bit space; `| 0` wraps the sum.
    digest = (Math.imul(digest, 31) + base64Screenshot.charCodeAt(pos)) | 0;
  }

  return digest.toString(16);
}
341
+
/**
 * Decide whether two screenshots should be treated as different.
 *
 * A missing/empty screenshot or a length mismatch counts as "changed" without
 * hashing; otherwise the sampled hashes from `getScreenHash` are compared.
 *
 * @param screenshotA - First screenshot string.
 * @param screenshotB - Second screenshot string.
 * @returns `true` when the screenshots are considered different.
 */
export function screensChanged(screenshotA: string, screenshotB: string): boolean {
  const eitherMissing = !screenshotA || !screenshotB;

  // Short-circuit keeps the length comparison from running on missing input.
  if (eitherMissing || screenshotA.length !== screenshotB.length) {
    return true;
  }

  return getScreenHash(screenshotA) !== getScreenHash(screenshotB);
}
355
+
/**
 * Ask the configured vision provider a question about one region of a
 * screenshot.
 *
 * The full screenshot is sent; the region is only described in the prompt
 * text (position and size), it is not cropped here.
 *
 * @param screenshot - Screenshot data forwarded to the vision helper.
 * @param region - Position and size of the area the model should focus on.
 * @param question - Question to ask about that area.
 * @returns The provider's answer text.
 */
export async function analyzeScreenRegion(
  screenshot: string,
  region: { x: number; y: number; width: number; height: number },
  question: string
): Promise<string> {
  const config = getConfig();

  const promptLines = [
    'Look at this screenshot. Focus on the region approximately at:',
    `- Position: (${region.x}, ${region.y})`,
    `- Size: ${region.width}x${region.height} pixels`,
    '',
    `Question: ${question}`,
    '',
    'Be specific and concise in your answer.',
  ];
  const prompt = promptLines.join('\n');

  return analyzeWithVisionCustom(screenshot, config.provider, prompt);
}
376
+
/** Most recent screen description and the time it was requested. */
let lastDescription: { text: string; timestamp: number } | null = null;

/** How long (in milliseconds) a cached description is reused. */
const DESCRIPTION_CACHE_MS = 2000;

/**
 * Get the current screen description, reusing the cached one when it is
 * younger than `DESCRIPTION_CACHE_MS`.
 *
 * NOTE(review): the cache entry is stamped with the time taken *before*
 * `describeScreen()` is awaited, so a slow call can produce an entry that is
 * already expired when stored — confirm this is intended.
 *
 * @returns The cached or freshly obtained description text.
 */
export async function getCurrentDescription(): Promise<string> {
  const requestedAt = Date.now();

  if (lastDescription !== null) {
    const age = requestedAt - lastDescription.timestamp;
    if (age < DESCRIPTION_CACHE_MS) {
      return lastDescription.text;
    }
  }

  const { description } = await describeScreen();
  lastDescription = { text: description, timestamp: requestedAt };
  return description;
}