npm - @projectservan8n/cnapse - Versions diffs - 0.9.0 → 0.10.0 - Mend

@projectservan8n/cnapse 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

package/dist/ProviderSelector-GZYF26LL.js +7 -0
package/dist/autonomous-VGEVIXXQ.js +419 -0
package/dist/browser-YLFWQXIY.js +87 -0
package/dist/{chunk-OPX7FFL6.js → chunk-7SDY7OPA.js} +14 -55
package/dist/chunk-COKO6V5J.js +50 -0
package/dist/chunk-GP73OJCZ.js +377 -0
package/dist/chunk-MOKGR7WE.js +344 -0
package/dist/chunk-OIVTPXE4.js +307 -0
package/dist/chunk-TFHK5CYF.js +650 -0
package/dist/chunk-WSBJFRQH.js +366 -0
package/dist/index.js +495 -1391
package/dist/learner-KH3TFTD7.js +14 -0
package/dist/vision-S57PWSCU.js +19 -0
package/package.json +1 -1
package/src/agents/autonomous.ts +515 -0
package/src/agents/learner.ts +489 -0
package/src/lib/tasks.ts +179 -54
package/src/lib/vision.ts +139 -0
package/src/services/screen-monitor.ts +288 -0
package/src/services/telegram.ts +312 -5
package/src/tools/computer.ts +226 -0
package/dist/ProviderSelector-MXRZFAOB.js +0 -6

package/src/lib/tasks.ts CHANGED Viewed

@@ -860,40 +860,74 @@ Be specific about locations (top-left, center, etc.) and what each element does.
     }
     case 'adaptive_do': {
-      // Adaptive agent using computer control: try to accomplish something, ask LLMs if stuck
+      // Enhanced adaptive agent with learning, more attempts, and verification
       const goal = params;
-      const maxAttempts = 5;
+      const maxAttempts = 25;  // Increased from 5
       const actionHistory: string[] = [];
       let accomplished = false;
+      let stuckCount = 0;
+      const stuckThreshold = 3;
+      let lastScreenHash = '';
+      // Import learner for self-learning capabilities
+      const { getLearner } = await import('../agents/learner.js');
+      const learner = getLearner();
+      await learner.load();
+      // Check if we've solved something similar before
+      const initialScreen = await describeScreen();
+      const remembered = await learner.recall(goal, initialScreen.description);
+      if (remembered && remembered.successCount > remembered.failCount) {
+        actionHistory.push(`📚 Found remembered solution from ${remembered.source}`);
+      }
       for (let attempt = 0; attempt < maxAttempts && !accomplished; attempt++) {
         // Take screenshot and analyze current state using vision
         const currentScreen = await describeScreen();
+        const currentHash = currentScreen.screenshot.slice(0, 1000);
+        // Check if screen changed
+        const screenChanged = currentHash !== lastScreenHash;
+        if (!screenChanged && attempt > 0) {
+          stuckCount++;
+        } else {
+          stuckCount = Math.max(0, stuckCount - 1);
+        }
+        lastScreenHash = currentHash;
-        // Ask our AI what to do next
+        // Ask our AI what to do next (enhanced prompt)
         const nextAction = await chat([{
           role: 'user',
           content: `GOAL: ${goal}
 CURRENT SCREEN: ${currentScreen.description}
-PREVIOUS ACTIONS TAKEN:
-${actionHistory.length > 0 ? actionHistory.join('\n') : 'None yet'}
+PREVIOUS ACTIONS:
+${actionHistory.slice(-5).join('\n') || 'None yet'}
+ATTEMPT: ${attempt + 1}/${maxAttempts}
+STUCK COUNT: ${stuckCount} (will ask for help at ${stuckThreshold})
 Based on what you see, what's the SINGLE next action to take?
-Options:
-- click: Click (will click at current mouse position)
-- type: Type something (specify text)
-- press: Press a key (specify key like Enter, Tab, Escape)
-- scroll: Scroll up/down
-- navigate: Go to URL (opens in browser)
+Available actions:
+- click: Click at current mouse position
+- clickAt: Click at coordinates (VALUE: x,y)
+- moveTo: Move mouse to coordinates (VALUE: x,y)
+- type: Type text (VALUE: text to type)
+- press: Press a key (VALUE: Enter, Tab, Escape, etc.)
+- keyCombo: Key combination (VALUE: command+s, control+c, etc.)
+- scroll: Scroll (VALUE: up or down)
+- navigate: Open URL (VALUE: full URL)
+- wait: Wait for something to load (VALUE: seconds)
+- findClick: Find element and click it (VALUE: description of element)
 - done: Goal is accomplished
 - stuck: Can't figure out what to do
-Respond in format:
+Respond EXACTLY in this format:
 ACTION: <action_type>
-VALUE: <text to type, URL to navigate, or key to press>
-REASONING: <why>`
+VALUE: <parameter>
+REASONING: <brief why>`
         }]);
         const actionContent = nextAction.content;
@@ -903,7 +937,7 @@ REASONING: <why>`
         const valueMatch = actionContent.match(/VALUE:\s*(.+?)(?:\n|$)/i);
         if (!actionMatch) {
-          actionHistory.push(`Attempt ${attempt + 1}: Couldn't parse action`);
+          actionHistory.push(`[${attempt + 1}] ⚠️ Couldn't parse action`);
           continue;
         }
@@ -912,58 +946,149 @@ REASONING: <why>`
         if (action === 'done') {
           accomplished = true;
-          actionHistory.push(`Attempt ${attempt + 1}: Goal accomplished!`);
+          actionHistory.push(`[${attempt + 1}] ✅ Goal accomplished!`);
+          // Learn from success
+          if (actionHistory.length > 1) {
+            const lastSuccessfulAction = actionHistory[actionHistory.length - 2];
+            const actionParts = lastSuccessfulAction.match(/→ (\w+)(?:\s*"(.+)")?/);
+            if (actionParts) {
+              await learner.learn(
+                currentScreen.description.slice(0, 300),
+                goal,
+                actionParts[1],
+                actionParts[2] || '',
+                'self'
+              );
+            }
+          }
           break;
         }
-        if (action === 'stuck') {
-          // Ask Perplexity for help
-          actionHistory.push(`Attempt ${attempt + 1}: Got stuck, asking Perplexity for help...`);
-          const helpRequest = `I'm trying to: ${goal}\n\nI'm stuck. What should I do next? Be specific about what to click or type.`;
-          const advice = await browser.askAI('perplexity', helpRequest);
-          actionHistory.push(`Got advice: ${advice.response.slice(0, 200)}...`);
+        if (action === 'stuck' || stuckCount >= stuckThreshold) {
+          actionHistory.push(`[${attempt + 1}] 🆘 Asking for help...`);
+          // Get help from multiple sources
+          const suggestions = await learner.getHelp(
+            goal,
+            currentScreen.description,
+            actionHistory.slice(-3)
+          );
+          if (suggestions.length > 0) {
+            const suggestion = suggestions[0];
+            actionHistory.push(`💡 Got suggestion from ${suggestion.source}: ${suggestion.value.slice(0, 100)}`);
+            // Try to parse and execute the suggestion
+            if (suggestion.action && suggestion.action !== 'suggested') {
+              try {
+                await executeAdaptiveAction(suggestion.action, suggestion.value);
+                actionHistory.push(`[${attempt + 1}] → ${suggestion.action} "${suggestion.value.slice(0, 30)}"`);
+                // Learn from successful suggestion
+                await learner.learn(
+                  currentScreen.description.slice(0, 300),
+                  goal,
+                  suggestion.action,
+                  suggestion.value,
+                  suggestion.source
+                );
+                stuckCount = 0;
+              } catch (e) {
+                actionHistory.push(`[${attempt + 1}] ❌ Suggestion failed`);
+              }
+            }
+          } else {
+            actionHistory.push(`[${attempt + 1}] 😕 No helpful suggestions found`);
+          }
           continue;
         }
-        // Execute the action using computer control
+        // Execute the action
         try {
-          switch (action) {
-            case 'click':
-              await computer.clickMouse('left');
-              actionHistory.push(`Attempt ${attempt + 1}: Clicked`);
-              break;
-            case 'type':
-              if (value) {
-                await computer.typeText(value);
-              }
-              actionHistory.push(`Attempt ${attempt + 1}: Typed "${value}"`);
-              break;
-            case 'press':
-              await computer.pressKey(value || 'Return');
-              actionHistory.push(`Attempt ${attempt + 1}: Pressed ${value || 'Enter'}`);
-              break;
-            case 'scroll':
-              await browser.scroll(value.toLowerCase().includes('up') ? 'up' : 'down');
-              actionHistory.push(`Attempt ${attempt + 1}: Scrolled ${value || 'down'}`);
-              break;
-            case 'navigate':
-              const url = value.startsWith('http') ? value : `https://${value}`;
-              await browser.openUrl(url);
-              actionHistory.push(`Attempt ${attempt + 1}: Opened ${url}`);
-              break;
-            default:
-              actionHistory.push(`Attempt ${attempt + 1}: Unknown action ${action}`);
-          }
+          await executeAdaptiveAction(action, value);
+          actionHistory.push(`[${attempt + 1}] → ${action}${value ? ` "${value.slice(0, 40)}"` : ''}`);
         } catch (e) {
-          actionHistory.push(`Attempt ${attempt + 1}: Action failed - ${e}`);
+          actionHistory.push(`[${attempt + 1}] ❌ ${action} failed - ${e}`);
+          await learner.recordFailure(goal, action, value);
         }
-        await sleep(2000); // Wait for UI to update
+        // Human-like delay between actions (1-2 seconds)
+        await sleep(1000 + Math.random() * 1000);
       }
-      step.result = `🎯 Adaptive Agent Result:\n\nGoal: ${goal}\nAccomplished: ${accomplished ? 'Yes ✅' : 'Partial/No ❌'}\n\nAction Log:\n${actionHistory.join('\n')}`;
+      step.result = `🎯 Adaptive Agent Result:\n\nGoal: ${goal}\nAccomplished: ${accomplished ? 'Yes ✅' : 'Partial/No ❌'}\nAttempts: ${Math.min(actionHistory.length, maxAttempts)}/${maxAttempts}\n\nAction Log:\n${actionHistory.join('\n')}`;
       break;
+      /**
+       * Execute one adaptive-agent action through the computer/browser helpers.
+       *
+       * Action names are matched in their prompt spelling (for example
+       * `clickAt`) and in fully lower-cased form (`clickat`).
+       *
+       * @param action - Action keyword: click, clickAt, moveTo, type, press,
+       *   keyCombo, scroll, navigate, wait or findClick.
+       * @param value - Action parameter as free text (for example "x,y"
+       *   coordinates, text to type, a key name or a URL).
+       * @throws Error when the action is unknown, when coordinates cannot be
+       *   parsed, or when findClick is requested but `computer.findAndClick`
+       *   is not available. Errors from the underlying calls propagate.
+       */
+      async function executeAdaptiveAction(action: string, value: string): Promise<void> {
+        // Parse "x,y" into two integers. Malformed input throws instead of
+        // being skipped, so a no-op is never reported back as a success.
+        const parseCoordinates = (raw: string): [number, number] => {
+          const [x, y] = raw.split(',').map(part => Number.parseInt(part.trim(), 10));
+          if (x === undefined || y === undefined || Number.isNaN(x) || Number.isNaN(y)) {
+            throw new Error(`Invalid coordinates: "${raw}"`);
+          }
+          return [x, y];
+        };
+        switch (action) {
+          case 'click':
+            await computer.clickMouse('left');
+            break;
+          case 'clickat':
+          case 'clickAt': {
+            const [x, y] = parseCoordinates(value);
+            await computer.moveMouse(x, y);
+            await sleep(100); // brief pause between moving and clicking
+            await computer.clickMouse('left');
+            break;
+          }
+          case 'moveto':
+          case 'moveTo': {
+            const [mx, my] = parseCoordinates(value);
+            await computer.moveMouse(mx, my);
+            break;
+          }
+          case 'type':
+            if (value) {
+              // Use human-like typing if available
+              if (computer.typeTextHuman) {
+                await computer.typeTextHuman(value, 50);
+              } else {
+                await computer.typeText(value);
+              }
+            }
+            break;
+          case 'press':
+            await computer.pressKey(value || 'Return');
+            break;
+          case 'keycombo':
+          case 'keyCombo': {
+            // "command+s" becomes ['command', 's']
+            const keys = value.split('+').map(k => k.trim().toLowerCase());
+            await computer.keyCombo(keys);
+            break;
+          }
+          case 'scroll':
+            // Anything that does not mention "up" scrolls down
+            await browser.scroll(value.toLowerCase().includes('up') ? 'up' : 'down');
+            break;
+          case 'navigate': {
+            const navUrl = value.startsWith('http') ? value : `https://${value}`;
+            await browser.openUrl(navUrl);
+            await sleep(2000); // Wait for page load
+            break;
+          }
+          case 'wait': {
+            // Unparseable (or zero) values fall back to 2 seconds
+            const seconds = parseFloat(value) || 2;
+            await sleep(seconds * 1000);
+            break;
+          }
+          case 'findclick':
+          case 'findClick':
+            if (computer.findAndClick) {
+              await computer.findAndClick(value);
+            } else {
+              throw new Error('findAndClick not available');
+            }
+            break;
+          default:
+            throw new Error(`Unknown action: ${action}`);
+        }
+      }
     }
     case 'chat':

package/src/lib/vision.ts CHANGED Viewed

@@ -252,3 +252,142 @@ async function analyzeWithOpenAI(base64Image: string, prompt: string): Promise<s
   const data = await response.json() as { choices: Array<{ message: { content: string } }> };
   return data.choices?.[0]?.message?.content || 'Unable to analyze image';
 }
+/**
+ * Find element coordinates on screen by description
+ * Returns approximate center coordinates where AI thinks the element is
+ *
+ * The screenshot and a fixed prompt are sent to the vision backend selected by
+ * `getConfig().provider`; the reply is then scanned for `X: <digits>` and
+ * `Y: <digits>`.
+ *
+ * NOTE(review): the prompt tells the model to assume roughly 1920x1080 and the
+ * parsed numbers are returned as-is; no scaling to the real display size is
+ * visible here - confirm callers handle other resolutions.
+ *
+ * @param screenshot - Image data forwarded unchanged to the vision helper
+ *   (presumably base64, matching that helper's `base64Image` parameter).
+ * @param description - Free-text description of the element; interpolated
+ *   verbatim into the prompt.
+ * @returns The parsed `{ x, y }` pair, or `null` when the reply lacks either
+ *   value or when the vision call throws.
+ */
+export async function findElementCoordinates(
+  screenshot: string,
+  description: string
+): Promise<{ x: number; y: number } | null> {
+  const config = getConfig();
+  const prompt = `Look at this screenshot carefully. Find the UI element described as: "${description}"
+Your task is to estimate the CENTER coordinates (x, y) of this element.
+IMPORTANT:
+- Assume the screen is approximately 1920x1080 pixels (adjust if you see indicators of different resolution)
+- Give coordinates as integers
+- If the element is clearly visible, give your best estimate
+- If you absolutely cannot find it, respond with NOT_FOUND
+Respond in EXACTLY this format (numbers only, no units):
+X: <number>
+Y: <number>
+Or if not found:
+NOT_FOUND`;
+  try {
+    const response = await analyzeWithVisionCustom(screenshot, config.provider, prompt);
+    // Parse coordinates from response: case-insensitive, unsigned digits only,
+    // and unanchored, so the first "X:" / "Y:" found anywhere in the reply wins
+    const xMatch = response.match(/X:\s*(\d+)/i);
+    const yMatch = response.match(/Y:\s*(\d+)/i);
+    if (xMatch && yMatch) {
+      return {
+        x: parseInt(xMatch[1]),
+        y: parseInt(yMatch[1]),
+      };
+    }
+    return null; // no X/Y pair in the reply (this is also the NOT_FOUND path)
+  } catch (error) { // any failure of the vision call is swallowed; callers only see null
+    return null;
+  }
+}
+/**
+ * Analyze with vision using a custom prompt (internal helper)
+ *
+ * Routes the request to the provider-specific analyzer and returns its result.
+ *
+ * @param base64Image - Image passed through to the selected analyzer.
+ * @param provider - Provider key; 'ollama', 'openrouter', 'anthropic' and
+ *   'openai' are recognized.
+ * @param prompt - Prompt text passed through to the selected analyzer.
+ * @returns Whatever the selected analyzer returns.
+ * @throws Error if `provider` is not one of the recognized keys.
+ */
+async function analyzeWithVisionCustom(base64Image: string, provider: string, prompt: string): Promise<string> {
+  switch (provider) {
+    case 'ollama':
+      return analyzeWithOllama(base64Image, prompt);
+    case 'openrouter':
+      return analyzeWithOpenRouter(base64Image, prompt);
+    case 'anthropic':
+      return analyzeWithAnthropic(base64Image, prompt);
+    case 'openai':
+      return analyzeWithOpenAI(base64Image, prompt);
+    default:
+      throw new Error(`Vision not supported for provider: ${provider}`);
+  }
+}
+/**
+ * Quick hash for change detection (uses simple sampling)
+ *
+ * Only the characters at indices 0, 1000, 2000, ... are read, so two inputs
+ * that differ solely at other positions produce the same hash. This is a
+ * cheap change-detection fingerprint, not a cryptographic digest.
+ *
+ * @param base64Screenshot - Screenshot string to fingerprint.
+ * @returns The hash as a base-16 string; it carries a leading '-' when the
+ *   32-bit value is negative, and is "0" for an empty input.
+ */
+export function getScreenHash(base64Screenshot: string): string {
+  // Sample every 1000th character (starting at index 0) for quick comparison
+  let sample = '';
+  for (let i = 0; i < base64Screenshot.length; i += 1000) {
+    sample += base64Screenshot[i];
+  }
+  // Simple hash using character codes: hash = hash * 31 + code at each step
+  let hash = 0;
+  for (let i = 0; i < sample.length; i++) {
+    const char = sample.charCodeAt(i);
+    hash = ((hash << 5) - hash) + char; // (hash << 5) - hash equals hash * 31
+    hash = hash & hash; // Convert to 32-bit integer (signed, so it may go negative)
+  }
+  return hash.toString(16);
+}
+/**
+ * Compare two screenshots for significant changes
+ *
+ * The comparison is heuristic: equal-length inputs are compared through
+ * getScreenHash, which samples only every 1000th character, so differences
+ * confined to unsampled positions are reported as "not changed".
+ *
+ * @param screenshotA - First screenshot string.
+ * @param screenshotB - Second screenshot string.
+ * @returns `true` when either input is empty/missing, when the lengths
+ *   differ, or when the sampled hashes differ; otherwise `false`.
+ */
+export function screensChanged(screenshotA: string, screenshotB: string): boolean {
+  if (!screenshotA || !screenshotB) return true; // a missing screenshot counts as a change
+  if (screenshotA.length !== screenshotB.length) return true; // cheap check before hashing
+  // Quick hash comparison
+  const hashA = getScreenHash(screenshotA);
+  const hashB = getScreenHash(screenshotB);
+  return hashA !== hashB;
+}
+/**
+ * Analyze a specific region of the screen (for focused analysis)
+ *
+ * The image is not cropped: the full screenshot is sent, and the region is
+ * only described to the model in the prompt text.
+ *
+ * @param screenshot - Screenshot forwarded unchanged to the vision helper.
+ * @param region - Position and size that the prompt asks the model to focus on.
+ * @param question - Question about that region; interpolated verbatim into
+ *   the prompt.
+ * @returns The vision helper's answer. Errors from the helper are not caught
+ *   here and propagate to the caller.
+ */
+export async function analyzeScreenRegion(
+  screenshot: string,
+  region: { x: number; y: number; width: number; height: number },
+  question: string
+): Promise<string> {
+  const config = getConfig();
+  const prompt = `Look at this screenshot. Focus on the region approximately at:
+- Position: (${region.x}, ${region.y})
+- Size: ${region.width}x${region.height} pixels
+Question: ${question}
+Be specific and concise in your answer.`;
+  return analyzeWithVisionCustom(screenshot, config.provider, prompt);
+}
+/**
+ * Get current screen description with caching
+ *
+ * A description obtained less than DESCRIPTION_CACHE_MS ago is returned
+ * without calling describeScreen() again; otherwise a fresh description is
+ * fetched and stored.
+ *
+ * @returns The cached or freshly obtained screen description.
+ */
+let lastDescription: { text: string; timestamp: number } | null = null;
+const DESCRIPTION_CACHE_MS = 2000;
+export async function getCurrentDescription(): Promise<string> {
+  if (lastDescription && (Date.now() - lastDescription.timestamp) < DESCRIPTION_CACHE_MS) {
+    return lastDescription.text;
+  }
+  const result = await describeScreen();
+  // Stamp the entry after describeScreen() resolves. Stamping before the await
+  // counted the analysis time against the cache window, so any call slower
+  // than DESCRIPTION_CACHE_MS stored an entry that was already expired.
+  lastDescription = { text: result.description, timestamp: Date.now() };
+  return result.description;
+}