npm - @projectservan8n/cnapse - Versions diffs - 0.6.3 → 0.8.0 - Mend

@projectservan8n/cnapse 0.6.3 → 0.8.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Files changed (16)

package/dist/ProviderSelector-MXRZFAOB.js +6 -0
package/dist/chunk-OPX7FFL6.js +391 -0
package/dist/index.js +882 -525
package/package.json +17 -16
package/src/agents/executor.ts +20 -13
package/src/index.tsx +32 -6
package/src/lib/tasks.ts +451 -74
package/src/services/browser.ts +669 -0
package/src/tools/index.ts +0 -1
package/dist/ConfigUI-I2CJVODT.js +0 -305
package/dist/Setup-KGYXCA7Y.js +0 -177
package/dist/chunk-COKO6V5J.js +0 -50
package/src/components/ConfigUI.tsx +0 -352
package/src/components/Setup.tsx +0 -202
package/src/lib/screen.ts +0 -118
package/src/tools/vision.ts +0 -65

package/src/lib/tasks.ts CHANGED Viewed

@@ -4,11 +4,12 @@
  * Uses chain-of-thought prompting + learning from past tasks
  */
-import { chat, Message } from './api.js';
+import { chat, chatWithVision, Message } from './api.js';
 import * as computer from '../tools/computer.js';
-import { describeScreen } from './vision.js';
+import { describeScreen, captureScreenshot } from './vision.js';
 import * as filesystem from '../tools/filesystem.js';
 import { runCommand } from '../tools/shell.js';
+import * as browser from '../services/browser.js';
 import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
@@ -211,7 +212,27 @@ Before outputting steps, THINK through these questions:
 ### Web Browsing
 - open_url: Open URL in default browser (e.g., "open_url:https://perplexity.ai")
 - browse_and_ask: Open AI website, type question, wait for response (e.g., "browse_and_ask:perplexity|What is the capital of France?")
-- browse_and_ask: Supports: perplexity, chatgpt, claude, google
+- browse_and_ask: Supports: perplexity, chatgpt, claude, google, copilot, bard
+- web_search: Search Google and extract results (e.g., "web_search:best restaurants in NYC")
+### Email
+- send_email: Send email via Gmail or Outlook web (e.g., "send_email:gmail|to@email.com|Subject|Body text here")
+- send_email: Supports: gmail, outlook
+### Google Apps (via browser)
+- google_sheets: Interact with Google Sheets (e.g., "google_sheets:new|My Spreadsheet" or "google_sheets:type|A1|Hello World")
+- google_sheets: Commands: new (create), open (open existing), type (type in cell), read (screenshot current view)
+- google_docs: Interact with Google Docs (e.g., "google_docs:new|My Document" or "google_docs:type|Hello World")
+- google_docs: Commands: new (create), open (open existing), type (type text)
+### Research
+- research: Multi-step web research - searches, gathers info, summarizes (e.g., "research:What are the latest AI trends in 2024?")
+### Adaptive/Learning
+- ask_llm: Ask another LLM for help with a screenshot (e.g., "ask_llm:perplexity|How do I do X in this app?")
+- ask_llm: Supports: perplexity, chatgpt, claude, copilot - sends screenshot + question, gets answer
+- adaptive_do: Try to accomplish something, if stuck ask LLMs for help (e.g., "adaptive_do:book a flight to NYC on kayak.com")
+- learn_ui: Take screenshot and learn how to interact with current UI (e.g., "learn_ui:What buttons can I click here?")
 ### Utility
 - wait: Wait N seconds (e.g., "wait:2" - use 1-3s for app loads)
@@ -302,14 +323,95 @@ Output:
 ### Example 7: "search google for weather today"
 Thinking:
 - Goal: Open Google and search for something
-- How: Use browse_and_ask with google target
-- Sequence: Open Google, search, capture results
+- How: Use web_search for quick results extraction
+- Sequence: Search and get results
 Output:
 [
-  { "description": "Search Google", "action": "browse_and_ask:google|weather today" },
-  { "description": "Wait for results", "action": "wait:2" },
-  { "description": "Capture search results", "action": "screenshot" }
+  { "description": "Search Google for weather", "action": "web_search:weather today" }
+]
+### Example 8: "send an email to john@example.com about the meeting tomorrow"
+Thinking:
+- Goal: Compose and send an email via Gmail
+- How: Use send_email with gmail, recipient, subject, body
+- Sequence: Open Gmail, compose, fill fields, send
+Output:
+[
+  { "description": "Send email via Gmail", "action": "send_email:gmail|john@example.com|Meeting Tomorrow|Hi John, this is a reminder about our meeting tomorrow. Please let me know if you have any questions." }
+]
+### Example 9: "create a new google sheet called Sales Report and add headers"
+Thinking:
+- Goal: Create a new Google Sheet and add content
+- How: Use google_sheets to create new, then type in cells
+- Sequence: Create sheet -> Navigate to cells -> Type headers
+Output:
+[
+  { "description": "Create new Google Sheet", "action": "google_sheets:new|Sales Report" },
+  { "description": "Wait for sheet to load", "action": "wait:3" },
+  { "description": "Type header in A1", "action": "google_sheets:type|A1|Product" },
+  { "description": "Type header in B1", "action": "google_sheets:type|B1|Quantity" },
+  { "description": "Type header in C1", "action": "google_sheets:type|C1|Price" }
+]
+### Example 10: "research the latest news about AI regulations"
+Thinking:
+- Goal: Do multi-step research on a topic
+- How: Use research action which handles searching, gathering, summarizing
+- Sequence: Single research action does it all
+Output:
+[
+  { "description": "Research AI regulations news", "action": "research:latest news about AI regulations 2024" }
+]
+### Example 11: "write a document in google docs about project status"
+Thinking:
+- Goal: Create a Google Doc and write content
+- How: Use google_docs to create and type
+- Sequence: Create doc -> Type content
+Output:
+[
+  { "description": "Create new Google Doc", "action": "google_docs:new|Project Status Report" },
+  { "description": "Wait for doc to load", "action": "wait:3" },
+  { "description": "Type the content", "action": "google_docs:type|Project Status Report\n\nDate: Today\n\nSummary:\nThe project is on track. All milestones have been met.\n\nNext Steps:\n- Complete testing\n- Deploy to production" }
+]
+### Example 12: "I don't know how to use this app, can you figure it out?"
+Thinking:
+- Goal: Learn the current UI and understand how to use it
+- How: Use learn_ui to take screenshot and analyze
+- Sequence: Screenshot -> AI analysis -> report back
+Output:
+[
+  { "description": "Analyze current UI", "action": "learn_ui:What are all the buttons, menus, and interactive elements I can use?" }
+]
+### Example 13: "book a hotel on booking.com for next weekend"
+Thinking:
+- Goal: Complex task on unfamiliar website - need adaptive approach
+- How: Use adaptive_do which will try, and if stuck ask LLMs for help
+- Sequence: Single adaptive action handles the complexity
+Output:
+[
+  { "description": "Adaptively book hotel", "action": "adaptive_do:Go to booking.com and book a hotel for next weekend" }
+]
+### Example 14: "I'm stuck, ask Claude how to proceed"
+Thinking:
+- Goal: Get help from another LLM with current screen context
+- How: Use ask_llm with claude and send screenshot
+- Sequence: Screenshot + question -> Get answer
+Output:
+[
+  { "description": "Ask Claude for help with screenshot", "action": "ask_llm:claude|I'm stuck on this screen. What should I do next to accomplish my task?" }
 ]
 ## YOUR TASK
@@ -554,102 +656,377 @@ ${existingResult.output}`;
     case 'browse_and_ask': {
       // Format: browse_and_ask:site|question
+      // Using Playwright for reliable browser automation
       const [site, ...questionParts] = params.split('|');
       const question = questionParts.join('|');
-      // Site-specific URLs and response wait times
-      const sites: Record<string, { url: string; loadTime: number; responseTime: number }> = {
-        perplexity: { url: 'https://www.perplexity.ai', loadTime: 3, responseTime: 10 },
-        chatgpt: { url: 'https://chat.openai.com', loadTime: 4, responseTime: 15 },
-        claude: { url: 'https://claude.ai', loadTime: 4, responseTime: 15 },
-        google: { url: 'https://www.google.com', loadTime: 2, responseTime: 3 },
-        bing: { url: 'https://www.bing.com', loadTime: 2, responseTime: 3 },
-        bard: { url: 'https://bard.google.com', loadTime: 3, responseTime: 12 },
-        copilot: { url: 'https://copilot.microsoft.com', loadTime: 3, responseTime: 12 },
-      };
+      // Check if site is a supported AI chat
+      const supportedSites = ['perplexity', 'chatgpt', 'claude', 'copilot', 'google'];
+      const siteLower = site.toLowerCase();
-      const siteConfig = sites[site.toLowerCase()] || { url: `https://${site}`, loadTime: 3, responseTime: 10 };
+      if (supportedSites.includes(siteLower)) {
+        // Use Playwright's AI chat helper
+        const result = await browser.askAI(siteLower as any, question, true);
-      // Open the site
-      if (process.platform === 'win32') {
-        await runCommand(`start "" "${siteConfig.url}"`, 5000);
-      } else if (process.platform === 'darwin') {
-        await runCommand(`open "${siteConfig.url}"`, 5000);
+        // If response seems short, try getting full response by scrolling
+        if (result.response.length < 500) {
+          const fullParts = await browser.getFullAIResponse(siteLower as any, 5);
+          if (fullParts.length > 0) {
+            step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${fullParts.join('\n\n')}`;
+            break;
+          }
+        }
+        step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${result.response}`;
       } else {
-        await runCommand(`xdg-open "${siteConfig.url}"`, 5000);
+        // Generic site - open and type
+        await browser.navigateTo(`https://${site}`);
+        await sleep(2000);
+        // Try to find and fill any input
+        const page = await browser.getPage();
+        const inputs = ['textarea', 'input[type="text"]', 'input[type="search"]', '[contenteditable="true"]'];
+        for (const selector of inputs) {
+          if (await browser.elementExists(selector)) {
+            await browser.typeInElement(selector, question);
+            await browser.pressKey('Enter');
+            break;
+          }
+        }
+        await sleep(5000);
+        const pageText = await browser.getPageText();
+        step.result = `📝 Response from ${site}:\n\n${pageText.slice(0, 3000)}`;
       }
+      break;
+    }
-      // Wait for page to load
-      await sleep(siteConfig.loadTime * 1000);
+    case 'screenshot':
+      const vision = await describeScreen();
+      step.result = vision.description;
+      break;
-      // Type the question (most sites have autofocus on search/input)
-      await computer.typeText(question);
-      await sleep(300);
+    case 'web_search': {
+      // Use Playwright for reliable web search
+      const searchResults = await browser.webSearch(params, 'google');
-      // Press Enter to submit
-      await computer.pressKey('Return');
+      if (searchResults.length > 0) {
+        step.result = `🔍 Search results for "${params}":\n\n${searchResults.map((r, i) => `${i + 1}. ${r}`).join('\n')}`;
+      } else {
+        // Fallback: get page text
+        const pageText = await browser.getPageText();
+        step.result = `🔍 Search results for "${params}":\n\n${pageText.slice(0, 2000)}`;
+      }
+      break;
+    }
+    case 'send_email': {
+      // Use Playwright for reliable email sending
+      // Format: send_email:provider|to|subject|body
+      const [provider, to, subject, ...bodyParts] = params.split('|');
+      const body = bodyParts.join('|');
+      const emailData = { to, subject, body };
+      let success = false;
+      if (provider.toLowerCase() === 'gmail') {
+        success = await browser.sendGmail(emailData);
+      } else if (provider.toLowerCase() === 'outlook') {
+        success = await browser.sendOutlook(emailData);
+      } else {
+        throw new Error(`Unsupported email provider: ${provider}. Use gmail or outlook.`);
+      }
+      if (success) {
+        step.result = `📧 Email sent via ${provider} to ${to}`;
+      } else {
+        throw new Error(`Failed to send email via ${provider}. Make sure you're logged in.`);
+      }
+      break;
+    }
+    case 'google_sheets': {
+      // Use Playwright for Google Sheets
+      // Format: google_sheets:command|arg1|arg2...
+      const [sheetCmd, ...sheetArgs] = params.split('|');
+      switch (sheetCmd.toLowerCase()) {
+        case 'new': {
+          const sheetName = sheetArgs[0] || 'Untitled spreadsheet';
+          await browser.navigateTo('https://docs.google.com/spreadsheets/create');
+          await sleep(5000);
+          step.result = `📊 Created Google Sheet: ${sheetName}`;
+          break;
+        }
+        case 'type': {
+          const cell = sheetArgs[0] || 'A1';
+          const cellValue = sheetArgs.slice(1).join('|');
+          const success = await browser.googleSheetsType([{ cell, value: cellValue }]);
+          step.result = success
+            ? `📊 Typed "${cellValue}" in cell ${cell}`
+            : `📊 Could not type in cell ${cell}`;
+          break;
+        }
+        case 'read': {
+          const screenshot = await browser.takeScreenshot();
+          const analysis = await chat([{
+            role: 'user',
+            content: 'Describe the contents of this Google Sheet. List visible data in the cells.'
+          }]);
+          step.result = `📊 Current sheet view:\n${analysis.content}`;
+          break;
+        }
+        default:
+          throw new Error(`Unknown google_sheets command: ${sheetCmd}`);
+      }
+      break;
+    }
+    case 'google_docs': {
+      // Use Playwright for Google Docs
+      // Format: google_docs:command|arg1|arg2...
+      const [docCmd, ...docArgs] = params.split('|');
+      switch (docCmd.toLowerCase()) {
+        case 'new': {
+          const docName = docArgs[0] || 'Untitled document';
+          const success = await browser.googleDocsType('');
+          step.result = success
+            ? `📄 Created Google Doc: ${docName}`
+            : `📄 Could not create Google Doc`;
+          break;
+        }
+        case 'type': {
+          const docText = docArgs.join('|');
+          const success = await browser.googleDocsType(docText);
+          step.result = success
+            ? `📄 Typed content in Google Doc`
+            : `📄 Could not type in Google Doc`;
+          break;
+        }
+        default:
+          throw new Error(`Unknown google_docs command: ${docCmd}`);
+      }
+      break;
+    }
-      // Wait for AI to generate response
-      await sleep(siteConfig.responseTime * 1000);
+    case 'research': {
+      // Use Playwright for multi-step research
+      const researchQuery = params;
-      // Capture multiple screenshots by scrolling to get full response
-      const extractedParts: string[] = [];
-      const maxScrolls = 5; // Maximum number of scroll captures
+      // Use browser.research which handles search, clicking, gathering
+      const researchData = await browser.research(researchQuery, 3);
-      for (let scrollIndex = 0; scrollIndex < maxScrolls; scrollIndex++) {
-        // Capture current view
-        const screenResult = await describeScreen();
+      // Format sources
+      const sourceSummaries = researchData.sources.map((s, i) =>
+        `Source ${i + 1}: ${s.title}\n${s.content.slice(0, 500)}...`
+      ).join('\n\n');
-        // Ask AI to extract just the response text from what it sees
-        const extractPrompt = `You are looking at screenshot ${scrollIndex + 1} of ${site}. The user asked: "${question}"
+      // Ask AI to synthesize
+      const synthesis = await chat([{
+        role: 'user',
+        content: `Based on the following research gathered about "${researchQuery}", provide a comprehensive summary:
-Extract ONLY the AI's response/answer text visible on screen. Do NOT include:
-- The user's question
-- Any UI elements, buttons, navigation, or headers
-- Any disclaimers, suggestions, or "related questions"
-- Any "Sources" or citation links
-- Any text you already extracted (avoid duplicates)
+${sourceSummaries}
-${scrollIndex > 0 ? `Previous parts already extracted:\n${extractedParts.join('\n---\n')}\n\nOnly extract NEW text that continues from where we left off.` : ''}
+Create a well-organized summary with:
+1. Key findings
+2. Important details
+3. Any notable facts or statistics
+4. Conclusion
-Just give me the actual answer text, word for word as it appears. If there's no more response text visible, respond with exactly: "END_OF_RESPONSE"`;
+Be thorough but concise.`
+      }]);
-        const extractResponse = await chat([{ role: 'user', content: extractPrompt }]);
-        const extracted = extractResponse.content.trim();
+      step.result = `🔬 Research Summary: ${researchQuery}\n\n${synthesis.content}`;
+      break;
+    }
-        // Check if we've reached the end
-        if (extracted === 'END_OF_RESPONSE' || extracted.includes('END_OF_RESPONSE')) {
-          break;
+    case 'ask_llm': {
+      // Use Playwright to ask another LLM for help with a screenshot
+      // Format: ask_llm:llm_name|question
+      const [llmName, ...questionParts] = params.split('|');
+      const question = questionParts.join('|');
+      // Take screenshot first to describe current context
+      const currentScreen = await describeScreen();
+      // Compose the question with screen context
+      const fullQuestion = `I'm looking at my screen and I need help. ${question}\n\nHere's what I see on my screen: ${currentScreen.description}`;
+      // Supported LLMs
+      const supportedLLMs = ['perplexity', 'chatgpt', 'claude', 'copilot'];
+      const llmLower = llmName.toLowerCase();
+      if (!supportedLLMs.includes(llmLower)) {
+        throw new Error(`Unknown LLM: ${llmName}. Supported: ${supportedLLMs.join(', ')}`);
+      }
+      // Use Playwright's AI chat helper
+      const result = await browser.askAI(llmLower as any, fullQuestion, false);
+      // Get full response if needed
+      const fullParts = await browser.getFullAIResponse(llmLower as any, 3);
+      const finalResponse = fullParts.length > 0 ? fullParts.join('\n\n') : result.response;
+      step.result = `🤖 ${llmName} says:\n\n${finalResponse}`;
+      break;
+    }
+    case 'learn_ui': {
+      // Take screenshot and analyze the UI to learn how to interact
+      const uiScreen = await describeScreen();
+      const uiAnalysis = await chat([{
+        role: 'user',
+        content: `Analyze this screenshot and identify all interactive UI elements. List:
+1. All clickable buttons and their likely functions
+2. Text input fields
+3. Menus and dropdowns
+4. Links
+5. Any keyboard shortcuts visible
+6. The main actions available in this interface
+Question: ${params}
+Be specific about locations (top-left, center, etc.) and what each element does.`
+      }]);
+      step.result = `🔍 UI Analysis:\n\n${uiAnalysis.content}`;
+      break;
+    }
+    case 'adaptive_do': {
+      // Adaptive agent using Playwright: try to accomplish something, ask LLMs if stuck
+      const goal = params;
+      const maxAttempts = 5;
+      const actionHistory: string[] = [];
+      let accomplished = false;
+      // Initialize browser
+      const page = await browser.getPage();
+      for (let attempt = 0; attempt < maxAttempts && !accomplished; attempt++) {
+        // Take screenshot and analyze current state
+        const screenshot = await browser.takeScreenshot();
+        const currentState = await chat([{
+          role: 'user',
+          content: `Describe what you see on this screen. What app/website is it? What elements are visible?`
+        }]);
+        // Ask our AI what to do next
+        const nextAction = await chat([{
+          role: 'user',
+          content: `GOAL: ${goal}
+CURRENT SCREEN: ${currentState.content}
+PREVIOUS ACTIONS TAKEN:
+${actionHistory.length > 0 ? actionHistory.join('\n') : 'None yet'}
+Based on what you see, what's the SINGLE next action to take?
+Options:
+- click: Click element (describe CSS selector or visible text)
+- type: Type something (specify selector and text)
+- press: Press a key (specify key)
+- scroll: Scroll up/down
+- navigate: Go to URL
+- done: Goal is accomplished
+- stuck: Can't figure out what to do
+Respond in format:
+ACTION: <action_type>
+SELECTOR: <css selector or text to find>
+VALUE: <text to type or URL>
+REASONING: <why>`
+        }]);
+        const actionContent = nextAction.content;
+        // Parse the action
+        const actionMatch = actionContent.match(/ACTION:\s*(\w+)/i);
+        const selectorMatch = actionContent.match(/SELECTOR:\s*(.+?)(?:\n|$)/i);
+        const valueMatch = actionContent.match(/VALUE:\s*(.+?)(?:\n|$)/i);
+        if (!actionMatch) {
+          actionHistory.push(`Attempt ${attempt + 1}: Couldn't parse action`);
+          continue;
         }
-        // Check for "no response" indicators
-        if (extracted.toLowerCase().includes('response not ready') ||
-            extracted.toLowerCase().includes('no response visible') ||
-            extracted.toLowerCase().includes('no additional text')) {
-          if (scrollIndex === 0) {
-            extractedParts.push('Response not ready yet or page still loading.');
-          }
+        const action = actionMatch[1].toLowerCase();
+        const selector = selectorMatch?.[1]?.trim() || '';
+        const value = valueMatch?.[1]?.trim() || '';
+        if (action === 'done') {
+          accomplished = true;
+          actionHistory.push(`Attempt ${attempt + 1}: Goal accomplished!`);
           break;
         }
-        extractedParts.push(extracted);
+        if (action === 'stuck') {
+          // Ask Perplexity for help using Playwright
+          actionHistory.push(`Attempt ${attempt + 1}: Got stuck, asking Perplexity for help...`);
+          const helpRequest = `I'm trying to: ${goal}\n\nI'm stuck. What should I do next? Be specific about what to click or type.`;
+          const advice = await browser.askAI('perplexity', helpRequest, false);
+          actionHistory.push(`Got advice: ${advice.response.slice(0, 200)}...`);
+          // Navigate back to continue
+          await browser.navigateTo(page.url());
+          continue;
+        }
+        // Execute the action using Playwright
+        try {
+          switch (action) {
+            case 'click':
+              // Try to click by selector or text
+              if (selector) {
+                const clicked = await browser.clickElement(selector);
+                if (!clicked) {
+                  // Try by text
+                  await page.getByText(selector).first().click({ timeout: 5000 });
+                }
+              }
+              actionHistory.push(`Attempt ${attempt + 1}: Clicked "${selector}"`);
+              break;
+            case 'type':
+              if (selector && value) {
+                const typed = await browser.typeInElement(selector, value);
+                if (!typed) {
+                  await page.getByPlaceholder(selector).first().fill(value);
+                }
+              }
+              actionHistory.push(`Attempt ${attempt + 1}: Typed "${value}" in "${selector}"`);
+              break;
+            case 'press':
+              await browser.pressKey(value || selector);
+              actionHistory.push(`Attempt ${attempt + 1}: Pressed ${value || selector}`);
+              break;
+            case 'scroll':
+              await browser.scroll(value.toLowerCase().includes('up') ? 'up' : 'down');
+              actionHistory.push(`Attempt ${attempt + 1}: Scrolled ${value || 'down'}`);
+              break;
+            case 'navigate':
+              const url = value.startsWith('http') ? value : `https://${value}`;
+              await browser.navigateTo(url);
+              actionHistory.push(`Attempt ${attempt + 1}: Navigated to ${url}`);
+              break;
+            default:
+              actionHistory.push(`Attempt ${attempt + 1}: Unknown action ${action}`);
+          }
+        } catch (e) {
+          actionHistory.push(`Attempt ${attempt + 1}: Action failed - ${e}`);
+        }
-        // Scroll down to see more content
-        await computer.scrollMouse(-5); // Scroll down
-        await sleep(1000); // Wait for scroll animation
+        await sleep(2000); // Wait for UI to update
       }
-      // Combine all extracted parts
-      const fullResponse = extractedParts.join('\n\n');
-      step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${fullResponse}`;
+      step.result = `🎯 Adaptive Agent Result:\n\nGoal: ${goal}\nAccomplished: ${accomplished ? 'Yes ✅' : 'Partial/No ❌'}\n\nAction Log:\n${actionHistory.join('\n')}`;
       break;
     }
-    case 'screenshot':
-      const vision = await describeScreen();
-      step.result = vision.description;
-      break;
     case 'chat':
       // This is a fallback - just describe what user wants
       step.result = `Task noted: ${params}`;