@projectservan8n/cnapse 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/lib/tasks.ts CHANGED
@@ -4,11 +4,12 @@
4
4
  * Uses chain-of-thought prompting + learning from past tasks
5
5
  */
6
6
 
7
- import { chat, Message } from './api.js';
7
+ import { chat, chatWithVision, Message } from './api.js';
8
8
  import * as computer from '../tools/computer.js';
9
- import { describeScreen } from './vision.js';
9
+ import { describeScreen, captureScreenshot } from './vision.js';
10
10
  import * as filesystem from '../tools/filesystem.js';
11
11
  import { runCommand } from '../tools/shell.js';
12
+ import * as browser from '../services/browser.js';
12
13
  import * as fs from 'fs';
13
14
  import * as path from 'path';
14
15
  import * as os from 'os';
@@ -227,6 +228,12 @@ Before outputting steps, THINK through these questions:
227
228
  ### Research
228
229
  - research: Multi-step web research - searches, gathers info, summarizes (e.g., "research:What are the latest AI trends in 2024?")
229
230
 
231
+ ### Adaptive/Learning
232
+ - ask_llm: Ask another LLM for help with a screenshot (e.g., "ask_llm:perplexity|How do I do X in this app?")
233
+ - ask_llm: Supports: perplexity, chatgpt, claude, copilot - sends screenshot + question, gets answer
234
+ - adaptive_do: Try to accomplish something, if stuck ask LLMs for help (e.g., "adaptive_do:book a flight to NYC on kayak.com")
235
+ - learn_ui: Take screenshot and learn how to interact with current UI (e.g., "learn_ui:What buttons can I click here?")
236
+
230
237
  ### Utility
231
238
  - wait: Wait N seconds (e.g., "wait:2" - use 1-3s for app loads)
232
239
  - screenshot: Capture and describe screen
@@ -374,6 +381,39 @@ Output:
374
381
  { "description": "Type the content", "action": "google_docs:type|Project Status Report\n\nDate: Today\n\nSummary:\nThe project is on track. All milestones have been met.\n\nNext Steps:\n- Complete testing\n- Deploy to production" }
375
382
  ]
376
383
 
384
+ ### Example 12: "I don't know how to use this app, can you figure it out?"
385
+ Thinking:
386
+ - Goal: Learn the current UI and understand how to use it
387
+ - How: Use learn_ui to take screenshot and analyze
388
+ - Sequence: Screenshot -> AI analysis -> report back
389
+
390
+ Output:
391
+ [
392
+ { "description": "Analyze current UI", "action": "learn_ui:What are all the buttons, menus, and interactive elements I can use?" }
393
+ ]
394
+
395
+ ### Example 13: "book a hotel on booking.com for next weekend"
396
+ Thinking:
397
+ - Goal: Complex task on unfamiliar website - need adaptive approach
398
+ - How: Use adaptive_do which will try, and if stuck ask LLMs for help
399
+ - Sequence: Single adaptive action handles the complexity
400
+
401
+ Output:
402
+ [
403
+ { "description": "Adaptively book hotel", "action": "adaptive_do:Go to booking.com and book a hotel for next weekend" }
404
+ ]
405
+
406
+ ### Example 14: "I'm stuck, ask Claude how to proceed"
407
+ Thinking:
408
+ - Goal: Get help from another LLM with current screen context
409
+ - How: Use ask_llm with claude and send screenshot
410
+ - Sequence: Screenshot + question -> Get answer
411
+
412
+ Output:
413
+ [
414
+ { "description": "Ask Claude for help with screenshot", "action": "ask_llm:claude|I'm stuck on this screen. What should I do next to accomplish my task?" }
415
+ ]
416
+
377
417
  ## YOUR TASK
378
418
  Now parse this request: "${input}"
379
419
 
@@ -616,94 +656,49 @@ ${existingResult.output}`;
616
656
 
617
657
  case 'browse_and_ask': {
618
658
  // Format: browse_and_ask:site|question
659
+ // Using Playwright for reliable browser automation
619
660
  const [site, ...questionParts] = params.split('|');
620
661
  const question = questionParts.join('|');
621
662
 
622
- // Site-specific URLs and response wait times
623
- const sites: Record<string, { url: string; loadTime: number; responseTime: number }> = {
624
- perplexity: { url: 'https://www.perplexity.ai', loadTime: 3, responseTime: 10 },
625
- chatgpt: { url: 'https://chat.openai.com', loadTime: 4, responseTime: 15 },
626
- claude: { url: 'https://claude.ai', loadTime: 4, responseTime: 15 },
627
- google: { url: 'https://www.google.com', loadTime: 2, responseTime: 3 },
628
- bing: { url: 'https://www.bing.com', loadTime: 2, responseTime: 3 },
629
- bard: { url: 'https://bard.google.com', loadTime: 3, responseTime: 12 },
630
- copilot: { url: 'https://copilot.microsoft.com', loadTime: 3, responseTime: 12 },
631
- };
632
-
633
- const siteConfig = sites[site.toLowerCase()] || { url: `https://${site}`, loadTime: 3, responseTime: 10 };
634
-
635
- // Open the site
636
- if (process.platform === 'win32') {
637
- await runCommand(`start "" "${siteConfig.url}"`, 5000);
638
- } else if (process.platform === 'darwin') {
639
- await runCommand(`open "${siteConfig.url}"`, 5000);
640
- } else {
641
- await runCommand(`xdg-open "${siteConfig.url}"`, 5000);
642
- }
643
-
644
- // Wait for page to load
645
- await sleep(siteConfig.loadTime * 1000);
646
-
647
- // Type the question (most sites have autofocus on search/input)
648
- await computer.typeText(question);
649
- await sleep(300);
650
-
651
- // Press Enter to submit
652
- await computer.pressKey('Return');
653
-
654
- // Wait for AI to generate response
655
- await sleep(siteConfig.responseTime * 1000);
656
-
657
- // Capture multiple screenshots by scrolling to get full response
658
- const extractedParts: string[] = [];
659
- const maxScrolls = 5; // Maximum number of scroll captures
660
-
661
- for (let scrollIndex = 0; scrollIndex < maxScrolls; scrollIndex++) {
662
- // Capture current view
663
- const screenResult = await describeScreen();
664
-
665
- // Ask AI to extract just the response text from what it sees
666
- const extractPrompt = `You are looking at screenshot ${scrollIndex + 1} of ${site}. The user asked: "${question}"
663
+ // Check if site is a supported AI chat
664
+ const supportedSites = ['perplexity', 'chatgpt', 'claude', 'copilot', 'google'];
665
+ const siteLower = site.toLowerCase();
667
666
 
668
- Extract ONLY the AI's response/answer text visible on screen. Do NOT include:
669
- - The user's question
670
- - Any UI elements, buttons, navigation, or headers
671
- - Any disclaimers, suggestions, or "related questions"
672
- - Any "Sources" or citation links
673
- - Any text you already extracted (avoid duplicates)
667
+ if (supportedSites.includes(siteLower)) {
668
+ // Use Playwright's AI chat helper
669
+ const result = await browser.askAI(siteLower as any, question, true);
674
670
 
675
- ${scrollIndex > 0 ? `Previous parts already extracted:\n${extractedParts.join('\n---\n')}\n\nOnly extract NEW text that continues from where we left off.` : ''}
671
+ // If response seems short, try getting full response by scrolling
672
+ if (result.response.length < 500) {
673
+ const fullParts = await browser.getFullAIResponse(siteLower as any, 5);
674
+ if (fullParts.length > 0) {
675
+ step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${fullParts.join('\n\n')}`;
676
+ break;
677
+ }
678
+ }
676
679
 
677
- Just give me the actual answer text, word for word as it appears. If there's no more response text visible, respond with exactly: "END_OF_RESPONSE"`;
680
+ step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${result.response}`;
681
+ } else {
682
+ // Generic site - open and type
683
+ await browser.navigateTo(`https://${site}`);
684
+ await sleep(2000);
678
685
 
679
- const extractResponse = await chat([{ role: 'user', content: extractPrompt }]);
680
- const extracted = extractResponse.content.trim();
686
+ // Try to find and fill any input
687
+ const page = await browser.getPage();
688
+ const inputs = ['textarea', 'input[type="text"]', 'input[type="search"]', '[contenteditable="true"]'];
681
689
 
682
- // Check if we've reached the end
683
- if (extracted === 'END_OF_RESPONSE' || extracted.includes('END_OF_RESPONSE')) {
684
- break;
685
- }
686
-
687
- // Check for "no response" indicators
688
- if (extracted.toLowerCase().includes('response not ready') ||
689
- extracted.toLowerCase().includes('no response visible') ||
690
- extracted.toLowerCase().includes('no additional text')) {
691
- if (scrollIndex === 0) {
692
- extractedParts.push('Response not ready yet or page still loading.');
690
+ for (const selector of inputs) {
691
+ if (await browser.elementExists(selector)) {
692
+ await browser.typeInElement(selector, question);
693
+ await browser.pressKey('Enter');
694
+ break;
693
695
  }
694
- break;
695
696
  }
696
697
 
697
- extractedParts.push(extracted);
698
-
699
- // Scroll down to see more content
700
- await computer.scrollMouse(-5); // Scroll down
701
- await sleep(1000); // Wait for scroll animation
698
+ await sleep(5000);
699
+ const pageText = await browser.getPageText();
700
+ step.result = `📝 Response from ${site}:\n\n${pageText.slice(0, 3000)}`;
702
701
  }
703
-
704
- // Combine all extracted parts
705
- const fullResponse = extractedParts.join('\n\n');
706
- step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${fullResponse}`;
707
702
  break;
708
703
  }
709
704
 
@@ -713,181 +708,73 @@ Just give me the actual answer text, word for word as it appears. If there's no
713
708
  break;
714
709
 
715
710
  case 'web_search': {
716
- // Human-like Google search: open browser, go to google, type, search
717
- // Open browser with Win+R -> chrome/edge or just open google.com
718
- await computer.keyCombo(['meta', 'r']); // Win+R
719
- await sleep(500);
720
- await computer.typeText('chrome'); // Try Chrome first
721
- await computer.pressKey('Return');
722
- await sleep(2000);
723
-
724
- // Go to Google (Ctrl+L to focus address bar)
725
- await computer.keyCombo(['control', 'l']);
726
- await sleep(300);
727
- await computer.typeText('google.com');
728
- await computer.pressKey('Return');
729
- await sleep(2000);
711
+ // Use Playwright for reliable web search
712
+ const searchResults = await browser.webSearch(params, 'google');
730
713
 
731
- // Type search query (Google search box should be focused)
732
- await computer.typeText(params);
733
- await sleep(300);
734
- await computer.pressKey('Return');
735
- await sleep(3000); // Wait for results
736
-
737
- // Capture and extract search results
738
- const searchScreen = await describeScreen();
739
- const searchExtract = await chat([{
740
- role: 'user',
741
- content: `Extract the top search results from this Google search page. For each result, include:
742
- - Title
743
- - Brief snippet/description
744
- - URL if visible
745
-
746
- Format as a numbered list. Be concise.`
747
- }]);
748
-
749
- step.result = `🔍 Search results for "${params}":\n\n${searchExtract.content}`;
714
+ if (searchResults.length > 0) {
715
+ step.result = `🔍 Search results for "${params}":\n\n${searchResults.map((r, i) => `${i + 1}. ${r}`).join('\n')}`;
716
+ } else {
717
+ // Fallback: get page text
718
+ const pageText = await browser.getPageText();
719
+ step.result = `🔍 Search results for "${params}":\n\n${pageText.slice(0, 2000)}`;
720
+ }
750
721
  break;
751
722
  }
752
723
 
753
724
  case 'send_email': {
754
- // Human-like email: open browser, navigate to Gmail/Outlook, compose
725
+ // Use Playwright for reliable email sending
755
726
  // Format: send_email:provider|to|subject|body
756
727
  const [provider, to, subject, ...bodyParts] = params.split('|');
757
728
  const body = bodyParts.join('|');
758
729
 
759
- // Open browser
760
- await computer.keyCombo(['meta', 'r']); // Win+R
761
- await sleep(500);
762
- await computer.typeText('chrome');
763
- await computer.pressKey('Return');
764
- await sleep(2000);
765
-
766
- // Navigate to email service
767
- await computer.keyCombo(['control', 'l']); // Focus address bar
768
- await sleep(300);
730
+ const emailData = { to, subject, body };
769
731
 
732
+ let success = false;
770
733
  if (provider.toLowerCase() === 'gmail') {
771
- await computer.typeText('mail.google.com');
772
- await computer.pressKey('Return');
773
- await sleep(4000); // Wait for Gmail to load
774
-
775
- // Click Compose button (use keyboard shortcut 'c')
776
- await computer.typeText('c'); // Gmail shortcut for compose
777
- await sleep(2000); // Wait for compose window
778
-
779
- // Fill in fields
780
- await computer.typeText(to); // To field is focused
781
- await sleep(300);
782
- await computer.pressKey('Tab'); // Move to subject
783
- await sleep(200);
784
- await computer.typeText(subject);
785
- await sleep(300);
786
- await computer.pressKey('Tab'); // Move to body
787
- await sleep(200);
788
- await computer.typeText(body);
789
- await sleep(500);
790
-
791
- // Send with Ctrl+Enter
792
- await computer.keyCombo(['control', 'Return']);
793
-
734
+ success = await browser.sendGmail(emailData);
794
735
  } else if (provider.toLowerCase() === 'outlook') {
795
- await computer.typeText('outlook.live.com');
796
- await computer.pressKey('Return');
797
- await sleep(4000); // Wait for Outlook to load
798
-
799
- // Click New mail (use keyboard shortcut 'n')
800
- await computer.typeText('n'); // Outlook shortcut for new mail
801
- await sleep(2000);
802
-
803
- // Fill in fields
804
- await computer.typeText(to);
805
- await sleep(300);
806
- await computer.pressKey('Tab');
807
- await sleep(200);
808
- await computer.typeText(subject);
809
- await sleep(300);
810
- await computer.pressKey('Tab');
811
- await sleep(200);
812
- await computer.typeText(body);
813
- await sleep(500);
814
-
815
- // Send with Ctrl+Enter
816
- await computer.keyCombo(['control', 'Return']);
736
+ success = await browser.sendOutlook(emailData);
817
737
  } else {
818
738
  throw new Error(`Unsupported email provider: ${provider}. Use gmail or outlook.`);
819
739
  }
820
740
 
821
- await sleep(2000);
822
- step.result = `📧 Email sent via ${provider} to ${to}`;
741
+ if (success) {
742
+ step.result = `📧 Email sent via ${provider} to ${to}`;
743
+ } else {
744
+ throw new Error(`Failed to send email via ${provider}. Make sure you're logged in.`);
745
+ }
823
746
  break;
824
747
  }
825
748
 
826
749
  case 'google_sheets': {
827
- // Human-like: open browser, go to sheets, interact
750
+ // Use Playwright for Google Sheets
828
751
  // Format: google_sheets:command|arg1|arg2...
829
752
  const [sheetCmd, ...sheetArgs] = params.split('|');
830
753
 
831
754
  switch (sheetCmd.toLowerCase()) {
832
755
  case 'new': {
833
756
  const sheetName = sheetArgs[0] || 'Untitled spreadsheet';
834
-
835
- // Open browser and go to Google Sheets
836
- await computer.keyCombo(['meta', 'r']);
837
- await sleep(500);
838
- await computer.typeText('chrome');
839
- await computer.pressKey('Return');
840
- await sleep(2000);
841
-
842
- await computer.keyCombo(['control', 'l']);
843
- await sleep(300);
844
- await computer.typeText('sheets.google.com');
845
- await computer.pressKey('Return');
846
- await sleep(3000);
847
-
848
- // Click "Blank" to create new (or use keyboard)
849
- // Usually there's a + or Blank option, let's try clicking near top
850
- await computer.pressKey('Tab'); // Navigate
851
- await computer.pressKey('Tab');
852
- await computer.pressKey('Return'); // Create blank
853
- await sleep(3000);
854
-
855
- // Rename: click on title or use File > Rename
856
- await computer.keyCombo(['alt', 'f']); // File menu
857
- await sleep(500);
858
- await computer.typeText('r'); // Rename option
859
- await sleep(500);
860
- await computer.keyCombo(['control', 'a']); // Select all
861
- await computer.typeText(sheetName);
862
- await computer.pressKey('Return');
863
- await sleep(500);
864
- await computer.pressKey('Escape'); // Close any dialog
865
-
757
+ await browser.navigateTo('https://docs.google.com/spreadsheets/create');
758
+ await sleep(5000);
866
759
  step.result = `📊 Created Google Sheet: ${sheetName}`;
867
760
  break;
868
761
  }
869
762
  case 'type': {
870
763
  const cell = sheetArgs[0] || 'A1';
871
764
  const cellValue = sheetArgs.slice(1).join('|');
872
-
873
- // Navigate to cell using Ctrl+G or F5 (Go to)
874
- await computer.keyCombo(['control', 'g']); // Go to cell dialog
875
- await sleep(500);
876
- await computer.typeText(cell);
877
- await computer.pressKey('Return');
878
- await sleep(300);
879
-
880
- // Type the value
881
- await computer.typeText(cellValue);
882
- await computer.pressKey('Return'); // Confirm and move down
883
- await sleep(200);
884
-
885
- step.result = `📊 Typed "${cellValue}" in cell ${cell}`;
765
+ const success = await browser.googleSheetsType([{ cell, value: cellValue }]);
766
+ step.result = success
767
+ ? `📊 Typed "${cellValue}" in cell ${cell}`
768
+ : `📊 Could not type in cell ${cell}`;
886
769
  break;
887
770
  }
888
771
  case 'read': {
889
- const readScreen = await describeScreen();
890
- step.result = `📊 Current sheet view:\n${readScreen.description}`;
772
+ const screenshot = await browser.takeScreenshot();
773
+ const analysis = await chat([{
774
+ role: 'user',
775
+ content: 'Describe the contents of this Google Sheet. List visible data in the cells.'
776
+ }]);
777
+ step.result = `📊 Current sheet view:\n${analysis.content}`;
891
778
  break;
892
779
  }
893
780
  default:
@@ -897,52 +784,25 @@ Format as a numbered list. Be concise.`
897
784
  }
898
785
 
899
786
  case 'google_docs': {
900
- // Human-like: open browser, go to docs, interact
787
+ // Use Playwright for Google Docs
901
788
  // Format: google_docs:command|arg1|arg2...
902
789
  const [docCmd, ...docArgs] = params.split('|');
903
790
 
904
791
  switch (docCmd.toLowerCase()) {
905
792
  case 'new': {
906
793
  const docName = docArgs[0] || 'Untitled document';
907
-
908
- // Open browser and go to Google Docs
909
- await computer.keyCombo(['meta', 'r']);
910
- await sleep(500);
911
- await computer.typeText('chrome');
912
- await computer.pressKey('Return');
913
- await sleep(2000);
914
-
915
- await computer.keyCombo(['control', 'l']);
916
- await sleep(300);
917
- await computer.typeText('docs.google.com');
918
- await computer.pressKey('Return');
919
- await sleep(3000);
920
-
921
- // Click "Blank" to create new
922
- await computer.pressKey('Tab');
923
- await computer.pressKey('Tab');
924
- await computer.pressKey('Return');
925
- await sleep(3000);
926
-
927
- // Rename using File > Rename
928
- await computer.keyCombo(['alt', 'f']); // File menu
929
- await sleep(500);
930
- await computer.typeText('r'); // Rename
931
- await sleep(500);
932
- await computer.keyCombo(['control', 'a']); // Select all
933
- await computer.typeText(docName);
934
- await computer.pressKey('Return');
935
- await sleep(500);
936
- await computer.pressKey('Escape'); // Close dialog, focus doc
937
-
938
- step.result = `📄 Created Google Doc: ${docName}`;
794
+ const success = await browser.googleDocsType('');
795
+ step.result = success
796
+ ? `📄 Created Google Doc: ${docName}`
797
+ : `📄 Could not create Google Doc`;
939
798
  break;
940
799
  }
941
800
  case 'type': {
942
801
  const docText = docArgs.join('|');
943
- // Just type - cursor should be in document
944
- await computer.typeText(docText);
945
- step.result = `📄 Typed content in Google Doc`;
802
+ const success = await browser.googleDocsType(docText);
803
+ step.result = success
804
+ ? `📄 Typed content in Google Doc`
805
+ : `📄 Could not type in Google Doc`;
946
806
  break;
947
807
  }
948
808
  default:
@@ -952,83 +812,23 @@ Format as a numbered list. Be concise.`
952
812
  }
953
813
 
954
814
  case 'research': {
955
- // Human-like multi-step research: open browser, search, click results, gather info
815
+ // Use Playwright for multi-step research
956
816
  const researchQuery = params;
957
- const researchResults: string[] = [];
958
817
 
959
- // Step 1: Open browser and go to Google
960
- await computer.keyCombo(['meta', 'r']); // Win+R
961
- await sleep(500);
962
- await computer.typeText('chrome');
963
- await computer.pressKey('Return');
964
- await sleep(2000);
965
-
966
- await computer.keyCombo(['control', 'l']); // Focus address bar
967
- await sleep(300);
968
- await computer.typeText('google.com');
969
- await computer.pressKey('Return');
970
- await sleep(2000);
818
+ // Use browser.research which handles search, clicking, gathering
819
+ const researchData = await browser.research(researchQuery, 3);
971
820
 
972
- // Type search query
973
- await computer.typeText(researchQuery);
974
- await computer.pressKey('Return');
975
- await sleep(3000);
821
+ // Format sources
822
+ const sourceSummaries = researchData.sources.map((s, i) =>
823
+ `Source ${i + 1}: ${s.title}\n${s.content.slice(0, 500)}...`
824
+ ).join('\n\n');
976
825
 
977
- // Capture initial search results
978
- let searchScreen = await describeScreen();
979
- const initialResults = await chat([{
980
- role: 'user',
981
- content: `Extract the key information from these Google search results about: "${researchQuery}"
982
- Include any relevant facts, numbers, dates, or key points visible. Be thorough but concise.`
983
- }]);
984
- researchResults.push(`Search Results:\n${initialResults.content}`);
985
-
986
- // Step 2: Click on first result (Tab to navigate, Enter to click)
987
- await computer.pressKey('Tab');
988
- await sleep(200);
989
- await computer.pressKey('Tab');
990
- await sleep(200);
991
- await computer.pressKey('Return'); // Click first result
992
- await sleep(4000); // Wait for page load
993
-
994
- // Extract content from the page
995
- searchScreen = await describeScreen();
996
- const pageContent = await chat([{
997
- role: 'user',
998
- content: `Extract the main content and key information from this webpage about: "${researchQuery}"
999
- Ignore ads, navigation, footers. Focus on the actual article/content.`
1000
- }]);
1001
- researchResults.push(`\nSource 1 Content:\n${pageContent.content}`);
1002
-
1003
- // Step 3: Go back (Alt+Left) and check another source
1004
- await computer.keyCombo(['alt', 'Left']); // Browser back
1005
- await sleep(2000);
1006
-
1007
- // Scroll down a bit to see more results
1008
- await computer.scrollMouse(-3);
1009
- await sleep(500);
1010
-
1011
- // Navigate to second result
1012
- await computer.pressKey('Tab');
1013
- await computer.pressKey('Tab');
1014
- await computer.pressKey('Tab');
1015
- await computer.pressKey('Return');
1016
- await sleep(4000);
1017
-
1018
- searchScreen = await describeScreen();
1019
- const pageContent2 = await chat([{
1020
- role: 'user',
1021
- content: `Extract additional information from this webpage about: "${researchQuery}"
1022
- Look for details not covered in the previous source.`
1023
- }]);
1024
- researchResults.push(`\nSource 2 Content:\n${pageContent2.content}`);
1025
-
1026
- // Step 4: Synthesize all gathered information
826
+ // Ask AI to synthesize
1027
827
  const synthesis = await chat([{
1028
828
  role: 'user',
1029
829
  content: `Based on the following research gathered about "${researchQuery}", provide a comprehensive summary:
1030
830
 
1031
- ${researchResults.join('\n\n')}
831
+ ${sourceSummaries}
1032
832
 
1033
833
  Create a well-organized summary with:
1034
834
  1. Key findings
@@ -1043,6 +843,190 @@ Be thorough but concise.`
1043
843
  break;
1044
844
  }
1045
845
 
846
+ case 'ask_llm': {
847
+ // Use Playwright to ask another LLM for help with a screenshot
848
+ // Format: ask_llm:llm_name|question
849
+ const [llmName, ...questionParts] = params.split('|');
850
+ const question = questionParts.join('|');
851
+
852
+ // Take screenshot first to describe current context
853
+ const currentScreen = await describeScreen();
854
+
855
+ // Compose the question with screen context
856
+ const fullQuestion = `I'm looking at my screen and I need help. ${question}\n\nHere's what I see on my screen: ${currentScreen.description}`;
857
+
858
+ // Supported LLMs
859
+ const supportedLLMs = ['perplexity', 'chatgpt', 'claude', 'copilot'];
860
+ const llmLower = llmName.toLowerCase();
861
+
862
+ if (!supportedLLMs.includes(llmLower)) {
863
+ throw new Error(`Unknown LLM: ${llmName}. Supported: ${supportedLLMs.join(', ')}`);
864
+ }
865
+
866
+ // Use Playwright's AI chat helper
867
+ const result = await browser.askAI(llmLower as any, fullQuestion, false);
868
+
869
+ // Get full response if needed
870
+ const fullParts = await browser.getFullAIResponse(llmLower as any, 3);
871
+ const finalResponse = fullParts.length > 0 ? fullParts.join('\n\n') : result.response;
872
+
873
+ step.result = `🤖 ${llmName} says:\n\n${finalResponse}`;
874
+ break;
875
+ }
876
+
877
+ case 'learn_ui': {
878
+ // Take screenshot and analyze the UI to learn how to interact
879
+ const uiScreen = await describeScreen();
880
+
881
+ const uiAnalysis = await chat([{
882
+ role: 'user',
883
+ content: `Analyze this screenshot and identify all interactive UI elements. List:
884
+ 1. All clickable buttons and their likely functions
885
+ 2. Text input fields
886
+ 3. Menus and dropdowns
887
+ 4. Links
888
+ 5. Any keyboard shortcuts visible
889
+ 6. The main actions available in this interface
890
+
891
+ Question: ${params}
892
+
893
+ Be specific about locations (top-left, center, etc.) and what each element does.`
894
+ }]);
895
+
896
+ step.result = `🔍 UI Analysis:\n\n${uiAnalysis.content}`;
897
+ break;
898
+ }
899
+
900
+ case 'adaptive_do': {
901
+ // Adaptive agent using Playwright: try to accomplish something, ask LLMs if stuck
902
+ const goal = params;
903
+ const maxAttempts = 5;
904
+ const actionHistory: string[] = [];
905
+ let accomplished = false;
906
+
907
+ // Initialize browser
908
+ const page = await browser.getPage();
909
+
910
+ for (let attempt = 0; attempt < maxAttempts && !accomplished; attempt++) {
911
+ // Take screenshot and analyze current state
912
+ const screenshot = await browser.takeScreenshot();
913
+ const currentState = await chat([{
914
+ role: 'user',
915
+ content: `Describe what you see on this screen. What app/website is it? What elements are visible?`
916
+ }]);
917
+
918
+ // Ask our AI what to do next
919
+ const nextAction = await chat([{
920
+ role: 'user',
921
+ content: `GOAL: ${goal}
922
+
923
+ CURRENT SCREEN: ${currentState.content}
924
+
925
+ PREVIOUS ACTIONS TAKEN:
926
+ ${actionHistory.length > 0 ? actionHistory.join('\n') : 'None yet'}
927
+
928
+ Based on what you see, what's the SINGLE next action to take?
929
+ Options:
930
+ - click: Click element (describe CSS selector or visible text)
931
+ - type: Type something (specify selector and text)
932
+ - press: Press a key (specify key)
933
+ - scroll: Scroll up/down
934
+ - navigate: Go to URL
935
+ - done: Goal is accomplished
936
+ - stuck: Can't figure out what to do
937
+
938
+ Respond in format:
939
+ ACTION: <action_type>
940
+ SELECTOR: <css selector or text to find>
941
+ VALUE: <text to type or URL>
942
+ REASONING: <why>`
943
+ }]);
944
+
945
+ const actionContent = nextAction.content;
946
+
947
+ // Parse the action
948
+ const actionMatch = actionContent.match(/ACTION:\s*(\w+)/i);
949
+ const selectorMatch = actionContent.match(/SELECTOR:\s*(.+?)(?:\n|$)/i);
950
+ const valueMatch = actionContent.match(/VALUE:\s*(.+?)(?:\n|$)/i);
951
+
952
+ if (!actionMatch) {
953
+ actionHistory.push(`Attempt ${attempt + 1}: Couldn't parse action`);
954
+ continue;
955
+ }
956
+
957
+ const action = actionMatch[1].toLowerCase();
958
+ const selector = selectorMatch?.[1]?.trim() || '';
959
+ const value = valueMatch?.[1]?.trim() || '';
960
+
961
+ if (action === 'done') {
962
+ accomplished = true;
963
+ actionHistory.push(`Attempt ${attempt + 1}: Goal accomplished!`);
964
+ break;
965
+ }
966
+
967
+ if (action === 'stuck') {
968
+ // Ask Perplexity for help using Playwright
969
+ actionHistory.push(`Attempt ${attempt + 1}: Got stuck, asking Perplexity for help...`);
970
+
971
+ const helpRequest = `I'm trying to: ${goal}\n\nI'm stuck. What should I do next? Be specific about what to click or type.`;
972
+ const advice = await browser.askAI('perplexity', helpRequest, false);
973
+ actionHistory.push(`Got advice: ${advice.response.slice(0, 200)}...`);
974
+
975
+ // Navigate back to continue
976
+ await browser.navigateTo(page.url());
977
+ continue;
978
+ }
979
+
980
+ // Execute the action using Playwright
981
+ try {
982
+ switch (action) {
983
+ case 'click':
984
+ // Try to click by selector or text
985
+ if (selector) {
986
+ const clicked = await browser.clickElement(selector);
987
+ if (!clicked) {
988
+ // Try by text
989
+ await page.getByText(selector).first().click({ timeout: 5000 });
990
+ }
991
+ }
992
+ actionHistory.push(`Attempt ${attempt + 1}: Clicked "${selector}"`);
993
+ break;
994
+ case 'type':
995
+ if (selector && value) {
996
+ const typed = await browser.typeInElement(selector, value);
997
+ if (!typed) {
998
+ await page.getByPlaceholder(selector).first().fill(value);
999
+ }
1000
+ }
1001
+ actionHistory.push(`Attempt ${attempt + 1}: Typed "${value}" in "${selector}"`);
1002
+ break;
1003
+ case 'press':
1004
+ await browser.pressKey(value || selector);
1005
+ actionHistory.push(`Attempt ${attempt + 1}: Pressed ${value || selector}`);
1006
+ break;
1007
+ case 'scroll':
1008
+ await browser.scroll(value.toLowerCase().includes('up') ? 'up' : 'down');
1009
+ actionHistory.push(`Attempt ${attempt + 1}: Scrolled ${value || 'down'}`);
1010
+ break;
1011
+ case 'navigate':
1012
+ const url = value.startsWith('http') ? value : `https://${value}`;
1013
+ await browser.navigateTo(url);
1014
+ actionHistory.push(`Attempt ${attempt + 1}: Navigated to ${url}`);
1015
+ break;
1016
+ default:
1017
+ actionHistory.push(`Attempt ${attempt + 1}: Unknown action ${action}`);
1018
+ }
1019
+ } catch (e) {
1020
+ actionHistory.push(`Attempt ${attempt + 1}: Action failed - ${e}`);
1021
+ }
1022
+
1023
+ await sleep(2000); // Wait for UI to update
1024
+ }
1025
+
1026
+ step.result = `🎯 Adaptive Agent Result:\n\nGoal: ${goal}\nAccomplished: ${accomplished ? 'Yes ✅' : 'Partial/No ❌'}\n\nAction Log:\n${actionHistory.join('\n')}`;
1027
+ break;
1028
+ }
1029
+
1046
1030
  case 'chat':
1047
1031
  // This is a fallback - just describe what user wants
1048
1032
  step.result = `Task noted: ${params}`;