@projectservan8n/cnapse 0.6.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ProviderSelector-MXRZFAOB.js +6 -0
- package/dist/chunk-OPX7FFL6.js +391 -0
- package/dist/index.js +882 -525
- package/package.json +17 -16
- package/src/agents/executor.ts +20 -13
- package/src/index.tsx +32 -6
- package/src/lib/tasks.ts +451 -74
- package/src/services/browser.ts +669 -0
- package/src/tools/index.ts +0 -1
- package/dist/ConfigUI-I2CJVODT.js +0 -305
- package/dist/Setup-KGYXCA7Y.js +0 -177
- package/dist/chunk-COKO6V5J.js +0 -50
- package/src/components/ConfigUI.tsx +0 -352
- package/src/components/Setup.tsx +0 -202
- package/src/lib/screen.ts +0 -118
- package/src/tools/vision.ts +0 -65
package/src/lib/tasks.ts
CHANGED
|
@@ -4,11 +4,12 @@
|
|
|
4
4
|
* Uses chain-of-thought prompting + learning from past tasks
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
|
-
import { chat, Message } from './api.js';
|
|
7
|
+
import { chat, chatWithVision, Message } from './api.js';
|
|
8
8
|
import * as computer from '../tools/computer.js';
|
|
9
|
-
import { describeScreen } from './vision.js';
|
|
9
|
+
import { describeScreen, captureScreenshot } from './vision.js';
|
|
10
10
|
import * as filesystem from '../tools/filesystem.js';
|
|
11
11
|
import { runCommand } from '../tools/shell.js';
|
|
12
|
+
import * as browser from '../services/browser.js';
|
|
12
13
|
import * as fs from 'fs';
|
|
13
14
|
import * as path from 'path';
|
|
14
15
|
import * as os from 'os';
|
|
@@ -211,7 +212,27 @@ Before outputting steps, THINK through these questions:
|
|
|
211
212
|
### Web Browsing
|
|
212
213
|
- open_url: Open URL in default browser (e.g., "open_url:https://perplexity.ai")
|
|
213
214
|
- browse_and_ask: Open AI website, type question, wait for response (e.g., "browse_and_ask:perplexity|What is the capital of France?")
|
|
214
|
-
- browse_and_ask: Supports: perplexity, chatgpt, claude, google
|
|
215
|
+
- browse_and_ask: Supports: perplexity, chatgpt, claude, google, copilot, bard
|
|
216
|
+
- web_search: Search Google and extract results (e.g., "web_search:best restaurants in NYC")
|
|
217
|
+
|
|
218
|
+
### Email
|
|
219
|
+
- send_email: Send email via Gmail or Outlook web (e.g., "send_email:gmail|to@email.com|Subject|Body text here")
|
|
220
|
+
- send_email: Supports: gmail, outlook
|
|
221
|
+
|
|
222
|
+
### Google Apps (via browser)
|
|
223
|
+
- google_sheets: Interact with Google Sheets (e.g., "google_sheets:new|My Spreadsheet" or "google_sheets:type|A1|Hello World")
|
|
224
|
+
- google_sheets: Commands: new (create), open (open existing), type (type in cell), read (screenshot current view)
|
|
225
|
+
- google_docs: Interact with Google Docs (e.g., "google_docs:new|My Document" or "google_docs:type|Hello World")
|
|
226
|
+
- google_docs: Commands: new (create), open (open existing), type (type text)
|
|
227
|
+
|
|
228
|
+
### Research
|
|
229
|
+
- research: Multi-step web research - searches, gathers info, summarizes (e.g., "research:What are the latest AI trends in 2024?")
|
|
230
|
+
|
|
231
|
+
### Adaptive/Learning
|
|
232
|
+
- ask_llm: Ask another LLM for help with a screenshot (e.g., "ask_llm:perplexity|How do I do X in this app?")
|
|
233
|
+
- ask_llm: Supports: perplexity, chatgpt, claude, copilot - sends screenshot + question, gets answer
|
|
234
|
+
- adaptive_do: Try to accomplish something, if stuck ask LLMs for help (e.g., "adaptive_do:book a flight to NYC on kayak.com")
|
|
235
|
+
- learn_ui: Take screenshot and learn how to interact with current UI (e.g., "learn_ui:What buttons can I click here?")
|
|
215
236
|
|
|
216
237
|
### Utility
|
|
217
238
|
- wait: Wait N seconds (e.g., "wait:2" - use 1-3s for app loads)
|
|
@@ -302,14 +323,95 @@ Output:
|
|
|
302
323
|
### Example 7: "search google for weather today"
|
|
303
324
|
Thinking:
|
|
304
325
|
- Goal: Open Google and search for something
|
|
305
|
-
- How: Use
|
|
306
|
-
- Sequence:
|
|
326
|
+
- How: Use web_search for quick results extraction
|
|
327
|
+
- Sequence: Search and get results
|
|
307
328
|
|
|
308
329
|
Output:
|
|
309
330
|
[
|
|
310
|
-
{ "description": "Search Google", "action": "
|
|
311
|
-
|
|
312
|
-
|
|
331
|
+
{ "description": "Search Google for weather", "action": "web_search:weather today" }
|
|
332
|
+
]
|
|
333
|
+
|
|
334
|
+
### Example 8: "send an email to john@example.com about the meeting tomorrow"
|
|
335
|
+
Thinking:
|
|
336
|
+
- Goal: Compose and send an email via Gmail
|
|
337
|
+
- How: Use send_email with gmail, recipient, subject, body
|
|
338
|
+
- Sequence: Open Gmail, compose, fill fields, send
|
|
339
|
+
|
|
340
|
+
Output:
|
|
341
|
+
[
|
|
342
|
+
{ "description": "Send email via Gmail", "action": "send_email:gmail|john@example.com|Meeting Tomorrow|Hi John, this is a reminder about our meeting tomorrow. Please let me know if you have any questions." }
|
|
343
|
+
]
|
|
344
|
+
|
|
345
|
+
### Example 9: "create a new google sheet called Sales Report and add headers"
|
|
346
|
+
Thinking:
|
|
347
|
+
- Goal: Create a new Google Sheet and add content
|
|
348
|
+
- How: Use google_sheets to create new, then type in cells
|
|
349
|
+
- Sequence: Create sheet -> Navigate to cells -> Type headers
|
|
350
|
+
|
|
351
|
+
Output:
|
|
352
|
+
[
|
|
353
|
+
{ "description": "Create new Google Sheet", "action": "google_sheets:new|Sales Report" },
|
|
354
|
+
{ "description": "Wait for sheet to load", "action": "wait:3" },
|
|
355
|
+
{ "description": "Type header in A1", "action": "google_sheets:type|A1|Product" },
|
|
356
|
+
{ "description": "Type header in B1", "action": "google_sheets:type|B1|Quantity" },
|
|
357
|
+
{ "description": "Type header in C1", "action": "google_sheets:type|C1|Price" }
|
|
358
|
+
]
|
|
359
|
+
|
|
360
|
+
### Example 10: "research the latest news about AI regulations"
|
|
361
|
+
Thinking:
|
|
362
|
+
- Goal: Do multi-step research on a topic
|
|
363
|
+
- How: Use research action which handles searching, gathering, summarizing
|
|
364
|
+
- Sequence: Single research action does it all
|
|
365
|
+
|
|
366
|
+
Output:
|
|
367
|
+
[
|
|
368
|
+
{ "description": "Research AI regulations news", "action": "research:latest news about AI regulations 2024" }
|
|
369
|
+
]
|
|
370
|
+
|
|
371
|
+
### Example 11: "write a document in google docs about project status"
|
|
372
|
+
Thinking:
|
|
373
|
+
- Goal: Create a Google Doc and write content
|
|
374
|
+
- How: Use google_docs to create and type
|
|
375
|
+
- Sequence: Create doc -> Type content
|
|
376
|
+
|
|
377
|
+
Output:
|
|
378
|
+
[
|
|
379
|
+
{ "description": "Create new Google Doc", "action": "google_docs:new|Project Status Report" },
|
|
380
|
+
{ "description": "Wait for doc to load", "action": "wait:3" },
|
|
381
|
+
{ "description": "Type the content", "action": "google_docs:type|Project Status Report\n\nDate: Today\n\nSummary:\nThe project is on track. All milestones have been met.\n\nNext Steps:\n- Complete testing\n- Deploy to production" }
|
|
382
|
+
]
|
|
383
|
+
|
|
384
|
+
### Example 12: "I don't know how to use this app, can you figure it out?"
|
|
385
|
+
Thinking:
|
|
386
|
+
- Goal: Learn the current UI and understand how to use it
|
|
387
|
+
- How: Use learn_ui to take screenshot and analyze
|
|
388
|
+
- Sequence: Screenshot -> AI analysis -> report back
|
|
389
|
+
|
|
390
|
+
Output:
|
|
391
|
+
[
|
|
392
|
+
{ "description": "Analyze current UI", "action": "learn_ui:What are all the buttons, menus, and interactive elements I can use?" }
|
|
393
|
+
]
|
|
394
|
+
|
|
395
|
+
### Example 13: "book a hotel on booking.com for next weekend"
|
|
396
|
+
Thinking:
|
|
397
|
+
- Goal: Complex task on unfamiliar website - need adaptive approach
|
|
398
|
+
- How: Use adaptive_do which will try, and if stuck ask LLMs for help
|
|
399
|
+
- Sequence: Single adaptive action handles the complexity
|
|
400
|
+
|
|
401
|
+
Output:
|
|
402
|
+
[
|
|
403
|
+
{ "description": "Adaptively book hotel", "action": "adaptive_do:Go to booking.com and book a hotel for next weekend" }
|
|
404
|
+
]
|
|
405
|
+
|
|
406
|
+
### Example 14: "I'm stuck, ask Claude how to proceed"
|
|
407
|
+
Thinking:
|
|
408
|
+
- Goal: Get help from another LLM with current screen context
|
|
409
|
+
- How: Use ask_llm with claude and send screenshot
|
|
410
|
+
- Sequence: Screenshot + question -> Get answer
|
|
411
|
+
|
|
412
|
+
Output:
|
|
413
|
+
[
|
|
414
|
+
{ "description": "Ask Claude for help with screenshot", "action": "ask_llm:claude|I'm stuck on this screen. What should I do next to accomplish my task?" }
|
|
313
415
|
]
|
|
314
416
|
|
|
315
417
|
## YOUR TASK
|
|
@@ -554,102 +656,377 @@ ${existingResult.output}`;
|
|
|
554
656
|
|
|
555
657
|
case 'browse_and_ask': {
|
|
556
658
|
// Format: browse_and_ask:site|question
|
|
659
|
+
// Using Playwright for reliable browser automation
|
|
557
660
|
const [site, ...questionParts] = params.split('|');
|
|
558
661
|
const question = questionParts.join('|');
|
|
559
662
|
|
|
560
|
-
//
|
|
561
|
-
const
|
|
562
|
-
|
|
563
|
-
chatgpt: { url: 'https://chat.openai.com', loadTime: 4, responseTime: 15 },
|
|
564
|
-
claude: { url: 'https://claude.ai', loadTime: 4, responseTime: 15 },
|
|
565
|
-
google: { url: 'https://www.google.com', loadTime: 2, responseTime: 3 },
|
|
566
|
-
bing: { url: 'https://www.bing.com', loadTime: 2, responseTime: 3 },
|
|
567
|
-
bard: { url: 'https://bard.google.com', loadTime: 3, responseTime: 12 },
|
|
568
|
-
copilot: { url: 'https://copilot.microsoft.com', loadTime: 3, responseTime: 12 },
|
|
569
|
-
};
|
|
663
|
+
// Check if site is a supported AI chat
|
|
664
|
+
const supportedSites = ['perplexity', 'chatgpt', 'claude', 'copilot', 'google'];
|
|
665
|
+
const siteLower = site.toLowerCase();
|
|
570
666
|
|
|
571
|
-
|
|
667
|
+
if (supportedSites.includes(siteLower)) {
|
|
668
|
+
// Use Playwright's AI chat helper
|
|
669
|
+
const result = await browser.askAI(siteLower as any, question, true);
|
|
572
670
|
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
671
|
+
// If response seems short, try getting full response by scrolling
|
|
672
|
+
if (result.response.length < 500) {
|
|
673
|
+
const fullParts = await browser.getFullAIResponse(siteLower as any, 5);
|
|
674
|
+
if (fullParts.length > 0) {
|
|
675
|
+
step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${fullParts.join('\n\n')}`;
|
|
676
|
+
break;
|
|
677
|
+
}
|
|
678
|
+
}
|
|
679
|
+
|
|
680
|
+
step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${result.response}`;
|
|
578
681
|
} else {
|
|
579
|
-
|
|
682
|
+
// Generic site - open and type
|
|
683
|
+
await browser.navigateTo(`https://${site}`);
|
|
684
|
+
await sleep(2000);
|
|
685
|
+
|
|
686
|
+
// Try to find and fill any input
|
|
687
|
+
const page = await browser.getPage();
|
|
688
|
+
const inputs = ['textarea', 'input[type="text"]', 'input[type="search"]', '[contenteditable="true"]'];
|
|
689
|
+
|
|
690
|
+
for (const selector of inputs) {
|
|
691
|
+
if (await browser.elementExists(selector)) {
|
|
692
|
+
await browser.typeInElement(selector, question);
|
|
693
|
+
await browser.pressKey('Enter');
|
|
694
|
+
break;
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
|
|
698
|
+
await sleep(5000);
|
|
699
|
+
const pageText = await browser.getPageText();
|
|
700
|
+
step.result = `📝 Response from ${site}:\n\n${pageText.slice(0, 3000)}`;
|
|
580
701
|
}
|
|
702
|
+
break;
|
|
703
|
+
}
|
|
581
704
|
|
|
582
|
-
|
|
583
|
-
await
|
|
705
|
+
case 'screenshot':
|
|
706
|
+
const vision = await describeScreen();
|
|
707
|
+
step.result = vision.description;
|
|
708
|
+
break;
|
|
584
709
|
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
await
|
|
710
|
+
case 'web_search': {
|
|
711
|
+
// Use Playwright for reliable web search
|
|
712
|
+
const searchResults = await browser.webSearch(params, 'google');
|
|
588
713
|
|
|
589
|
-
|
|
590
|
-
|
|
714
|
+
if (searchResults.length > 0) {
|
|
715
|
+
step.result = `🔍 Search results for "${params}":\n\n${searchResults.map((r, i) => `${i + 1}. ${r}`).join('\n')}`;
|
|
716
|
+
} else {
|
|
717
|
+
// Fallback: get page text
|
|
718
|
+
const pageText = await browser.getPageText();
|
|
719
|
+
step.result = `🔍 Search results for "${params}":\n\n${pageText.slice(0, 2000)}`;
|
|
720
|
+
}
|
|
721
|
+
break;
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
case 'send_email': {
|
|
725
|
+
// Use Playwright for reliable email sending
|
|
726
|
+
// Format: send_email:provider|to|subject|body
|
|
727
|
+
const [provider, to, subject, ...bodyParts] = params.split('|');
|
|
728
|
+
const body = bodyParts.join('|');
|
|
729
|
+
|
|
730
|
+
const emailData = { to, subject, body };
|
|
731
|
+
|
|
732
|
+
let success = false;
|
|
733
|
+
if (provider.toLowerCase() === 'gmail') {
|
|
734
|
+
success = await browser.sendGmail(emailData);
|
|
735
|
+
} else if (provider.toLowerCase() === 'outlook') {
|
|
736
|
+
success = await browser.sendOutlook(emailData);
|
|
737
|
+
} else {
|
|
738
|
+
throw new Error(`Unsupported email provider: ${provider}. Use gmail or outlook.`);
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
if (success) {
|
|
742
|
+
step.result = `📧 Email sent via ${provider} to ${to}`;
|
|
743
|
+
} else {
|
|
744
|
+
throw new Error(`Failed to send email via ${provider}. Make sure you're logged in.`);
|
|
745
|
+
}
|
|
746
|
+
break;
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
case 'google_sheets': {
|
|
750
|
+
// Use Playwright for Google Sheets
|
|
751
|
+
// Format: google_sheets:command|arg1|arg2...
|
|
752
|
+
const [sheetCmd, ...sheetArgs] = params.split('|');
|
|
753
|
+
|
|
754
|
+
switch (sheetCmd.toLowerCase()) {
|
|
755
|
+
case 'new': {
|
|
756
|
+
const sheetName = sheetArgs[0] || 'Untitled spreadsheet';
|
|
757
|
+
await browser.navigateTo('https://docs.google.com/spreadsheets/create');
|
|
758
|
+
await sleep(5000);
|
|
759
|
+
step.result = `📊 Created Google Sheet: ${sheetName}`;
|
|
760
|
+
break;
|
|
761
|
+
}
|
|
762
|
+
case 'type': {
|
|
763
|
+
const cell = sheetArgs[0] || 'A1';
|
|
764
|
+
const cellValue = sheetArgs.slice(1).join('|');
|
|
765
|
+
const success = await browser.googleSheetsType([{ cell, value: cellValue }]);
|
|
766
|
+
step.result = success
|
|
767
|
+
? `📊 Typed "${cellValue}" in cell ${cell}`
|
|
768
|
+
: `📊 Could not type in cell ${cell}`;
|
|
769
|
+
break;
|
|
770
|
+
}
|
|
771
|
+
case 'read': {
|
|
772
|
+
const screenshot = await browser.takeScreenshot();
|
|
773
|
+
const analysis = await chat([{
|
|
774
|
+
role: 'user',
|
|
775
|
+
content: 'Describe the contents of this Google Sheet. List visible data in the cells.'
|
|
776
|
+
}]);
|
|
777
|
+
step.result = `📊 Current sheet view:\n${analysis.content}`;
|
|
778
|
+
break;
|
|
779
|
+
}
|
|
780
|
+
default:
|
|
781
|
+
throw new Error(`Unknown google_sheets command: ${sheetCmd}`);
|
|
782
|
+
}
|
|
783
|
+
break;
|
|
784
|
+
}
|
|
785
|
+
|
|
786
|
+
case 'google_docs': {
|
|
787
|
+
// Use Playwright for Google Docs
|
|
788
|
+
// Format: google_docs:command|arg1|arg2...
|
|
789
|
+
const [docCmd, ...docArgs] = params.split('|');
|
|
790
|
+
|
|
791
|
+
switch (docCmd.toLowerCase()) {
|
|
792
|
+
case 'new': {
|
|
793
|
+
const docName = docArgs[0] || 'Untitled document';
|
|
794
|
+
const success = await browser.googleDocsType('');
|
|
795
|
+
step.result = success
|
|
796
|
+
? `📄 Created Google Doc: ${docName}`
|
|
797
|
+
: `📄 Could not create Google Doc`;
|
|
798
|
+
break;
|
|
799
|
+
}
|
|
800
|
+
case 'type': {
|
|
801
|
+
const docText = docArgs.join('|');
|
|
802
|
+
const success = await browser.googleDocsType(docText);
|
|
803
|
+
step.result = success
|
|
804
|
+
? `📄 Typed content in Google Doc`
|
|
805
|
+
: `📄 Could not type in Google Doc`;
|
|
806
|
+
break;
|
|
807
|
+
}
|
|
808
|
+
default:
|
|
809
|
+
throw new Error(`Unknown google_docs command: ${docCmd}`);
|
|
810
|
+
}
|
|
811
|
+
break;
|
|
812
|
+
}
|
|
591
813
|
|
|
592
|
-
|
|
593
|
-
|
|
814
|
+
case 'research': {
|
|
815
|
+
// Use Playwright for multi-step research
|
|
816
|
+
const researchQuery = params;
|
|
594
817
|
|
|
595
|
-
//
|
|
596
|
-
const
|
|
597
|
-
const maxScrolls = 5; // Maximum number of scroll captures
|
|
818
|
+
// Use browser.research which handles search, clicking, gathering
|
|
819
|
+
const researchData = await browser.research(researchQuery, 3);
|
|
598
820
|
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
821
|
+
// Format sources
|
|
822
|
+
const sourceSummaries = researchData.sources.map((s, i) =>
|
|
823
|
+
`Source ${i + 1}: ${s.title}\n${s.content.slice(0, 500)}...`
|
|
824
|
+
).join('\n\n');
|
|
602
825
|
|
|
603
|
-
|
|
604
|
-
|
|
826
|
+
// Ask AI to synthesize
|
|
827
|
+
const synthesis = await chat([{
|
|
828
|
+
role: 'user',
|
|
829
|
+
content: `Based on the following research gathered about "${researchQuery}", provide a comprehensive summary:
|
|
605
830
|
|
|
606
|
-
|
|
607
|
-
- The user's question
|
|
608
|
-
- Any UI elements, buttons, navigation, or headers
|
|
609
|
-
- Any disclaimers, suggestions, or "related questions"
|
|
610
|
-
- Any "Sources" or citation links
|
|
611
|
-
- Any text you already extracted (avoid duplicates)
|
|
831
|
+
${sourceSummaries}
|
|
612
832
|
|
|
613
|
-
|
|
833
|
+
Create a well-organized summary with:
|
|
834
|
+
1. Key findings
|
|
835
|
+
2. Important details
|
|
836
|
+
3. Any notable facts or statistics
|
|
837
|
+
4. Conclusion
|
|
614
838
|
|
|
615
|
-
|
|
839
|
+
Be thorough but concise.`
|
|
840
|
+
}]);
|
|
616
841
|
|
|
617
|
-
|
|
618
|
-
|
|
842
|
+
step.result = `🔬 Research Summary: ${researchQuery}\n\n${synthesis.content}`;
|
|
843
|
+
break;
|
|
844
|
+
}
|
|
619
845
|
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
846
|
+
case 'ask_llm': {
|
|
847
|
+
// Use Playwright to ask another LLM for help with a screenshot
|
|
848
|
+
// Format: ask_llm:llm_name|question
|
|
849
|
+
const [llmName, ...questionParts] = params.split('|');
|
|
850
|
+
const question = questionParts.join('|');
|
|
851
|
+
|
|
852
|
+
// Take screenshot first to describe current context
|
|
853
|
+
const currentScreen = await describeScreen();
|
|
854
|
+
|
|
855
|
+
// Compose the question with screen context
|
|
856
|
+
const fullQuestion = `I'm looking at my screen and I need help. ${question}\n\nHere's what I see on my screen: ${currentScreen.description}`;
|
|
857
|
+
|
|
858
|
+
// Supported LLMs
|
|
859
|
+
const supportedLLMs = ['perplexity', 'chatgpt', 'claude', 'copilot'];
|
|
860
|
+
const llmLower = llmName.toLowerCase();
|
|
861
|
+
|
|
862
|
+
if (!supportedLLMs.includes(llmLower)) {
|
|
863
|
+
throw new Error(`Unknown LLM: ${llmName}. Supported: ${supportedLLMs.join(', ')}`);
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
// Use Playwright's AI chat helper
|
|
867
|
+
const result = await browser.askAI(llmLower as any, fullQuestion, false);
|
|
868
|
+
|
|
869
|
+
// Get full response if needed
|
|
870
|
+
const fullParts = await browser.getFullAIResponse(llmLower as any, 3);
|
|
871
|
+
const finalResponse = fullParts.length > 0 ? fullParts.join('\n\n') : result.response;
|
|
872
|
+
|
|
873
|
+
step.result = `🤖 ${llmName} says:\n\n${finalResponse}`;
|
|
874
|
+
break;
|
|
875
|
+
}
|
|
876
|
+
|
|
877
|
+
case 'learn_ui': {
|
|
878
|
+
// Take screenshot and analyze the UI to learn how to interact
|
|
879
|
+
const uiScreen = await describeScreen();
|
|
880
|
+
|
|
881
|
+
const uiAnalysis = await chat([{
|
|
882
|
+
role: 'user',
|
|
883
|
+
content: `Analyze this screenshot and identify all interactive UI elements. List:
|
|
884
|
+
1. All clickable buttons and their likely functions
|
|
885
|
+
2. Text input fields
|
|
886
|
+
3. Menus and dropdowns
|
|
887
|
+
4. Links
|
|
888
|
+
5. Any keyboard shortcuts visible
|
|
889
|
+
6. The main actions available in this interface
|
|
890
|
+
|
|
891
|
+
Question: ${params}
|
|
892
|
+
|
|
893
|
+
Be specific about locations (top-left, center, etc.) and what each element does.`
|
|
894
|
+
}]);
|
|
895
|
+
|
|
896
|
+
step.result = `🔍 UI Analysis:\n\n${uiAnalysis.content}`;
|
|
897
|
+
break;
|
|
898
|
+
}
|
|
899
|
+
|
|
900
|
+
case 'adaptive_do': {
|
|
901
|
+
// Adaptive agent using Playwright: try to accomplish something, ask LLMs if stuck
|
|
902
|
+
const goal = params;
|
|
903
|
+
const maxAttempts = 5;
|
|
904
|
+
const actionHistory: string[] = [];
|
|
905
|
+
let accomplished = false;
|
|
906
|
+
|
|
907
|
+
// Initialize browser
|
|
908
|
+
const page = await browser.getPage();
|
|
909
|
+
|
|
910
|
+
for (let attempt = 0; attempt < maxAttempts && !accomplished; attempt++) {
|
|
911
|
+
// Take screenshot and analyze current state
|
|
912
|
+
const screenshot = await browser.takeScreenshot();
|
|
913
|
+
const currentState = await chat([{
|
|
914
|
+
role: 'user',
|
|
915
|
+
content: `Describe what you see on this screen. What app/website is it? What elements are visible?`
|
|
916
|
+
}]);
|
|
917
|
+
|
|
918
|
+
// Ask our AI what to do next
|
|
919
|
+
const nextAction = await chat([{
|
|
920
|
+
role: 'user',
|
|
921
|
+
content: `GOAL: ${goal}
|
|
922
|
+
|
|
923
|
+
CURRENT SCREEN: ${currentState.content}
|
|
924
|
+
|
|
925
|
+
PREVIOUS ACTIONS TAKEN:
|
|
926
|
+
${actionHistory.length > 0 ? actionHistory.join('\n') : 'None yet'}
|
|
927
|
+
|
|
928
|
+
Based on what you see, what's the SINGLE next action to take?
|
|
929
|
+
Options:
|
|
930
|
+
- click: Click element (describe CSS selector or visible text)
|
|
931
|
+
- type: Type something (specify selector and text)
|
|
932
|
+
- press: Press a key (specify key)
|
|
933
|
+
- scroll: Scroll up/down
|
|
934
|
+
- navigate: Go to URL
|
|
935
|
+
- done: Goal is accomplished
|
|
936
|
+
- stuck: Can't figure out what to do
|
|
937
|
+
|
|
938
|
+
Respond in format:
|
|
939
|
+
ACTION: <action_type>
|
|
940
|
+
SELECTOR: <css selector or text to find>
|
|
941
|
+
VALUE: <text to type or URL>
|
|
942
|
+
REASONING: <why>`
|
|
943
|
+
}]);
|
|
944
|
+
|
|
945
|
+
const actionContent = nextAction.content;
|
|
946
|
+
|
|
947
|
+
// Parse the action
|
|
948
|
+
const actionMatch = actionContent.match(/ACTION:\s*(\w+)/i);
|
|
949
|
+
const selectorMatch = actionContent.match(/SELECTOR:\s*(.+?)(?:\n|$)/i);
|
|
950
|
+
const valueMatch = actionContent.match(/VALUE:\s*(.+?)(?:\n|$)/i);
|
|
951
|
+
|
|
952
|
+
if (!actionMatch) {
|
|
953
|
+
actionHistory.push(`Attempt ${attempt + 1}: Couldn't parse action`);
|
|
954
|
+
continue;
|
|
623
955
|
}
|
|
624
956
|
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
}
|
|
957
|
+
const action = actionMatch[1].toLowerCase();
|
|
958
|
+
const selector = selectorMatch?.[1]?.trim() || '';
|
|
959
|
+
const value = valueMatch?.[1]?.trim() || '';
|
|
960
|
+
|
|
961
|
+
if (action === 'done') {
|
|
962
|
+
accomplished = true;
|
|
963
|
+
actionHistory.push(`Attempt ${attempt + 1}: Goal accomplished!`);
|
|
632
964
|
break;
|
|
633
965
|
}
|
|
634
966
|
|
|
635
|
-
|
|
967
|
+
if (action === 'stuck') {
|
|
968
|
+
// Ask Perplexity for help using Playwright
|
|
969
|
+
actionHistory.push(`Attempt ${attempt + 1}: Got stuck, asking Perplexity for help...`);
|
|
970
|
+
|
|
971
|
+
const helpRequest = `I'm trying to: ${goal}\n\nI'm stuck. What should I do next? Be specific about what to click or type.`;
|
|
972
|
+
const advice = await browser.askAI('perplexity', helpRequest, false);
|
|
973
|
+
actionHistory.push(`Got advice: ${advice.response.slice(0, 200)}...`);
|
|
974
|
+
|
|
975
|
+
// Navigate back to continue
|
|
976
|
+
await browser.navigateTo(page.url());
|
|
977
|
+
continue;
|
|
978
|
+
}
|
|
979
|
+
|
|
980
|
+
// Execute the action using Playwright
|
|
981
|
+
try {
|
|
982
|
+
switch (action) {
|
|
983
|
+
case 'click':
|
|
984
|
+
// Try to click by selector or text
|
|
985
|
+
if (selector) {
|
|
986
|
+
const clicked = await browser.clickElement(selector);
|
|
987
|
+
if (!clicked) {
|
|
988
|
+
// Try by text
|
|
989
|
+
await page.getByText(selector).first().click({ timeout: 5000 });
|
|
990
|
+
}
|
|
991
|
+
}
|
|
992
|
+
actionHistory.push(`Attempt ${attempt + 1}: Clicked "${selector}"`);
|
|
993
|
+
break;
|
|
994
|
+
case 'type':
|
|
995
|
+
if (selector && value) {
|
|
996
|
+
const typed = await browser.typeInElement(selector, value);
|
|
997
|
+
if (!typed) {
|
|
998
|
+
await page.getByPlaceholder(selector).first().fill(value);
|
|
999
|
+
}
|
|
1000
|
+
}
|
|
1001
|
+
actionHistory.push(`Attempt ${attempt + 1}: Typed "${value}" in "${selector}"`);
|
|
1002
|
+
break;
|
|
1003
|
+
case 'press':
|
|
1004
|
+
await browser.pressKey(value || selector);
|
|
1005
|
+
actionHistory.push(`Attempt ${attempt + 1}: Pressed ${value || selector}`);
|
|
1006
|
+
break;
|
|
1007
|
+
case 'scroll':
|
|
1008
|
+
await browser.scroll(value.toLowerCase().includes('up') ? 'up' : 'down');
|
|
1009
|
+
actionHistory.push(`Attempt ${attempt + 1}: Scrolled ${value || 'down'}`);
|
|
1010
|
+
break;
|
|
1011
|
+
case 'navigate':
|
|
1012
|
+
const url = value.startsWith('http') ? value : `https://${value}`;
|
|
1013
|
+
await browser.navigateTo(url);
|
|
1014
|
+
actionHistory.push(`Attempt ${attempt + 1}: Navigated to ${url}`);
|
|
1015
|
+
break;
|
|
1016
|
+
default:
|
|
1017
|
+
actionHistory.push(`Attempt ${attempt + 1}: Unknown action ${action}`);
|
|
1018
|
+
}
|
|
1019
|
+
} catch (e) {
|
|
1020
|
+
actionHistory.push(`Attempt ${attempt + 1}: Action failed - ${e}`);
|
|
1021
|
+
}
|
|
636
1022
|
|
|
637
|
-
//
|
|
638
|
-
await computer.scrollMouse(-5); // Scroll down
|
|
639
|
-
await sleep(1000); // Wait for scroll animation
|
|
1023
|
+
await sleep(2000); // Wait for UI to update
|
|
640
1024
|
}
|
|
641
1025
|
|
|
642
|
-
|
|
643
|
-
const fullResponse = extractedParts.join('\n\n');
|
|
644
|
-
step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${fullResponse}`;
|
|
1026
|
+
step.result = `🎯 Adaptive Agent Result:\n\nGoal: ${goal}\nAccomplished: ${accomplished ? 'Yes ✅' : 'Partial/No ❌'}\n\nAction Log:\n${actionHistory.join('\n')}`;
|
|
645
1027
|
break;
|
|
646
1028
|
}
|
|
647
1029
|
|
|
648
|
-
case 'screenshot':
|
|
649
|
-
const vision = await describeScreen();
|
|
650
|
-
step.result = vision.description;
|
|
651
|
-
break;
|
|
652
|
-
|
|
653
1030
|
case 'chat':
|
|
654
1031
|
// This is a fallback - just describe what user wants
|
|
655
1032
|
step.result = `Task noted: ${params}`;
|