@projectservan8n/cnapse 0.6.2 → 0.7.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2380,7 +2380,21 @@ Before outputting steps, THINK through these questions:
2380
2380
  ### Web Browsing
2381
2381
  - open_url: Open URL in default browser (e.g., "open_url:https://perplexity.ai")
2382
2382
  - browse_and_ask: Open AI website, type question, wait for response (e.g., "browse_and_ask:perplexity|What is the capital of France?")
2383
- - browse_and_ask: Supports: perplexity, chatgpt, claude, google
2383
+ - browse_and_ask: Supports: perplexity, chatgpt, claude, google, copilot, bard
2384
+ - web_search: Search Google and extract results (e.g., "web_search:best restaurants in NYC")
2385
+
2386
+ ### Email
2387
+ - send_email: Send email via Gmail or Outlook web (e.g., "send_email:gmail|to@email.com|Subject|Body text here")
2388
+ - send_email: Supports: gmail, outlook
2389
+
2390
+ ### Google Apps (via browser)
2391
+ - google_sheets: Interact with Google Sheets (e.g., "google_sheets:new|My Spreadsheet" or "google_sheets:type|A1|Hello World")
2392
+ - google_sheets: Commands: new (create), open (open existing), type (type in cell), read (screenshot current view)
2393
+ - google_docs: Interact with Google Docs (e.g., "google_docs:new|My Document" or "google_docs:type|Hello World")
2394
+ - google_docs: Commands: new (create), open (open existing), type (type text)
2395
+
2396
+ ### Research
2397
+ - research: Multi-step web research - searches, gathers info, summarizes (e.g., "research:What are the latest AI trends in 2024?")
2384
2398
 
2385
2399
  ### Utility
2386
2400
  - wait: Wait N seconds (e.g., "wait:2" - use 1-3s for app loads)
@@ -2471,14 +2485,71 @@ Output:
2471
2485
  ### Example 7: "search google for weather today"
2472
2486
  Thinking:
2473
2487
  - Goal: Open Google and search for something
2474
- - How: Use browse_and_ask with google target
2475
- - Sequence: Open Google, search, capture results
2488
+ - How: Use web_search for quick results extraction
2489
+ - Sequence: Search and get results
2490
+
2491
+ Output:
2492
+ [
2493
+ { "description": "Search Google for weather", "action": "web_search:weather today" }
2494
+ ]
2495
+
2496
+ ### Example 8: "send an email to john@example.com about the meeting tomorrow"
2497
+ Thinking:
2498
+ - Goal: Compose and send an email via Gmail
2499
+ - How: Use send_email with gmail, recipient, subject, body
2500
+ - Sequence: Open Gmail, compose, fill fields, send
2501
+
2502
+ Output:
2503
+ [
2504
+ { "description": "Send email via Gmail", "action": "send_email:gmail|john@example.com|Meeting Tomorrow|Hi John, this is a reminder about our meeting tomorrow. Please let me know if you have any questions." }
2505
+ ]
2506
+
2507
+ ### Example 9: "create a new google sheet called Sales Report and add headers"
2508
+ Thinking:
2509
+ - Goal: Create a new Google Sheet and add content
2510
+ - How: Use google_sheets to create new, then type in cells
2511
+ - Sequence: Create sheet -> Navigate to cells -> Type headers
2476
2512
 
2477
2513
  Output:
2478
2514
  [
2479
- { "description": "Search Google", "action": "browse_and_ask:google|weather today" },
2480
- { "description": "Wait for results", "action": "wait:2" },
2481
- { "description": "Capture search results", "action": "screenshot" }
2515
+ { "description": "Create new Google Sheet", "action": "google_sheets:new|Sales Report" },
2516
+ { "description": "Wait for sheet to load", "action": "wait:3" },
2517
+ { "description": "Type header in A1", "action": "google_sheets:type|A1|Product" },
2518
+ { "description": "Type header in B1", "action": "google_sheets:type|B1|Quantity" },
2519
+ { "description": "Type header in C1", "action": "google_sheets:type|C1|Price" }
2520
+ ]
2521
+
2522
+ ### Example 10: "research the latest news about AI regulations"
2523
+ Thinking:
2524
+ - Goal: Do multi-step research on a topic
2525
+ - How: Use research action which handles searching, gathering, summarizing
2526
+ - Sequence: Single research action does it all
2527
+
2528
+ Output:
2529
+ [
2530
+ { "description": "Research AI regulations news", "action": "research:latest news about AI regulations 2024" }
2531
+ ]
2532
+
2533
+ ### Example 11: "write a document in google docs about project status"
2534
+ Thinking:
2535
+ - Goal: Create a Google Doc and write content
2536
+ - How: Use google_docs to create and type
2537
+ - Sequence: Create doc -> Type content
2538
+
2539
+ Output:
2540
+ [
2541
+ { "description": "Create new Google Doc", "action": "google_docs:new|Project Status Report" },
2542
+ { "description": "Wait for doc to load", "action": "wait:3" },
2543
+ { "description": "Type the content", "action": "google_docs:type|Project Status Report
2544
+
2545
+ Date: Today
2546
+
2547
+ Summary:
2548
+ The project is on track. All milestones have been met.
2549
+
2550
+ Next Steps:
2551
+ - Complete testing
2552
+ - Deploy to production" }
2482
2553
  ]
2483
2554
 
2484
2555
  ## YOUR TASK
@@ -2671,13 +2742,15 @@ ${existingResult.output}`;
2671
2742
  const [site, ...questionParts] = params.split("|");
2672
2743
  const question = questionParts.join("|");
2673
2744
  const sites = {
2674
- perplexity: { url: "https://www.perplexity.ai", waitTime: 3 },
2675
- chatgpt: { url: "https://chat.openai.com", waitTime: 4 },
2676
- claude: { url: "https://claude.ai", waitTime: 4 },
2677
- google: { url: "https://www.google.com", waitTime: 2 },
2678
- bing: { url: "https://www.bing.com", waitTime: 2 }
2745
+ perplexity: { url: "https://www.perplexity.ai", loadTime: 3, responseTime: 10 },
2746
+ chatgpt: { url: "https://chat.openai.com", loadTime: 4, responseTime: 15 },
2747
+ claude: { url: "https://claude.ai", loadTime: 4, responseTime: 15 },
2748
+ google: { url: "https://www.google.com", loadTime: 2, responseTime: 3 },
2749
+ bing: { url: "https://www.bing.com", loadTime: 2, responseTime: 3 },
2750
+ bard: { url: "https://bard.google.com", loadTime: 3, responseTime: 12 },
2751
+ copilot: { url: "https://copilot.microsoft.com", loadTime: 3, responseTime: 12 }
2679
2752
  };
2680
- const siteConfig = sites[site.toLowerCase()] || { url: `https://${site}`, waitTime: 3 };
2753
+ const siteConfig = sites[site.toLowerCase()] || { url: `https://${site}`, loadTime: 3, responseTime: 10 };
2681
2754
  if (process.platform === "win32") {
2682
2755
  await runCommand(`start "" "${siteConfig.url}"`, 5e3);
2683
2756
  } else if (process.platform === "darwin") {
@@ -2685,17 +2758,310 @@ ${existingResult.output}`;
2685
2758
  } else {
2686
2759
  await runCommand(`xdg-open "${siteConfig.url}"`, 5e3);
2687
2760
  }
2688
- await sleep(siteConfig.waitTime * 1e3);
2761
+ await sleep(siteConfig.loadTime * 1e3);
2689
2762
  await typeText(question);
2690
2763
  await sleep(300);
2691
2764
  await pressKey("Return");
2692
- step.result = `Asked ${site}: "${question}"`;
2765
+ await sleep(siteConfig.responseTime * 1e3);
2766
+ const extractedParts = [];
2767
+ const maxScrolls = 5;
2768
+ for (let scrollIndex = 0; scrollIndex < maxScrolls; scrollIndex++) {
2769
+ const screenResult = await describeScreen();
2770
+ const extractPrompt = `You are looking at screenshot ${scrollIndex + 1} of ${site}. The user asked: "${question}"
2771
+
2772
+ Extract ONLY the AI's response/answer text visible on screen. Do NOT include:
2773
+ - The user's question
2774
+ - Any UI elements, buttons, navigation, or headers
2775
+ - Any disclaimers, suggestions, or "related questions"
2776
+ - Any "Sources" or citation links
2777
+ - Any text you already extracted (avoid duplicates)
2778
+
2779
+ ${scrollIndex > 0 ? `Previous parts already extracted:
2780
+ ${extractedParts.join("\n---\n")}
2781
+
2782
+ Only extract NEW text that continues from where we left off.` : ""}
2783
+
2784
+ Just give me the actual answer text, word for word as it appears. If there's no more response text visible, respond with exactly: "END_OF_RESPONSE"`;
2785
+ const extractResponse = await chat([{ role: "user", content: extractPrompt }]);
2786
+ const extracted = extractResponse.content.trim();
2787
+ if (extracted === "END_OF_RESPONSE" || extracted.includes("END_OF_RESPONSE")) {
2788
+ break;
2789
+ }
2790
+ if (extracted.toLowerCase().includes("response not ready") || extracted.toLowerCase().includes("no response visible") || extracted.toLowerCase().includes("no additional text")) {
2791
+ if (scrollIndex === 0) {
2792
+ extractedParts.push("Response not ready yet or page still loading.");
2793
+ }
2794
+ break;
2795
+ }
2796
+ extractedParts.push(extracted);
2797
+ await scrollMouse(-5);
2798
+ await sleep(1e3);
2799
+ }
2800
+ const fullResponse = extractedParts.join("\n\n");
2801
+ step.result = `\u{1F4DD} ${site.charAt(0).toUpperCase() + site.slice(1)} says:
2802
+
2803
+ ${fullResponse}`;
2693
2804
  break;
2694
2805
  }
2695
2806
  case "screenshot":
2696
2807
  const vision = await describeScreen();
2697
2808
  step.result = vision.description;
2698
2809
  break;
2810
+ case "web_search": {
2811
+ await keyCombo(["meta", "r"]);
2812
+ await sleep(500);
2813
+ await typeText("chrome");
2814
+ await pressKey("Return");
2815
+ await sleep(2e3);
2816
+ await keyCombo(["control", "l"]);
2817
+ await sleep(300);
2818
+ await typeText("google.com");
2819
+ await pressKey("Return");
2820
+ await sleep(2e3);
2821
+ await typeText(params);
2822
+ await sleep(300);
2823
+ await pressKey("Return");
2824
+ await sleep(3e3);
2825
+ const searchScreen = await describeScreen();
2826
+ const searchExtract = await chat([{
2827
+ role: "user",
2828
+ content: `Extract the top search results from this Google search page. For each result, include:
2829
+ - Title
2830
+ - Brief snippet/description
2831
+ - URL if visible
2832
+
2833
+ Format as a numbered list. Be concise.`
2834
+ }]);
2835
+ step.result = `\u{1F50D} Search results for "${params}":
2836
+
2837
+ ${searchExtract.content}`;
2838
+ break;
2839
+ }
2840
+ case "send_email": {
2841
+ const [provider, to, subject, ...bodyParts] = params.split("|");
2842
+ const body = bodyParts.join("|");
2843
+ await keyCombo(["meta", "r"]);
2844
+ await sleep(500);
2845
+ await typeText("chrome");
2846
+ await pressKey("Return");
2847
+ await sleep(2e3);
2848
+ await keyCombo(["control", "l"]);
2849
+ await sleep(300);
2850
+ if (provider.toLowerCase() === "gmail") {
2851
+ await typeText("mail.google.com");
2852
+ await pressKey("Return");
2853
+ await sleep(4e3);
2854
+ await typeText("c");
2855
+ await sleep(2e3);
2856
+ await typeText(to);
2857
+ await sleep(300);
2858
+ await pressKey("Tab");
2859
+ await sleep(200);
2860
+ await typeText(subject);
2861
+ await sleep(300);
2862
+ await pressKey("Tab");
2863
+ await sleep(200);
2864
+ await typeText(body);
2865
+ await sleep(500);
2866
+ await keyCombo(["control", "Return"]);
2867
+ } else if (provider.toLowerCase() === "outlook") {
2868
+ await typeText("outlook.live.com");
2869
+ await pressKey("Return");
2870
+ await sleep(4e3);
2871
+ await typeText("n");
2872
+ await sleep(2e3);
2873
+ await typeText(to);
2874
+ await sleep(300);
2875
+ await pressKey("Tab");
2876
+ await sleep(200);
2877
+ await typeText(subject);
2878
+ await sleep(300);
2879
+ await pressKey("Tab");
2880
+ await sleep(200);
2881
+ await typeText(body);
2882
+ await sleep(500);
2883
+ await keyCombo(["control", "Return"]);
2884
+ } else {
2885
+ throw new Error(`Unsupported email provider: ${provider}. Use gmail or outlook.`);
2886
+ }
2887
+ await sleep(2e3);
2888
+ step.result = `\u{1F4E7} Email sent via ${provider} to ${to}`;
2889
+ break;
2890
+ }
2891
+ case "google_sheets": {
2892
+ const [sheetCmd, ...sheetArgs] = params.split("|");
2893
+ switch (sheetCmd.toLowerCase()) {
2894
+ case "new": {
2895
+ const sheetName = sheetArgs[0] || "Untitled spreadsheet";
2896
+ await keyCombo(["meta", "r"]);
2897
+ await sleep(500);
2898
+ await typeText("chrome");
2899
+ await pressKey("Return");
2900
+ await sleep(2e3);
2901
+ await keyCombo(["control", "l"]);
2902
+ await sleep(300);
2903
+ await typeText("sheets.google.com");
2904
+ await pressKey("Return");
2905
+ await sleep(3e3);
2906
+ await pressKey("Tab");
2907
+ await pressKey("Tab");
2908
+ await pressKey("Return");
2909
+ await sleep(3e3);
2910
+ await keyCombo(["alt", "f"]);
2911
+ await sleep(500);
2912
+ await typeText("r");
2913
+ await sleep(500);
2914
+ await keyCombo(["control", "a"]);
2915
+ await typeText(sheetName);
2916
+ await pressKey("Return");
2917
+ await sleep(500);
2918
+ await pressKey("Escape");
2919
+ step.result = `\u{1F4CA} Created Google Sheet: ${sheetName}`;
2920
+ break;
2921
+ }
2922
+ case "type": {
2923
+ const cell = sheetArgs[0] || "A1";
2924
+ const cellValue = sheetArgs.slice(1).join("|");
2925
+ await keyCombo(["control", "g"]);
2926
+ await sleep(500);
2927
+ await typeText(cell);
2928
+ await pressKey("Return");
2929
+ await sleep(300);
2930
+ await typeText(cellValue);
2931
+ await pressKey("Return");
2932
+ await sleep(200);
2933
+ step.result = `\u{1F4CA} Typed "${cellValue}" in cell ${cell}`;
2934
+ break;
2935
+ }
2936
+ case "read": {
2937
+ const readScreen = await describeScreen();
2938
+ step.result = `\u{1F4CA} Current sheet view:
2939
+ ${readScreen.description}`;
2940
+ break;
2941
+ }
2942
+ default:
2943
+ throw new Error(`Unknown google_sheets command: ${sheetCmd}`);
2944
+ }
2945
+ break;
2946
+ }
2947
+ case "google_docs": {
2948
+ const [docCmd, ...docArgs] = params.split("|");
2949
+ switch (docCmd.toLowerCase()) {
2950
+ case "new": {
2951
+ const docName = docArgs[0] || "Untitled document";
2952
+ await keyCombo(["meta", "r"]);
2953
+ await sleep(500);
2954
+ await typeText("chrome");
2955
+ await pressKey("Return");
2956
+ await sleep(2e3);
2957
+ await keyCombo(["control", "l"]);
2958
+ await sleep(300);
2959
+ await typeText("docs.google.com");
2960
+ await pressKey("Return");
2961
+ await sleep(3e3);
2962
+ await pressKey("Tab");
2963
+ await pressKey("Tab");
2964
+ await pressKey("Return");
2965
+ await sleep(3e3);
2966
+ await keyCombo(["alt", "f"]);
2967
+ await sleep(500);
2968
+ await typeText("r");
2969
+ await sleep(500);
2970
+ await keyCombo(["control", "a"]);
2971
+ await typeText(docName);
2972
+ await pressKey("Return");
2973
+ await sleep(500);
2974
+ await pressKey("Escape");
2975
+ step.result = `\u{1F4C4} Created Google Doc: ${docName}`;
2976
+ break;
2977
+ }
2978
+ case "type": {
2979
+ const docText = docArgs.join("|");
2980
+ await typeText(docText);
2981
+ step.result = `\u{1F4C4} Typed content in Google Doc`;
2982
+ break;
2983
+ }
2984
+ default:
2985
+ throw new Error(`Unknown google_docs command: ${docCmd}`);
2986
+ }
2987
+ break;
2988
+ }
2989
+ case "research": {
2990
+ const researchQuery = params;
2991
+ const researchResults = [];
2992
+ await keyCombo(["meta", "r"]);
2993
+ await sleep(500);
2994
+ await typeText("chrome");
2995
+ await pressKey("Return");
2996
+ await sleep(2e3);
2997
+ await keyCombo(["control", "l"]);
2998
+ await sleep(300);
2999
+ await typeText("google.com");
3000
+ await pressKey("Return");
3001
+ await sleep(2e3);
3002
+ await typeText(researchQuery);
3003
+ await pressKey("Return");
3004
+ await sleep(3e3);
3005
+ let searchScreen = await describeScreen();
3006
+ const initialResults = await chat([{
3007
+ role: "user",
3008
+ content: `Extract the key information from these Google search results about: "${researchQuery}"
3009
+ Include any relevant facts, numbers, dates, or key points visible. Be thorough but concise.`
3010
+ }]);
3011
+ researchResults.push(`Search Results:
3012
+ ${initialResults.content}`);
3013
+ await pressKey("Tab");
3014
+ await sleep(200);
3015
+ await pressKey("Tab");
3016
+ await sleep(200);
3017
+ await pressKey("Return");
3018
+ await sleep(4e3);
3019
+ searchScreen = await describeScreen();
3020
+ const pageContent = await chat([{
3021
+ role: "user",
3022
+ content: `Extract the main content and key information from this webpage about: "${researchQuery}"
3023
+ Ignore ads, navigation, footers. Focus on the actual article/content.`
3024
+ }]);
3025
+ researchResults.push(`
3026
+ Source 1 Content:
3027
+ ${pageContent.content}`);
3028
+ await keyCombo(["alt", "Left"]);
3029
+ await sleep(2e3);
3030
+ await scrollMouse(-3);
3031
+ await sleep(500);
3032
+ await pressKey("Tab");
3033
+ await pressKey("Tab");
3034
+ await pressKey("Tab");
3035
+ await pressKey("Return");
3036
+ await sleep(4e3);
3037
+ searchScreen = await describeScreen();
3038
+ const pageContent2 = await chat([{
3039
+ role: "user",
3040
+ content: `Extract additional information from this webpage about: "${researchQuery}"
3041
+ Look for details not covered in the previous source.`
3042
+ }]);
3043
+ researchResults.push(`
3044
+ Source 2 Content:
3045
+ ${pageContent2.content}`);
3046
+ const synthesis = await chat([{
3047
+ role: "user",
3048
+ content: `Based on the following research gathered about "${researchQuery}", provide a comprehensive summary:
3049
+
3050
+ ${researchResults.join("\n\n")}
3051
+
3052
+ Create a well-organized summary with:
3053
+ 1. Key findings
3054
+ 2. Important details
3055
+ 3. Any notable facts or statistics
3056
+ 4. Conclusion
3057
+
3058
+ Be thorough but concise.`
3059
+ }]);
3060
+ step.result = `\u{1F52C} Research Summary: ${researchQuery}
3061
+
3062
+ ${synthesis.content}`;
3063
+ break;
3064
+ }
2699
3065
  case "chat":
2700
3066
  step.result = `Task noted: ${params}`;
2701
3067
  break;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@projectservan8n/cnapse",
3
- "version": "0.6.2",
3
+ "version": "0.7.0",
4
4
  "description": "Autonomous PC intelligence - AI assistant for desktop automation",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
package/src/lib/tasks.ts CHANGED
@@ -211,7 +211,21 @@ Before outputting steps, THINK through these questions:
211
211
  ### Web Browsing
212
212
  - open_url: Open URL in default browser (e.g., "open_url:https://perplexity.ai")
213
213
  - browse_and_ask: Open AI website, type question, wait for response (e.g., "browse_and_ask:perplexity|What is the capital of France?")
214
- - browse_and_ask: Supports: perplexity, chatgpt, claude, google
214
+ - browse_and_ask: Supports: perplexity, chatgpt, claude, google, copilot, bard
215
+ - web_search: Search Google and extract results (e.g., "web_search:best restaurants in NYC")
216
+
217
+ ### Email
218
+ - send_email: Send email via Gmail or Outlook web (e.g., "send_email:gmail|to@email.com|Subject|Body text here")
219
+ - send_email: Supports: gmail, outlook
220
+
221
+ ### Google Apps (via browser)
222
+ - google_sheets: Interact with Google Sheets (e.g., "google_sheets:new|My Spreadsheet" or "google_sheets:type|A1|Hello World")
223
+ - google_sheets: Commands: new (create), open (open existing), type (type in cell), read (screenshot current view)
224
+ - google_docs: Interact with Google Docs (e.g., "google_docs:new|My Document" or "google_docs:type|Hello World")
225
+ - google_docs: Commands: new (create), open (open existing), type (type text)
226
+
227
+ ### Research
228
+ - research: Multi-step web research - searches, gathers info, summarizes (e.g., "research:What are the latest AI trends in 2024?")
215
229
 
216
230
  ### Utility
217
231
  - wait: Wait N seconds (e.g., "wait:2" - use 1-3s for app loads)
@@ -302,14 +316,62 @@ Output:
302
316
  ### Example 7: "search google for weather today"
303
317
  Thinking:
304
318
  - Goal: Open Google and search for something
305
- - How: Use browse_and_ask with google target
306
- - Sequence: Open Google, search, capture results
319
+ - How: Use web_search for quick results extraction
320
+ - Sequence: Search and get results
321
+
322
+ Output:
323
+ [
324
+ { "description": "Search Google for weather", "action": "web_search:weather today" }
325
+ ]
326
+
327
+ ### Example 8: "send an email to john@example.com about the meeting tomorrow"
328
+ Thinking:
329
+ - Goal: Compose and send an email via Gmail
330
+ - How: Use send_email with gmail, recipient, subject, body
331
+ - Sequence: Open Gmail, compose, fill fields, send
332
+
333
+ Output:
334
+ [
335
+ { "description": "Send email via Gmail", "action": "send_email:gmail|john@example.com|Meeting Tomorrow|Hi John, this is a reminder about our meeting tomorrow. Please let me know if you have any questions." }
336
+ ]
337
+
338
+ ### Example 9: "create a new google sheet called Sales Report and add headers"
339
+ Thinking:
340
+ - Goal: Create a new Google Sheet and add content
341
+ - How: Use google_sheets to create new, then type in cells
342
+ - Sequence: Create sheet -> Navigate to cells -> Type headers
343
+
344
+ Output:
345
+ [
346
+ { "description": "Create new Google Sheet", "action": "google_sheets:new|Sales Report" },
347
+ { "description": "Wait for sheet to load", "action": "wait:3" },
348
+ { "description": "Type header in A1", "action": "google_sheets:type|A1|Product" },
349
+ { "description": "Type header in B1", "action": "google_sheets:type|B1|Quantity" },
350
+ { "description": "Type header in C1", "action": "google_sheets:type|C1|Price" }
351
+ ]
352
+
353
+ ### Example 10: "research the latest news about AI regulations"
354
+ Thinking:
355
+ - Goal: Do multi-step research on a topic
356
+ - How: Use research action which handles searching, gathering, summarizing
357
+ - Sequence: Single research action does it all
358
+
359
+ Output:
360
+ [
361
+ { "description": "Research AI regulations news", "action": "research:latest news about AI regulations 2024" }
362
+ ]
363
+
364
+ ### Example 11: "write a document in google docs about project status"
365
+ Thinking:
366
+ - Goal: Create a Google Doc and write content
367
+ - How: Use google_docs to create and type
368
+ - Sequence: Create doc -> Type content
307
369
 
308
370
  Output:
309
371
  [
310
- { "description": "Search Google", "action": "browse_and_ask:google|weather today" },
311
- { "description": "Wait for results", "action": "wait:2" },
312
- { "description": "Capture search results", "action": "screenshot" }
372
+ { "description": "Create new Google Doc", "action": "google_docs:new|Project Status Report" },
373
+ { "description": "Wait for doc to load", "action": "wait:3" },
374
+ { "description": "Type the content", "action": "google_docs:type|Project Status Report\n\nDate: Today\n\nSummary:\nThe project is on track. All milestones have been met.\n\nNext Steps:\n- Complete testing\n- Deploy to production" }
313
375
  ]
314
376
 
315
377
  ## YOUR TASK
@@ -557,16 +619,18 @@ ${existingResult.output}`;
557
619
  const [site, ...questionParts] = params.split('|');
558
620
  const question = questionParts.join('|');
559
621
 
560
- // Site-specific URLs and input selectors
561
- const sites: Record<string, { url: string; waitTime: number; searchSelector?: string }> = {
562
- perplexity: { url: 'https://www.perplexity.ai', waitTime: 3 },
563
- chatgpt: { url: 'https://chat.openai.com', waitTime: 4 },
564
- claude: { url: 'https://claude.ai', waitTime: 4 },
565
- google: { url: 'https://www.google.com', waitTime: 2 },
566
- bing: { url: 'https://www.bing.com', waitTime: 2 },
622
+ // Site-specific URLs and response wait times
623
+ const sites: Record<string, { url: string; loadTime: number; responseTime: number }> = {
624
+ perplexity: { url: 'https://www.perplexity.ai', loadTime: 3, responseTime: 10 },
625
+ chatgpt: { url: 'https://chat.openai.com', loadTime: 4, responseTime: 15 },
626
+ claude: { url: 'https://claude.ai', loadTime: 4, responseTime: 15 },
627
+ google: { url: 'https://www.google.com', loadTime: 2, responseTime: 3 },
628
+ bing: { url: 'https://www.bing.com', loadTime: 2, responseTime: 3 },
629
+ bard: { url: 'https://bard.google.com', loadTime: 3, responseTime: 12 },
630
+ copilot: { url: 'https://copilot.microsoft.com', loadTime: 3, responseTime: 12 },
567
631
  };
568
632
 
569
- const siteConfig = sites[site.toLowerCase()] || { url: `https://${site}`, waitTime: 3 };
633
+ const siteConfig = sites[site.toLowerCase()] || { url: `https://${site}`, loadTime: 3, responseTime: 10 };
570
634
 
571
635
  // Open the site
572
636
  if (process.platform === 'win32') {
@@ -578,7 +642,7 @@ ${existingResult.output}`;
578
642
  }
579
643
 
580
644
  // Wait for page to load
581
- await sleep(siteConfig.waitTime * 1000);
645
+ await sleep(siteConfig.loadTime * 1000);
582
646
 
583
647
  // Type the question (most sites have autofocus on search/input)
584
648
  await computer.typeText(question);
@@ -587,7 +651,59 @@ ${existingResult.output}`;
587
651
  // Press Enter to submit
588
652
  await computer.pressKey('Return');
589
653
 
590
- step.result = `Asked ${site}: "${question}"`;
654
+ // Wait for AI to generate response
655
+ await sleep(siteConfig.responseTime * 1000);
656
+
657
+ // Capture multiple screenshots by scrolling to get full response
658
+ const extractedParts: string[] = [];
659
+ const maxScrolls = 5; // Maximum number of scroll captures
660
+
661
+ for (let scrollIndex = 0; scrollIndex < maxScrolls; scrollIndex++) {
662
+ // Capture current view
663
+ const screenResult = await describeScreen();
664
+
665
+ // Ask AI to extract just the response text from what it sees
666
+ const extractPrompt = `You are looking at screenshot ${scrollIndex + 1} of ${site}. The user asked: "${question}"
667
+
668
+ Extract ONLY the AI's response/answer text visible on screen. Do NOT include:
669
+ - The user's question
670
+ - Any UI elements, buttons, navigation, or headers
671
+ - Any disclaimers, suggestions, or "related questions"
672
+ - Any "Sources" or citation links
673
+ - Any text you already extracted (avoid duplicates)
674
+
675
+ ${scrollIndex > 0 ? `Previous parts already extracted:\n${extractedParts.join('\n---\n')}\n\nOnly extract NEW text that continues from where we left off.` : ''}
676
+
677
+ Just give me the actual answer text, word for word as it appears. If there's no more response text visible, respond with exactly: "END_OF_RESPONSE"`;
678
+
679
+ const extractResponse = await chat([{ role: 'user', content: extractPrompt }]);
680
+ const extracted = extractResponse.content.trim();
681
+
682
+ // Check if we've reached the end
683
+ if (extracted === 'END_OF_RESPONSE' || extracted.includes('END_OF_RESPONSE')) {
684
+ break;
685
+ }
686
+
687
+ // Check for "no response" indicators
688
+ if (extracted.toLowerCase().includes('response not ready') ||
689
+ extracted.toLowerCase().includes('no response visible') ||
690
+ extracted.toLowerCase().includes('no additional text')) {
691
+ if (scrollIndex === 0) {
692
+ extractedParts.push('Response not ready yet or page still loading.');
693
+ }
694
+ break;
695
+ }
696
+
697
+ extractedParts.push(extracted);
698
+
699
+ // Scroll down to see more content
700
+ await computer.scrollMouse(-5); // Scroll down
701
+ await sleep(1000); // Wait for scroll animation
702
+ }
703
+
704
+ // Combine all extracted parts
705
+ const fullResponse = extractedParts.join('\n\n');
706
+ step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${fullResponse}`;
591
707
  break;
592
708
  }
593
709
 
@@ -596,6 +712,337 @@ ${existingResult.output}`;
596
712
  step.result = vision.description;
597
713
  break;
598
714
 
715
case 'web_search': {
  // Runs a Google search by driving the browser with keystrokes, then asks
  // the model to list the results that were captured from the screen.
  // Format: web_search:<query>
  // Throws if the query is empty, before any keystrokes are sent.
  if (!params || !params.trim()) {
    throw new Error('web_search requires a search query, e.g. "web_search:best restaurants in NYC"');
  }

  // Launch Chrome through the Windows Run dialog (Win+R).
  await computer.keyCombo(['meta', 'r']);
  await sleep(500);
  await computer.typeText('chrome');
  await computer.pressKey('Return');
  await sleep(2000);

  // Ctrl+L focuses the address bar; navigate to Google.
  await computer.keyCombo(['control', 'l']);
  await sleep(300);
  await computer.typeText('google.com');
  await computer.pressKey('Return');
  await sleep(2000);

  // The search box is expected to have focus once the page has loaded.
  await computer.typeText(params);
  await sleep(300);
  await computer.pressKey('Return');
  await sleep(3000); // Wait for results

  // Capture the results page and hand its description to the model.
  // Without it the prompt refers to a page the model never receives.
  const searchScreen = await describeScreen();
  const searchExtract = await chat([{
    role: 'user',
    content: [
      'Extract the top search results from this Google search page. For each result, include:',
      '- Title',
      '- Brief snippet/description',
      '- URL if visible',
      '',
      'Format as a numbered list. Be concise.',
      '',
      'Search page content:',
      searchScreen.description,
    ].join('\n'),
  }]);

  step.result = `🔍 Search results for "${params}":\n\n${searchExtract.content}`;
  break;
}
752
+
753
case 'send_email': {
  // Composes and sends an email through the Gmail or Outlook web UI using
  // keystrokes only.
  // Format: send_email:provider|to|subject|body
  // The body may itself contain '|'; everything after the third separator
  // is re-joined into the body text.
  // Throws for an unsupported provider or a missing recipient. Both checks
  // run before any keystrokes are sent, so a malformed step has no side effects.
  const [provider = '', to = '', subject = '', ...bodyParts] = params.split('|');
  const body = bodyParts.join('|');

  // Per-provider differences: the site to open and the single-key shortcut
  // that opens a new message.
  // NOTE(review): Gmail's single-key shortcuts presumably need to be enabled
  // in the account settings - confirm.
  const mailSites = new Map([
    ['gmail', { url: 'mail.google.com', composeKey: 'c' }],
    ['outlook', { url: 'outlook.live.com', composeKey: 'n' }],
  ]);
  const mailSite = mailSites.get(provider.trim().toLowerCase());
  if (!mailSite) {
    throw new Error(`Unsupported email provider: ${provider}. Use gmail or outlook.`);
  }
  if (!to.trim()) {
    throw new Error('send_email requires a recipient. Format: send_email:provider|to|subject|body');
  }

  // Launch Chrome through the Windows Run dialog (Win+R).
  await computer.keyCombo(['meta', 'r']);
  await sleep(500);
  await computer.typeText('chrome');
  await computer.pressKey('Return');
  await sleep(2000);

  // Ctrl+L focuses the address bar; open the mail site.
  await computer.keyCombo(['control', 'l']);
  await sleep(300);
  await computer.typeText(mailSite.url);
  await computer.pressKey('Return');
  await sleep(4000); // Wait for the mailbox to load

  // Open a new message with the provider's keyboard shortcut.
  await computer.typeText(mailSite.composeKey);
  await sleep(2000); // Wait for compose window

  // Fill To -> Subject -> Body, using Tab to move between fields.
  await computer.typeText(to);
  await sleep(300);
  await computer.pressKey('Tab');
  await sleep(200);
  await computer.typeText(subject);
  await sleep(300);
  await computer.pressKey('Tab');
  await sleep(200);
  await computer.typeText(body);
  await sleep(500);

  // Send with Ctrl+Enter
  await computer.keyCombo(['control', 'Return']);

  await sleep(2000);
  step.result = `📧 Email sent via ${provider} to ${to}`;
  break;
}
825
+
826
case 'google_sheets': {
  // Operates Google Sheets in the browser through keystrokes.
  // Step format: google_sheets:command|arg1|arg2...
  // Handled commands: new, type, read. Anything else throws.
  const [sheetCmd, ...sheetArgs] = params.split('|');
  const action = sheetCmd.toLowerCase();

  if (action === 'new') {
    const title = sheetArgs[0] || 'Untitled spreadsheet';

    // Start Chrome from the Run dialog (Win+R).
    await computer.keyCombo(['meta', 'r']);
    await sleep(500);
    await computer.typeText('chrome');
    await computer.pressKey('Return');
    await sleep(2000);

    // Address bar (Ctrl+L) -> Google Sheets home.
    await computer.keyCombo(['control', 'l']);
    await sleep(300);
    await computer.typeText('sheets.google.com');
    await computer.pressKey('Return');
    await sleep(3000);

    // Two Tabs then Enter: intended to land on and activate the "Blank" tile.
    for (const key of ['Tab', 'Tab', 'Return']) {
      await computer.pressKey(key);
    }
    await sleep(3000);

    // File menu (Alt+F) -> 'r' for Rename, then overwrite the title.
    await computer.keyCombo(['alt', 'f']);
    await sleep(500);
    await computer.typeText('r');
    await sleep(500);
    await computer.keyCombo(['control', 'a']);
    await computer.typeText(title);
    await computer.pressKey('Return');
    await sleep(500);
    await computer.pressKey('Escape'); // Dismiss whatever dialog is left open

    step.result = `📊 Created Google Sheet: ${title}`;
  } else if (action === 'type') {
    // First argument is the target cell; the rest (re-joined on '|') is the value.
    const [cellArg, ...valueParts] = sheetArgs;
    const targetCell = cellArg || 'A1';
    const text = valueParts.join('|');

    // Ctrl+G opens the go-to dialog; jump to the requested cell.
    await computer.keyCombo(['control', 'g']);
    await sleep(500);
    await computer.typeText(targetCell);
    await computer.pressKey('Return');
    await sleep(300);

    // Enter the value; Return commits it.
    await computer.typeText(text);
    await computer.pressKey('Return');
    await sleep(200);

    step.result = `📊 Typed "${text}" in cell ${targetCell}`;
  } else if (action === 'read') {
    // Report the description of whatever is currently on screen.
    const view = await describeScreen();
    step.result = `📊 Current sheet view:\n${view.description}`;
  } else {
    throw new Error(`Unknown google_sheets command: ${sheetCmd}`);
  }
  break;
}
898
+
899
case 'google_docs': {
  // Operates Google Docs in the browser through keystrokes.
  // Step format: google_docs:command|arg1|arg2...
  // Handled commands: new, type. Anything else throws.
  const [docCmd, ...docArgs] = params.split('|');
  const action = docCmd.toLowerCase();

  if (action === 'new') {
    const title = docArgs[0] || 'Untitled document';

    // Start Chrome from the Run dialog (Win+R).
    await computer.keyCombo(['meta', 'r']);
    await sleep(500);
    await computer.typeText('chrome');
    await computer.pressKey('Return');
    await sleep(2000);

    // Address bar (Ctrl+L) -> Google Docs home.
    await computer.keyCombo(['control', 'l']);
    await sleep(300);
    await computer.typeText('docs.google.com');
    await computer.pressKey('Return');
    await sleep(3000);

    // Two Tabs then Enter: intended to land on and activate the "Blank" tile.
    for (const key of ['Tab', 'Tab', 'Return']) {
      await computer.pressKey(key);
    }
    await sleep(3000);

    // File menu (Alt+F) -> 'r' for Rename, then overwrite the title.
    await computer.keyCombo(['alt', 'f']);
    await sleep(500);
    await computer.typeText('r');
    await sleep(500);
    await computer.keyCombo(['control', 'a']);
    await computer.typeText(title);
    await computer.pressKey('Return');
    await sleep(500);
    await computer.pressKey('Escape'); // Dismiss the dialog so focus returns to the document

    step.result = `📄 Created Google Doc: ${title}`;
  } else if (action === 'type') {
    // All arguments are re-joined on '|' so the text may contain the separator.
    // The cursor is expected to already be inside the document.
    const content = docArgs.join('|');
    await computer.typeText(content);
    step.result = `📄 Typed content in Google Doc`;
  } else {
    throw new Error(`Unknown google_docs command: ${docCmd}`);
  }
  break;
}
953
+
954
case 'research': {
  // Multi-step web research driven through the browser:
  //   1. Google the query and extract what the results page shows.
  //   2. Open a first result and extract its content.
  //   3. Go back, open another result and extract additional details.
  //   4. Ask the model to synthesize everything into one summary.
  // Format: research:<query>
  // Throws if the query is empty, before any keystrokes are sent.
  if (!params || !params.trim()) {
    throw new Error('research requires a query, e.g. "research:What are the latest AI trends in 2024?"');
  }
  const researchQuery = params;
  const researchResults: string[] = [];

  // Captures the current screen and asks the model to follow `instructions`
  // against that capture. The description is appended to the prompt because
  // the model otherwise receives no page content at all.
  const extractFromScreen = async (instructions: string) => {
    const screen = await describeScreen();
    const reply = await chat([{
      role: 'user',
      content: `${instructions}\n\nCurrent screen content:\n${screen.description}`,
    }]);
    return reply.content;
  };

  // Step 1: Open Chrome (Win+R) and go to Google
  await computer.keyCombo(['meta', 'r']);
  await sleep(500);
  await computer.typeText('chrome');
  await computer.pressKey('Return');
  await sleep(2000);

  await computer.keyCombo(['control', 'l']); // Focus address bar
  await sleep(300);
  await computer.typeText('google.com');
  await computer.pressKey('Return');
  await sleep(2000);

  // Type search query
  await computer.typeText(researchQuery);
  await computer.pressKey('Return');
  await sleep(3000);

  // Extract from the initial results page
  const initialResults = await extractFromScreen(
    `Extract the key information from these Google search results about: "${researchQuery}"\n` +
    'Include any relevant facts, numbers, dates, or key points visible. Be thorough but concise.'
  );
  researchResults.push(`Search Results:\n${initialResults}`);

  // Step 2: Open the first result (Tab to navigate, Enter to activate)
  await computer.pressKey('Tab');
  await sleep(200);
  await computer.pressKey('Tab');
  await sleep(200);
  await computer.pressKey('Return');
  await sleep(4000); // Wait for page load

  const pageContent = await extractFromScreen(
    `Extract the main content and key information from this webpage about: "${researchQuery}"\n` +
    'Ignore ads, navigation, footers. Focus on the actual article/content.'
  );
  researchResults.push(`\nSource 1 Content:\n${pageContent}`);

  // Step 3: Go back (Alt+Left) and check another source
  await computer.keyCombo(['alt', 'Left']);
  await sleep(2000);

  // Scroll down a bit to see more results
  await computer.scrollMouse(-3);
  await sleep(500);

  // Navigate to the second result
  await computer.pressKey('Tab');
  await computer.pressKey('Tab');
  await computer.pressKey('Tab');
  await computer.pressKey('Return');
  await sleep(4000);

  const pageContent2 = await extractFromScreen(
    `Extract additional information from this webpage about: "${researchQuery}"\n` +
    'Look for details not covered in the previous source.'
  );
  researchResults.push(`\nSource 2 Content:\n${pageContent2}`);

  // Step 4: Synthesize all gathered information
  const synthesis = await chat([{
    role: 'user',
    content: [
      `Based on the following research gathered about "${researchQuery}", provide a comprehensive summary:`,
      '',
      researchResults.join('\n\n'),
      '',
      'Create a well-organized summary with:',
      '1. Key findings',
      '2. Important details',
      '3. Any notable facts or statistics',
      '4. Conclusion',
      '',
      'Be thorough but concise.',
    ].join('\n'),
  }]);

  step.result = `🔬 Research Summary: ${researchQuery}\n\n${synthesis.content}`;
  break;
}
1045
+
599
1046
  case 'chat':
600
1047
  // This is a fallback - just describe what user wants
601
1048
  step.result = `Task noted: ${params}`;