@projectservan8n/cnapse 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -925,10 +925,10 @@ async function captureScreenFallback() {
925
925
  const { exec: exec6 } = await import("child_process");
926
926
  const { promisify: promisify6 } = await import("util");
927
927
  const { tmpdir } = await import("os");
928
- const { join: join2 } = await import("path");
929
- const { readFile, unlink } = await import("fs/promises");
928
+ const { join: join3 } = await import("path");
929
+ const { readFile: readFile2, unlink } = await import("fs/promises");
930
930
  const execAsync6 = promisify6(exec6);
931
- const tempFile = join2(tmpdir(), `cnapse-screen-${Date.now()}.png`);
931
+ const tempFile = join3(tmpdir(), `cnapse-screen-${Date.now()}.png`);
932
932
  try {
933
933
  const platform = process.platform;
934
934
  if (platform === "win32") {
@@ -947,7 +947,7 @@ async function captureScreenFallback() {
947
947
  } else {
948
948
  await execAsync6(`gnome-screenshot -f "${tempFile}" 2>/dev/null || scrot "${tempFile}" 2>/dev/null || import -window root "${tempFile}"`);
949
949
  }
950
- const imageBuffer = await readFile(tempFile);
950
+ const imageBuffer = await readFile2(tempFile);
951
951
  await unlink(tempFile).catch(() => {
952
952
  });
953
953
  return imageBuffer.toString("base64");
@@ -1238,6 +1238,56 @@ import { EventEmitter } from "events";
1238
1238
  import { exec as exec5 } from "child_process";
1239
1239
  import { promisify as promisify5 } from "util";
1240
1240
 
1241
+ // src/tools/filesystem.ts
1242
+ import { promises as fs } from "fs";
1243
+ import { join, dirname } from "path";
1244
/**
 * Read a file as UTF-8 text and wrap the outcome in a result object.
 *
 * @param path2 Path of the file to read.
 * @returns The value of `ok(content)` when the read succeeds, otherwise the
 *   value of `err(...)` carrying a "Failed to read file: ..." message. Read
 *   failures are reported through the returned value, not by rejecting.
 *
 * NOTE(review): `ok` and `err` are defined outside this block; presumably they
 * build the `{ success, output | error }` result objects - confirm.
 */
async function readFile(path2) {
  try {
    // Await inside the try so a rejected read lands in the catch below.
    return ok(await fs.readFile(path2, "utf-8"));
  } catch (failure) {
    const reason = failure.message;
    return err("Failed to read file: " + reason);
  }
}
1252
/**
 * Write `content` to `path2` as UTF-8, creating missing parent directories.
 *
 * @param path2 Destination file path.
 * @param content Data to write (a string in the visible call sites).
 * @returns The value of `ok("Written N bytes to <path>")` on success, or the
 *   value of `err("Failed to write file: ...")` when mkdir or the write fails.
 *
 * NOTE(review): `ok` and `err` are defined outside this block; presumably they
 * build the `{ success, output | error }` result objects - confirm.
 */
async function writeFile(path2, content) {
  try {
    // Ensure the target directory exists; `recursive` makes this a no-op
    // when it is already there.
    await fs.mkdir(dirname(path2), { recursive: true });
    await fs.writeFile(path2, content, "utf-8");
    // `content.length` counts UTF-16 code units, not bytes, so non-ASCII text
    // was misreported. Measure the encoded size for strings; keep the previous
    // `.length` behaviour for any other payload type.
    const byteCount = typeof content === "string" ? Buffer.byteLength(content, "utf-8") : content.length;
    return ok(`Written ${byteCount} bytes to ${path2}`);
  } catch (error) {
    return err(`Failed to write file: ${error.message}`);
  }
}
1262
/**
 * List the contents of a directory as a sorted, newline-separated string.
 *
 * Directory entries are suffixed with "/". Paths are shown relative to
 * `path2`, joined with "/". When `recursive` is true, sub-directories are
 * descended into depth-first.
 *
 * @param path2 Directory to list.
 * @param recursive Whether to descend into sub-directories (default false).
 * @returns The value of `ok(listing)` on success; the value of `err(...)` when
 *   `path2` is not a directory or any filesystem call fails.
 *
 * NOTE(review): `ok` and `err` are defined outside this block; presumably they
 * build the `{ success, output | error }` result objects - confirm.
 */
async function listDir(path2, recursive = false) {
  // Depth-first walk that appends display paths for `dir` into `sink`.
  const collect = async (dir, prefix, sink) => {
    const dirents = await fs.readdir(dir, { withFileTypes: true });
    for (const dirent of dirents) {
      // The root level has an empty prefix, so names are shown bare there.
      const shown = prefix ? `${prefix}/${dirent.name}` : dirent.name;
      if (!dirent.isDirectory()) {
        sink.push(shown);
        continue;
      }
      sink.push(`${shown}/`);
      if (recursive) {
        await collect(join(dir, dirent.name), shown, sink);
      }
    }
  };
  try {
    const info = await fs.stat(path2);
    if (!info.isDirectory()) {
      return err(`Not a directory: ${path2}`);
    }
    const listing = [];
    await collect(path2, "", listing);
    listing.sort();
    return ok(listing.join("\n"));
  } catch (error) {
    return err(`Failed to list directory: ${error.message}`);
  }
}
1290
+
1241
1291
  // src/tools/clipboard.ts
1242
1292
  import clipboardy from "clipboardy";
1243
1293
 
@@ -1748,6 +1798,65 @@ ${stderr}`
1748
1798
  }
1749
1799
 
1750
1800
  // src/services/telegram.ts
1801
/**
 * Convert common Markdown into Telegram MarkdownV2.
 *
 * Text that contains none of the characters * _ ` [ ] ( ) is returned
 * untouched with no parse mode. Otherwise code blocks, inline code,
 * **bold**, *italic* and [label](url) links are converted to their
 * MarkdownV2 form and every other reserved character is backslash-escaped.
 *
 * @param text Message text to convert.
 * @returns `{ text, parseMode }` where `parseMode` is "MarkdownV2" when the
 *   text was converted, or undefined when it should be sent as plain text
 *   (no markdown detected, or conversion threw).
 */
function formatForTelegram(text) {
  const hasMarkdown = /[*_`\[\]()]/.test(text);
  if (!hasMarkdown) {
    return { text, parseMode: void 0 };
  }
  try {
    // Characters MarkdownV2 reserves in ordinary text (backslash included).
    const reserved = /[\\_*\[\]()~`>#+\-=|{}.!]/g;
    const escapeText = (value) => value.replace(reserved, "\\$&");
    // Inside code entities only backtick and backslash need escaping.
    const escapeCode = (value) => value.replace(/[\\`]/g, "\\$&");
    // Inside the (...) part of a link only ")" and backslash need escaping.
    const escapeUrl = (value) => value.replace(/[\\)]/g, "\\$&");

    // Placeholders are NUL-delimited indices. They contain no reserved
    // character, so the escaping passes below cannot corrupt them (the
    // previous "__NAME_n__" tokens had their underscores escaped and were
    // therefore never restored).
    const MARK = "\u0000";
    const spans = [];
    const protect = (rendered) => {
      spans.push(rendered);
      return MARK + (spans.length - 1) + MARK;
    };

    // Drop stray NULs from the input so placeholders stay unambiguous.
    let formatted = text.split(MARK).join("");

    formatted = formatted.replace(/```([\s\S]*?)```/g, (_match, code) =>
      protect("```" + escapeCode(code) + "```")
    );
    formatted = formatted.replace(/`([^`]+)`/g, (_match, code) =>
      protect("`" + escapeCode(code) + "`")
    );
    // Markdown **bold** becomes MarkdownV2 *bold*; inner text is escaped.
    formatted = formatted.replace(/\*\*(.+?)\*\*/g, (_match, inner) =>
      protect("*" + escapeText(inner) + "*")
    );
    // Markdown *italic* (single asterisks only) becomes MarkdownV2 _italic_.
    formatted = formatted.replace(/(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)/g, (_match, inner) =>
      protect("_" + escapeText(inner) + "_")
    );
    formatted = formatted.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (_match, label, url) =>
      protect("[" + escapeText(label) + "](" + escapeUrl(url) + ")")
    );

    // Everything left is plain text: escape all reserved characters at once.
    formatted = escapeText(formatted);

    // Restore newest-first: a later span (e.g. bold) may wrap an earlier one
    // (e.g. inline code), so unwrapping in reverse exposes the inner
    // placeholder before its own turn. split/join inserts the text literally,
    // so "$&" or "$1" inside code is not treated as a replacement pattern.
    for (let index = spans.length - 1; index >= 0; index--) {
      formatted = formatted.split(MARK + index + MARK).join(spans[index]);
    }
    return { text: formatted, parseMode: "MarkdownV2" };
  } catch {
    // Best effort: fall back to sending the untouched text as plain text.
    return { text, parseMode: void 0 };
  }
}
1848
/**
 * Reply with `text`, using the parse mode chosen by `formatForTelegram`.
 *
 * When a parse mode is returned, the converted text is sent with that
 * `parse_mode`; otherwise the original text is sent as-is. If the send
 * throws or rejects for any reason, the original text is sent once more
 * without formatting options.
 *
 * @param ctx Object exposing `reply(text, options?)`.
 * @param text Message text to send.
 */
async function sendFormattedMessage(ctx, text) {
  const rendered = formatForTelegram(text);
  const sendPlain = () => ctx.reply(text);
  try {
    await (rendered.parseMode
      ? ctx.reply(rendered.text, { parse_mode: rendered.parseMode })
      : sendPlain());
  } catch {
    // Fallback attempt with the untouched text and no parse mode.
    await sendPlain();
  }
}
1751
1860
  var TelegramBotService = class extends EventEmitter {
1752
1861
  bot = null;
1753
1862
  isRunning = false;
@@ -1931,7 +2040,7 @@ ${result.error}
1931
2040
  await ctx.sendChatAction("typing");
1932
2041
  const computerControlResult = await this.tryComputerControl(userText);
1933
2042
  if (computerControlResult) {
1934
- await ctx.reply(computerControlResult);
2043
+ await sendFormattedMessage(ctx, computerControlResult);
1935
2044
  history.push({ role: "assistant", content: computerControlResult });
1936
2045
  return;
1937
2046
  }
@@ -1952,10 +2061,10 @@ ${result.error}
1952
2061
  if (responseText.length > 4e3) {
1953
2062
  const chunks = responseText.match(/.{1,4000}/gs) || [responseText];
1954
2063
  for (const chunk of chunks) {
1955
- await ctx.reply(chunk);
2064
+ await sendFormattedMessage(ctx, chunk);
1956
2065
  }
1957
2066
  } else {
1958
- await ctx.reply(responseText);
2067
+ await sendFormattedMessage(ctx, responseText);
1959
2068
  }
1960
2069
  } catch (error) {
1961
2070
  const errorMsg = error instanceof Error ? error.message : "Unknown error";
@@ -2150,14 +2259,14 @@ function useTelegram(onMessage) {
2150
2259
  import { useState as useState6, useCallback as useCallback4 } from "react";
2151
2260
 
2152
2261
  // src/lib/tasks.ts
2153
- import * as fs from "fs";
2262
+ import * as fs2 from "fs";
2154
2263
  import * as path from "path";
2155
2264
  import * as os2 from "os";
2156
2265
  var TASK_MEMORY_FILE = path.join(os2.homedir(), ".cnapse", "task-memory.json");
2157
2266
  function loadTaskMemory() {
2158
2267
  try {
2159
- if (fs.existsSync(TASK_MEMORY_FILE)) {
2160
- const data = fs.readFileSync(TASK_MEMORY_FILE, "utf-8");
2268
+ if (fs2.existsSync(TASK_MEMORY_FILE)) {
2269
+ const data = fs2.readFileSync(TASK_MEMORY_FILE, "utf-8");
2161
2270
  return JSON.parse(data);
2162
2271
  }
2163
2272
  } catch {
@@ -2184,10 +2293,10 @@ function saveTaskPattern(input, steps) {
2184
2293
  }
2185
2294
  memory.patterns = memory.patterns.sort((a, b) => b.successCount - a.successCount).slice(0, 100);
2186
2295
  const dir = path.dirname(TASK_MEMORY_FILE);
2187
- if (!fs.existsSync(dir)) {
2188
- fs.mkdirSync(dir, { recursive: true });
2296
+ if (!fs2.existsSync(dir)) {
2297
+ fs2.mkdirSync(dir, { recursive: true });
2189
2298
  }
2190
- fs.writeFileSync(TASK_MEMORY_FILE, JSON.stringify(memory, null, 2));
2299
+ fs2.writeFileSync(TASK_MEMORY_FILE, JSON.stringify(memory, null, 2));
2191
2300
  } catch {
2192
2301
  }
2193
2302
  }
@@ -2247,14 +2356,36 @@ Before outputting steps, THINK through these questions:
2247
2356
  - Typing too fast -> add small waits
2248
2357
 
2249
2358
  ## AVAILABLE ACTIONS
2359
+
2360
+ ### App Control
2250
2361
  - open_app: Open app via Run dialog (e.g., "open_app:notepad", "open_app:code", "open_app:chrome")
2362
+ - open_folder: Open VS Code with folder (e.g., "open_folder:E:/MyProject")
2363
+ - focus_window: Focus by title (e.g., "focus_window:Notepad")
2364
+
2365
+ ### Input
2251
2366
  - type_text: Type text string (e.g., "type_text:Hello World")
2252
2367
  - press_key: Single key (e.g., "press_key:enter", "press_key:escape", "press_key:tab")
2253
2368
  - key_combo: Key combination (e.g., "key_combo:control+s", "key_combo:alt+f4", "key_combo:meta+r")
2254
2369
  - click: Mouse click (e.g., "click:left", "click:right")
2370
+
2371
+ ### File Operations
2372
+ - read_file: Read file contents (e.g., "read_file:E:/test/index.html")
2373
+ - write_file: Write content to file (e.g., "write_file:E:/test/output.txt|Hello World")
2374
+ - list_files: List files in directory (e.g., "list_files:E:/test")
2375
+
2376
+ ### AI Coding
2377
+ - generate_code: AI generates code based on description (e.g., "generate_code:E:/test/index.html|create an HTML page with input on left, output on right")
2378
+ - edit_code: AI modifies existing code (e.g., "edit_code:E:/test/app.js|add error handling to the fetch calls")
2379
+
2380
+ ### Web Browsing
2381
+ - open_url: Open URL in default browser (e.g., "open_url:https://perplexity.ai")
2382
+ - browse_and_ask: Open AI website, type question, wait for response (e.g., "browse_and_ask:perplexity|What is the capital of France?")
2383
+ - browse_and_ask: Supports: perplexity, chatgpt, claude, google
2384
+
2385
+ ### Utility
2255
2386
  - wait: Wait N seconds (e.g., "wait:2" - use 1-3s for app loads)
2256
- - focus_window: Focus by title (e.g., "focus_window:Notepad")
2257
2387
  - screenshot: Capture and describe screen
2388
+ - shell: Run shell command (e.g., "shell:npm install")
2258
2389
  ${learnedExamples}
2259
2390
  ## EXAMPLES WITH REASONING
2260
2391
 
@@ -2297,6 +2428,59 @@ Output:
2297
2428
  { "description": "Close active window with Alt+F4", "action": "key_combo:alt+f4" }
2298
2429
  ]
2299
2430
 
2431
+ ### Example 4: "open folder E:/Test in vscode and create an HTML editor"
2432
+ Thinking:
2433
+ - Goal: Open VS Code with folder, then create/edit HTML file to be an editor
2434
+ - How: Use open_folder to launch VS Code with the folder, then use AI to generate code
2435
+ - Sequence: Open folder -> List files to see what exists -> Generate/edit the HTML
2436
+ - Edge case: File might not exist yet
2437
+
2438
+ Output:
2439
+ [
2440
+ { "description": "Open VS Code with the Test folder", "action": "open_folder:E:/Test" },
2441
+ { "description": "Wait for VS Code to load", "action": "wait:3" },
2442
+ { "description": "List files in the folder", "action": "list_files:E:/Test" },
2443
+ { "description": "Generate HTML editor code", "action": "generate_code:E:/Test/editor.html|Create an HTML page with a code editor layout: textarea input on the left side, live preview output on the right side. Include basic CSS for split layout and JavaScript to update preview on input." }
2444
+ ]
2445
+
2446
+ ### Example 5: "read the config.json and add a new setting"
2447
+ Thinking:
2448
+ - Goal: Read existing file, understand it, modify it
2449
+ - How: read_file to get contents, then edit_code to modify
2450
+ - Sequence: Read first, then edit
2451
+
2452
+ Output:
2453
+ [
2454
+ { "description": "Read the config file", "action": "read_file:config.json" },
2455
+ { "description": "Add new setting to config", "action": "edit_code:config.json|add a new setting called 'darkMode' with value true" }
2456
+ ]
2457
+
2458
+ ### Example 6: "ask perplexity what is the best programming language"
2459
+ Thinking:
2460
+ - Goal: Open Perplexity AI in browser and ask a question
2461
+ - How: Use browse_and_ask with perplexity target
2462
+ - Sequence: Open site -> type question -> wait for response -> screenshot result
2463
+
2464
+ Output:
2465
+ [
2466
+ { "description": "Ask Perplexity the question", "action": "browse_and_ask:perplexity|what is the best programming language" },
2467
+ { "description": "Wait for response to generate", "action": "wait:5" },
2468
+ { "description": "Capture the response", "action": "screenshot" }
2469
+ ]
2470
+
2471
+ ### Example 7: "search google for weather today"
2472
+ Thinking:
2473
+ - Goal: Open Google and search for something
2474
+ - How: Use browse_and_ask with google target
2475
+ - Sequence: Open Google, search, capture results
2476
+
2477
+ Output:
2478
+ [
2479
+ { "description": "Search Google", "action": "browse_and_ask:google|weather today" },
2480
+ { "description": "Wait for results", "action": "wait:2" },
2481
+ { "description": "Capture search results", "action": "screenshot" }
2482
+ ]
2483
+
2300
2484
  ## YOUR TASK
2301
2485
  Now parse this request: "${input}"
2302
2486
 
@@ -2386,6 +2570,168 @@ async function executeStep(step) {
2386
2570
  await focusWindow(params);
2387
2571
  step.result = `Focused window: ${params}`;
2388
2572
  break;
2573
+ case "open_folder":
2574
+ await runCommand(`code "${params}"`, 1e4);
2575
+ step.result = `Opened VS Code with folder: ${params}`;
2576
+ break;
2577
+ case "read_file": {
2578
+ const readResult = await readFile(params);
2579
+ if (readResult.success) {
2580
+ step.result = readResult.output;
2581
+ } else {
2582
+ throw new Error(readResult.error || "Failed to read file");
2583
+ }
2584
+ break;
2585
+ }
2586
+ case "write_file": {
2587
+ const [filePath, ...contentParts] = params.split("|");
2588
+ const content = contentParts.join("|");
2589
+ const writeResult = await writeFile(filePath, content);
2590
+ if (writeResult.success) {
2591
+ step.result = `Written to ${filePath}`;
2592
+ } else {
2593
+ throw new Error(writeResult.error || "Failed to write file");
2594
+ }
2595
+ break;
2596
+ }
2597
+ case "list_files": {
2598
+ const listResult = await listDir(params, false);
2599
+ if (listResult.success) {
2600
+ step.result = listResult.output;
2601
+ } else {
2602
+ throw new Error(listResult.error || "Failed to list files");
2603
+ }
2604
+ break;
2605
+ }
2606
+ case "generate_code": {
2607
+ const [codePath, ...descParts] = params.split("|");
2608
+ const codeDescription = descParts.join("|");
2609
+ const codePrompt = `Generate complete, working code for this request. Output ONLY the code, no explanations or markdown:
2610
+
2611
+ Request: ${codeDescription}
2612
+
2613
+ File: ${codePath}`;
2614
+ const codeResponse = await chat([{ role: "user", content: codePrompt }]);
2615
+ let generatedCode = codeResponse.content;
2616
+ generatedCode = generatedCode.replace(/^```[\w]*\n?/gm, "").replace(/\n?```$/gm, "").trim();
2617
+ const genResult = await writeFile(codePath, generatedCode);
2618
+ if (genResult.success) {
2619
+ step.result = `Generated and saved code to ${codePath}`;
2620
+ } else {
2621
+ throw new Error(genResult.error || "Failed to write generated code");
2622
+ }
2623
+ break;
2624
+ }
2625
+ case "edit_code": {
2626
+ const [editPath, ...instrParts] = params.split("|");
2627
+ const instructions = instrParts.join("|");
2628
+ const existingResult = await readFile(editPath);
2629
+ if (!existingResult.success) {
2630
+ throw new Error(`Cannot read file: ${existingResult.error}`);
2631
+ }
2632
+ const editPrompt = `Edit this code according to the instructions. Output ONLY the complete modified code, no explanations or markdown:
2633
+
2634
+ Instructions: ${instructions}
2635
+
2636
+ Current code:
2637
+ ${existingResult.output}`;
2638
+ const editResponse = await chat([{ role: "user", content: editPrompt }]);
2639
+ let editedCode = editResponse.content;
2640
+ editedCode = editedCode.replace(/^```[\w]*\n?/gm, "").replace(/\n?```$/gm, "").trim();
2641
+ const editWriteResult = await writeFile(editPath, editedCode);
2642
+ if (editWriteResult.success) {
2643
+ step.result = `Edited and saved ${editPath}`;
2644
+ } else {
2645
+ throw new Error(editWriteResult.error || "Failed to write edited code");
2646
+ }
2647
+ break;
2648
+ }
2649
+ case "shell": {
2650
+ const shellResult = await runCommand(params, 3e4);
2651
+ if (shellResult.success) {
2652
+ step.result = shellResult.output || "Command completed";
2653
+ } else {
2654
+ throw new Error(shellResult.error || "Command failed");
2655
+ }
2656
+ break;
2657
+ }
2658
+ case "open_url": {
2659
+ const url = params.startsWith("http") ? params : `https://${params}`;
2660
+ if (process.platform === "win32") {
2661
+ await runCommand(`start "" "${url}"`, 5e3);
2662
+ } else if (process.platform === "darwin") {
2663
+ await runCommand(`open "${url}"`, 5e3);
2664
+ } else {
2665
+ await runCommand(`xdg-open "${url}"`, 5e3);
2666
+ }
2667
+ step.result = `Opened ${url} in browser`;
2668
+ break;
2669
+ }
2670
+ case "browse_and_ask": {
2671
+ const [site, ...questionParts] = params.split("|");
2672
+ const question = questionParts.join("|");
2673
+ const sites = {
2674
+ perplexity: { url: "https://www.perplexity.ai", loadTime: 3, responseTime: 10 },
2675
+ chatgpt: { url: "https://chat.openai.com", loadTime: 4, responseTime: 15 },
2676
+ claude: { url: "https://claude.ai", loadTime: 4, responseTime: 15 },
2677
+ google: { url: "https://www.google.com", loadTime: 2, responseTime: 3 },
2678
+ bing: { url: "https://www.bing.com", loadTime: 2, responseTime: 3 },
2679
+ bard: { url: "https://bard.google.com", loadTime: 3, responseTime: 12 },
2680
+ copilot: { url: "https://copilot.microsoft.com", loadTime: 3, responseTime: 12 }
2681
+ };
2682
+ const siteConfig = sites[site.toLowerCase()] || { url: `https://${site}`, loadTime: 3, responseTime: 10 };
2683
+ if (process.platform === "win32") {
2684
+ await runCommand(`start "" "${siteConfig.url}"`, 5e3);
2685
+ } else if (process.platform === "darwin") {
2686
+ await runCommand(`open "${siteConfig.url}"`, 5e3);
2687
+ } else {
2688
+ await runCommand(`xdg-open "${siteConfig.url}"`, 5e3);
2689
+ }
2690
+ await sleep(siteConfig.loadTime * 1e3);
2691
+ await typeText(question);
2692
+ await sleep(300);
2693
+ await pressKey("Return");
2694
+ await sleep(siteConfig.responseTime * 1e3);
2695
+ const extractedParts = [];
2696
+ const maxScrolls = 5;
2697
+ for (let scrollIndex = 0; scrollIndex < maxScrolls; scrollIndex++) {
2698
+ const screenResult = await describeScreen();
2699
+ const extractPrompt = `You are looking at screenshot ${scrollIndex + 1} of ${site}. The user asked: "${question}"
2700
+
2701
+ Extract ONLY the AI's response/answer text visible on screen. Do NOT include:
2702
+ - The user's question
2703
+ - Any UI elements, buttons, navigation, or headers
2704
+ - Any disclaimers, suggestions, or "related questions"
2705
+ - Any "Sources" or citation links
2706
+ - Any text you already extracted (avoid duplicates)
2707
+
2708
+ ${scrollIndex > 0 ? `Previous parts already extracted:
2709
+ ${extractedParts.join("\n---\n")}
2710
+
2711
+ Only extract NEW text that continues from where we left off.` : ""}
2712
+
2713
+ Just give me the actual answer text, word for word as it appears. If there's no more response text visible, respond with exactly: "END_OF_RESPONSE"`;
2714
+ const extractResponse = await chat([{ role: "user", content: extractPrompt }]);
2715
+ const extracted = extractResponse.content.trim();
2716
+ if (extracted === "END_OF_RESPONSE" || extracted.includes("END_OF_RESPONSE")) {
2717
+ break;
2718
+ }
2719
+ if (extracted.toLowerCase().includes("response not ready") || extracted.toLowerCase().includes("no response visible") || extracted.toLowerCase().includes("no additional text")) {
2720
+ if (scrollIndex === 0) {
2721
+ extractedParts.push("Response not ready yet or page still loading.");
2722
+ }
2723
+ break;
2724
+ }
2725
+ extractedParts.push(extracted);
2726
+ await scrollMouse(-5);
2727
+ await sleep(1e3);
2728
+ }
2729
+ const fullResponse = extractedParts.join("\n\n");
2730
+ step.result = `\u{1F4DD} ${site.charAt(0).toUpperCase() + site.slice(1)} says:
2731
+
2732
+ ${fullResponse}`;
2733
+ break;
2734
+ }
2389
2735
  case "screenshot":
2390
2736
  const vision = await describeScreen();
2391
2737
  step.result = vision.description;
@@ -2442,8 +2788,8 @@ function getTaskMemoryStats() {
2442
2788
  }
2443
2789
  function clearTaskMemory() {
2444
2790
  try {
2445
- if (fs.existsSync(TASK_MEMORY_FILE)) {
2446
- fs.unlinkSync(TASK_MEMORY_FILE);
2791
+ if (fs2.existsSync(TASK_MEMORY_FILE)) {
2792
+ fs2.unlinkSync(TASK_MEMORY_FILE);
2447
2793
  }
2448
2794
  } catch {
2449
2795
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@projectservan8n/cnapse",
3
- "version": "0.6.1",
3
+ "version": "0.6.3",
4
4
  "description": "Autonomous PC intelligence - AI assistant for desktop automation",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
package/src/lib/tasks.ts CHANGED
@@ -7,6 +7,8 @@
7
7
  import { chat, Message } from './api.js';
8
8
  import * as computer from '../tools/computer.js';
9
9
  import { describeScreen } from './vision.js';
10
+ import * as filesystem from '../tools/filesystem.js';
11
+ import { runCommand } from '../tools/shell.js';
10
12
  import * as fs from 'fs';
11
13
  import * as path from 'path';
12
14
  import * as os from 'os';
@@ -185,14 +187,36 @@ Before outputting steps, THINK through these questions:
185
187
  - Typing too fast -> add small waits
186
188
 
187
189
  ## AVAILABLE ACTIONS
190
+
191
+ ### App Control
188
192
  - open_app: Open app via Run dialog (e.g., "open_app:notepad", "open_app:code", "open_app:chrome")
193
+ - open_folder: Open VS Code with folder (e.g., "open_folder:E:/MyProject")
194
+ - focus_window: Focus by title (e.g., "focus_window:Notepad")
195
+
196
+ ### Input
189
197
  - type_text: Type text string (e.g., "type_text:Hello World")
190
198
  - press_key: Single key (e.g., "press_key:enter", "press_key:escape", "press_key:tab")
191
199
  - key_combo: Key combination (e.g., "key_combo:control+s", "key_combo:alt+f4", "key_combo:meta+r")
192
200
  - click: Mouse click (e.g., "click:left", "click:right")
201
+
202
+ ### File Operations
203
+ - read_file: Read file contents (e.g., "read_file:E:/test/index.html")
204
+ - write_file: Write content to file (e.g., "write_file:E:/test/output.txt|Hello World")
205
+ - list_files: List files in directory (e.g., "list_files:E:/test")
206
+
207
+ ### AI Coding
208
+ - generate_code: AI generates code based on description (e.g., "generate_code:E:/test/index.html|create an HTML page with input on left, output on right")
209
+ - edit_code: AI modifies existing code (e.g., "edit_code:E:/test/app.js|add error handling to the fetch calls")
210
+
211
+ ### Web Browsing
212
+ - open_url: Open URL in default browser (e.g., "open_url:https://perplexity.ai")
213
+ - browse_and_ask: Open AI website, type question, wait for response (e.g., "browse_and_ask:perplexity|What is the capital of France?")
214
+ - browse_and_ask: Supports: perplexity, chatgpt, claude, google
215
+
216
+ ### Utility
193
217
  - wait: Wait N seconds (e.g., "wait:2" - use 1-3s for app loads)
194
- - focus_window: Focus by title (e.g., "focus_window:Notepad")
195
218
  - screenshot: Capture and describe screen
219
+ - shell: Run shell command (e.g., "shell:npm install")
196
220
  ${learnedExamples}
197
221
  ## EXAMPLES WITH REASONING
198
222
 
@@ -235,6 +259,59 @@ Output:
235
259
  { "description": "Close active window with Alt+F4", "action": "key_combo:alt+f4" }
236
260
  ]
237
261
 
262
+ ### Example 4: "open folder E:/Test in vscode and create an HTML editor"
263
+ Thinking:
264
+ - Goal: Open VS Code with folder, then create/edit HTML file to be an editor
265
+ - How: Use open_folder to launch VS Code with the folder, then use AI to generate code
266
+ - Sequence: Open folder -> List files to see what exists -> Generate/edit the HTML
267
+ - Edge case: File might not exist yet
268
+
269
+ Output:
270
+ [
271
+ { "description": "Open VS Code with the Test folder", "action": "open_folder:E:/Test" },
272
+ { "description": "Wait for VS Code to load", "action": "wait:3" },
273
+ { "description": "List files in the folder", "action": "list_files:E:/Test" },
274
+ { "description": "Generate HTML editor code", "action": "generate_code:E:/Test/editor.html|Create an HTML page with a code editor layout: textarea input on the left side, live preview output on the right side. Include basic CSS for split layout and JavaScript to update preview on input." }
275
+ ]
276
+
277
+ ### Example 5: "read the config.json and add a new setting"
278
+ Thinking:
279
+ - Goal: Read existing file, understand it, modify it
280
+ - How: read_file to get contents, then edit_code to modify
281
+ - Sequence: Read first, then edit
282
+
283
+ Output:
284
+ [
285
+ { "description": "Read the config file", "action": "read_file:config.json" },
286
+ { "description": "Add new setting to config", "action": "edit_code:config.json|add a new setting called 'darkMode' with value true" }
287
+ ]
288
+
289
+ ### Example 6: "ask perplexity what is the best programming language"
290
+ Thinking:
291
+ - Goal: Open Perplexity AI in browser and ask a question
292
+ - How: Use browse_and_ask with perplexity target
293
+ - Sequence: Open site -> type question -> wait for response -> screenshot result
294
+
295
+ Output:
296
+ [
297
+ { "description": "Ask Perplexity the question", "action": "browse_and_ask:perplexity|what is the best programming language" },
298
+ { "description": "Wait for response to generate", "action": "wait:5" },
299
+ { "description": "Capture the response", "action": "screenshot" }
300
+ ]
301
+
302
+ ### Example 7: "search google for weather today"
303
+ Thinking:
304
+ - Goal: Open Google and search for something
305
+ - How: Use browse_and_ask with google target
306
+ - Sequence: Open Google, search, capture results
307
+
308
+ Output:
309
+ [
310
+ { "description": "Search Google", "action": "browse_and_ask:google|weather today" },
311
+ { "description": "Wait for results", "action": "wait:2" },
312
+ { "description": "Capture search results", "action": "screenshot" }
313
+ ]
314
+
238
315
  ## YOUR TASK
239
316
  Now parse this request: "${input}"
240
317
 
@@ -349,6 +426,225 @@ async function executeStep(step: TaskStep): Promise<void> {
349
426
  step.result = `Focused window: ${params}`;
350
427
  break;
351
428
 
429
+ case 'open_folder':
430
+ // Open VS Code with a specific folder
431
+ await runCommand(`code "${params}"`, 10000);
432
+ step.result = `Opened VS Code with folder: ${params}`;
433
+ break;
434
+
435
+ case 'read_file': {
436
+ const readResult = await filesystem.readFile(params);
437
+ if (readResult.success) {
438
+ step.result = readResult.output;
439
+ } else {
440
+ throw new Error(readResult.error || 'Failed to read file');
441
+ }
442
+ break;
443
+ }
444
+
445
+ case 'write_file': {
446
+ // Format: write_file:path|content
447
+ const [filePath, ...contentParts] = params.split('|');
448
+ const content = contentParts.join('|');
449
+ const writeResult = await filesystem.writeFile(filePath, content);
450
+ if (writeResult.success) {
451
+ step.result = `Written to ${filePath}`;
452
+ } else {
453
+ throw new Error(writeResult.error || 'Failed to write file');
454
+ }
455
+ break;
456
+ }
457
+
458
+ case 'list_files': {
459
+ const listResult = await filesystem.listDir(params, false);
460
+ if (listResult.success) {
461
+ step.result = listResult.output;
462
+ } else {
463
+ throw new Error(listResult.error || 'Failed to list files');
464
+ }
465
+ break;
466
+ }
467
+
468
+ case 'generate_code': {
469
+ // Format: generate_code:path|description
470
+ const [codePath, ...descParts] = params.split('|');
471
+ const codeDescription = descParts.join('|');
472
+
473
+ // Ask AI to generate the code
474
+ const codePrompt = `Generate complete, working code for this request. Output ONLY the code, no explanations or markdown:
475
+
476
+ Request: ${codeDescription}
477
+
478
+ File: ${codePath}`;
479
+
480
+ const codeResponse = await chat([{ role: 'user', content: codePrompt }]);
481
+ let generatedCode = codeResponse.content;
482
+
483
+ // Strip markdown code blocks if present
484
+ generatedCode = generatedCode.replace(/^```[\w]*\n?/gm, '').replace(/\n?```$/gm, '').trim();
485
+
486
+ // Write the generated code to file
487
+ const genResult = await filesystem.writeFile(codePath, generatedCode);
488
+ if (genResult.success) {
489
+ step.result = `Generated and saved code to ${codePath}`;
490
+ } else {
491
+ throw new Error(genResult.error || 'Failed to write generated code');
492
+ }
493
+ break;
494
+ }
495
+
496
+ case 'edit_code': {
497
+ // Format: edit_code:path|instructions
498
+ const [editPath, ...instrParts] = params.split('|');
499
+ const instructions = instrParts.join('|');
500
+
501
+ // Read existing file
502
+ const existingResult = await filesystem.readFile(editPath);
503
+ if (!existingResult.success) {
504
+ throw new Error(`Cannot read file: ${existingResult.error}`);
505
+ }
506
+
507
+ // Ask AI to edit the code
508
+ const editPrompt = `Edit this code according to the instructions. Output ONLY the complete modified code, no explanations or markdown:
509
+
510
+ Instructions: ${instructions}
511
+
512
+ Current code:
513
+ ${existingResult.output}`;
514
+
515
+ const editResponse = await chat([{ role: 'user', content: editPrompt }]);
516
+ let editedCode = editResponse.content;
517
+
518
+ // Strip markdown code blocks if present
519
+ editedCode = editedCode.replace(/^```[\w]*\n?/gm, '').replace(/\n?```$/gm, '').trim();
520
+
521
+ // Write the edited code back
522
+ const editWriteResult = await filesystem.writeFile(editPath, editedCode);
523
+ if (editWriteResult.success) {
524
+ step.result = `Edited and saved ${editPath}`;
525
+ } else {
526
+ throw new Error(editWriteResult.error || 'Failed to write edited code');
527
+ }
528
+ break;
529
+ }
530
+
531
+ case 'shell': {
532
+ const shellResult = await runCommand(params, 30000);
533
+ if (shellResult.success) {
534
+ step.result = shellResult.output || 'Command completed';
535
+ } else {
536
+ throw new Error(shellResult.error || 'Command failed');
537
+ }
538
+ break;
539
+ }
540
+
541
+ case 'open_url': {
542
+ // Open URL in default browser
543
+ const url = params.startsWith('http') ? params : `https://${params}`;
544
+ if (process.platform === 'win32') {
545
+ await runCommand(`start "" "${url}"`, 5000);
546
+ } else if (process.platform === 'darwin') {
547
+ await runCommand(`open "${url}"`, 5000);
548
+ } else {
549
+ await runCommand(`xdg-open "${url}"`, 5000);
550
+ }
551
+ step.result = `Opened ${url} in browser`;
552
+ break;
553
+ }
554
+
555
+ case 'browse_and_ask': {
556
+ // Format: browse_and_ask:site|question
557
+ const [site, ...questionParts] = params.split('|');
558
+ const question = questionParts.join('|');
559
+
560
+ // Site-specific URLs and response wait times
561
+ const sites: Record<string, { url: string; loadTime: number; responseTime: number }> = {
562
+ perplexity: { url: 'https://www.perplexity.ai', loadTime: 3, responseTime: 10 },
563
+ chatgpt: { url: 'https://chat.openai.com', loadTime: 4, responseTime: 15 },
564
+ claude: { url: 'https://claude.ai', loadTime: 4, responseTime: 15 },
565
+ google: { url: 'https://www.google.com', loadTime: 2, responseTime: 3 },
566
+ bing: { url: 'https://www.bing.com', loadTime: 2, responseTime: 3 },
567
+ bard: { url: 'https://bard.google.com', loadTime: 3, responseTime: 12 },
568
+ copilot: { url: 'https://copilot.microsoft.com', loadTime: 3, responseTime: 12 },
569
+ };
570
+
571
+ const siteConfig = sites[site.toLowerCase()] || { url: `https://${site}`, loadTime: 3, responseTime: 10 };
572
+
573
+ // Open the site
574
+ if (process.platform === 'win32') {
575
+ await runCommand(`start "" "${siteConfig.url}"`, 5000);
576
+ } else if (process.platform === 'darwin') {
577
+ await runCommand(`open "${siteConfig.url}"`, 5000);
578
+ } else {
579
+ await runCommand(`xdg-open "${siteConfig.url}"`, 5000);
580
+ }
581
+
582
+ // Wait for page to load
583
+ await sleep(siteConfig.loadTime * 1000);
584
+
585
+ // Type the question (most sites have autofocus on search/input)
586
+ await computer.typeText(question);
587
+ await sleep(300);
588
+
589
+ // Press Enter to submit
590
+ await computer.pressKey('Return');
591
+
592
+ // Wait for AI to generate response
593
+ await sleep(siteConfig.responseTime * 1000);
594
+
595
+ // Capture multiple screenshots by scrolling to get full response
596
+ const extractedParts: string[] = [];
597
+ const maxScrolls = 5; // Maximum number of scroll captures
598
+
599
+ for (let scrollIndex = 0; scrollIndex < maxScrolls; scrollIndex++) {
600
+ // Capture current view
601
+ const screenResult = await describeScreen();
602
+
603
+ // Ask AI to extract just the response text from what it sees
604
+ const extractPrompt = `You are looking at screenshot ${scrollIndex + 1} of ${site}. The user asked: "${question}"
605
+
606
+ Extract ONLY the AI's response/answer text visible on screen. Do NOT include:
607
+ - The user's question
608
+ - Any UI elements, buttons, navigation, or headers
609
+ - Any disclaimers, suggestions, or "related questions"
610
+ - Any "Sources" or citation links
611
+ - Any text you already extracted (avoid duplicates)
612
+
613
+ ${scrollIndex > 0 ? `Previous parts already extracted:\n${extractedParts.join('\n---\n')}\n\nOnly extract NEW text that continues from where we left off.` : ''}
614
+
615
+ Just give me the actual answer text, word for word as it appears. If there's no more response text visible, respond with exactly: "END_OF_RESPONSE"`;
616
+
617
+ const extractResponse = await chat([{ role: 'user', content: extractPrompt }]);
618
+ const extracted = extractResponse.content.trim();
619
+
620
+ // Check if we've reached the end
621
+ if (extracted === 'END_OF_RESPONSE' || extracted.includes('END_OF_RESPONSE')) {
622
+ break;
623
+ }
624
+
625
+ // Check for "no response" indicators
626
+ if (extracted.toLowerCase().includes('response not ready') ||
627
+ extracted.toLowerCase().includes('no response visible') ||
628
+ extracted.toLowerCase().includes('no additional text')) {
629
+ if (scrollIndex === 0) {
630
+ extractedParts.push('Response not ready yet or page still loading.');
631
+ }
632
+ break;
633
+ }
634
+
635
+ extractedParts.push(extracted);
636
+
637
+ // Scroll down to see more content
638
+ await computer.scrollMouse(-5); // Scroll down
639
+ await sleep(1000); // Wait for scroll animation
640
+ }
641
+
642
+ // Combine all extracted parts
643
+ const fullResponse = extractedParts.join('\n\n');
644
+ step.result = `📝 ${site.charAt(0).toUpperCase() + site.slice(1)} says:\n\n${fullResponse}`;
645
+ break;
646
+ }
647
+
352
648
  case 'screenshot':
353
649
  const vision = await describeScreen();
354
650
  step.result = vision.description;
@@ -23,6 +23,101 @@ export interface TelegramBotEvents {
23
23
  stopped: () => void;
24
24
  }
25
25
 
26
/**
 * Convert common Markdown into Telegram MarkdownV2.
 *
 * Supported constructs: fenced code blocks, inline code, `[label](url)` links,
 * `**bold**` (emitted as `*bold*`) and `*italic*` (emitted as `_italic_`).
 * Everything else is treated as literal text and every MarkdownV2 special
 * character in it (including backslash and underscore) is escaped.
 *
 * @param text Raw message text, possibly containing Markdown.
 * @returns The text to send and the parse mode to send it with. `parseMode`
 *   is `undefined` (and `text` is returned untouched) when the input contains
 *   no Markdown-looking characters or cannot be converted safely.
 */
function formatForTelegram(text: string): { text: string; parseMode: 'MarkdownV2' | undefined } {
  // Nothing that looks like markdown: send as plain text, no escaping needed.
  if (!/[*_`\[\]()]/.test(text)) {
    return { text, parseMode: undefined };
  }

  // Protected spans are swapped for tokens built from private-use code points
  // and digits only. None of these are MarkdownV2 special characters, so the
  // escaping pass cannot corrupt a token (the previous `__NAME_n__` tokens
  // were mangled by underscore escaping and never restored).
  const TOKEN_OPEN = '\uE000';
  const TOKEN_CLOSE = '\uE001';
  if (text.includes(TOKEN_OPEN) || text.includes(TOKEN_CLOSE)) {
    // Input already contains our sentinels; tokens would be ambiguous.
    return { text, parseMode: undefined };
  }

  try {
    const rendered: string[] = [];

    // Store finished MarkdownV2 markup and return the token that stands in for it.
    const protect = (markup: string): string => {
      rendered.push(markup);
      return `${TOKEN_OPEN}${rendered.length - 1}${TOKEN_CLOSE}`;
    };

    // Escape rules differ by context in MarkdownV2:
    // - regular text: every special character, including the backslash itself
    // - code / pre entities: only '`' and '\'
    // - the (...) part of a link: only ')' and '\'
    const escapeText = (s: string): string => s.replace(/[\\_*\[\]()~`>#+\-=|{}.!]/g, '\\$&');
    const escapeCode = (s: string): string => s.replace(/[\\`]/g, '\\$&');
    const escapeLinkUrl = (s: string): string => s.replace(/[\\)]/g, '\\$&');

    // Convert links, bold and italic inside `segment`, then escape whatever
    // literal text is left. Inner text is rendered recursively so that nested
    // formatting and special characters inside an entity are handled too.
    const renderInline = (segment: string): string => {
      const withLinks = segment.replace(
        /\[([^\]]+)\]\(([^)]+)\)/g,
        (_match: string, label: string, url: string) =>
          protect(`[${renderInline(label)}](${escapeLinkUrl(url)})`),
      );
      const withBold = withLinks.replace(
        /\*\*(.+?)\*\*/g,
        (_match: string, inner: string) => protect(`*${renderInline(inner)}*`),
      );
      // Single asterisks only; doubled asterisks were consumed as bold above.
      const withItalic = withBold.replace(
        /(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)/g,
        (_match: string, inner: string) => protect(`_${renderInline(inner)}_`),
      );
      return escapeText(withItalic);
    };

    // Expand tokens back into their markup. Stored markup may itself contain
    // tokens (e.g. inline code inside bold), hence the recursion. A replacer
    // function is used so `$&`-style sequences in user content stay literal.
    const restore = (s: string): string =>
      s.replace(/\uE000(\d+)\uE001/g, (_match: string, index: string) =>
        restore(rendered[Number(index)] ?? ''),
      );

    // Code is protected first so markdown-looking characters inside it are
    // never interpreted as formatting.
    const withoutCode = text
      .replace(/```([\s\S]*?)```/g, (_match: string, code: string) =>
        protect('```' + escapeCode(code) + '```'),
      )
      .replace(/`([^`]+)`/g, (_match: string, code: string) =>
        protect('`' + escapeCode(code) + '`'),
      );

    return { text: restore(renderInline(withoutCode)), parseMode: 'MarkdownV2' };
  } catch {
    // Best effort: if conversion fails for any reason, fall back to plain text.
    return { text, parseMode: undefined };
  }
}
102
+
103
/**
 * Reply to a Telegram context with `text`, using MarkdownV2 when
 * `formatForTelegram` produced a formatted version.
 *
 * If the formatted send is rejected, the original text is sent once more as
 * plain text. A failure of a plain-text send is not retried and propagates to
 * the caller, so the same message is never sent twice.
 *
 * @param ctx Bot context exposing `reply(text, options?)`.
 * @param text Raw message text to deliver.
 */
async function sendFormattedMessage(ctx: any, text: string): Promise<void> {
  const { text: formatted, parseMode } = formatForTelegram(text);

  // No formatting applies: a single plain send, errors surface to the caller.
  if (!parseMode) {
    await ctx.reply(text);
    return;
  }

  try {
    await ctx.reply(formatted, { parse_mode: parseMode });
  } catch {
    // The formatted send failed; fall back to the unformatted text.
    await ctx.reply(text);
  }
}
120
+
26
121
  export class TelegramBotService extends EventEmitter {
27
122
  private bot: any = null;
28
123
  private isRunning = false;
@@ -252,7 +347,7 @@ export class TelegramBotService extends EventEmitter {
252
347
  // Check if this looks like a computer control request
253
348
  const computerControlResult = await this.tryComputerControl(userText);
254
349
  if (computerControlResult) {
255
- await ctx.reply(computerControlResult);
350
+ await sendFormattedMessage(ctx, computerControlResult);
256
351
  history.push({ role: 'assistant', content: computerControlResult });
257
352
  return;
258
353
  }
@@ -276,16 +371,16 @@ export class TelegramBotService extends EventEmitter {
276
371
  // Add assistant response to history
277
372
  history.push({ role: 'assistant', content: response.content });
278
373
 
279
- // Send response (split if too long for Telegram)
374
+ // Send response with proper formatting (split if too long for Telegram)
280
375
  const responseText = response.content || '(no response)';
281
376
  if (responseText.length > 4000) {
282
377
  // Split into chunks
283
378
  const chunks = responseText.match(/.{1,4000}/gs) || [responseText];
284
379
  for (const chunk of chunks) {
285
- await ctx.reply(chunk);
380
+ await sendFormattedMessage(ctx, chunk);
286
381
  }
287
382
  } else {
288
- await ctx.reply(responseText);
383
+ await sendFormattedMessage(ctx, responseText);
289
384
  }
290
385
  } catch (error) {
291
386
  const errorMsg = error instanceof Error ? error.message : 'Unknown error';