@projectservan8n/cnapse 0.5.6 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -745,134 +745,163 @@ async function chatOpenAI(messages, model) {
  const content = data.choices?.[0]?.message?.content || "";
  return { content, model };
  }
-
- // src/lib/screen.ts
- import { exec as exec3 } from "child_process";
- import { promisify as promisify3 } from "util";
- var execAsync3 = promisify3(exec3);
- async function getScreenDescription() {
- try {
- const platform = process.platform;
- if (platform === "win32") {
- const { stdout } = await execAsync3(`
- Add-Type -AssemblyName System.Windows.Forms
- $screen = [System.Windows.Forms.Screen]::PrimaryScreen.Bounds
- Write-Output "$($screen.Width)x$($screen.Height)"
- `, { shell: "powershell.exe" });
- return `Screen ${stdout.trim()} captured`;
- } else if (platform === "darwin") {
- const { stdout } = await execAsync3(`system_profiler SPDisplaysDataType | grep Resolution | head -1`);
- return `Screen ${stdout.trim()}`;
- } else {
- const { stdout } = await execAsync3(`xdpyinfo | grep dimensions | awk '{print $2}'`);
- return `Screen ${stdout.trim()} captured`;
+ async function chatWithVision(messages, screenshotBase64) {
+ const config = getConfig();
+ const systemPrompt = await getSystemPrompt();
+ const visionPrompt = systemPrompt + "\n\nYou can see the user's screen. Describe what you see and help them with their request.";
+ switch (config.provider) {
+ case "openrouter":
+ return chatWithVisionOpenRouter(messages, screenshotBase64, visionPrompt);
+ case "ollama":
+ return chatWithVisionOllama(messages, screenshotBase64, visionPrompt);
+ case "anthropic":
+ return chatWithVisionAnthropic(messages, screenshotBase64, visionPrompt);
+ case "openai":
+ return chatWithVisionOpenAI(messages, screenshotBase64, visionPrompt);
+ default:
+ throw new Error(`Vision not supported for provider: ${config.provider}`);
+ }
+ }
+ async function chatWithVisionOpenRouter(messages, screenshot, systemPrompt) {
+ const apiKey = getApiKey("openrouter");
+ if (!apiKey) throw new Error("OpenRouter API key not configured");
+ const config = getConfig();
+ let model = config.model;
+ if (!model.includes("gpt-5") && !model.includes("claude") && !model.includes("gemini")) {
+ model = "openai/gpt-5-nano";
+ }
+ const lastUserIdx = messages.length - 1;
+ const visionMessages = messages.map((m, i) => {
+ if (i === lastUserIdx && m.role === "user") {
+ return {
+ role: "user",
+ content: [
+ { type: "text", text: m.content },
+ { type: "image_url", image_url: { url: `data:image/png;base64,${screenshot}` } }
+ ]
+ };
  }
- } catch {
- return null;
+ return m;
+ });
+ const response = await fetch("https://openrouter.ai/api/v1/chat/completions", {
+ method: "POST",
+ headers: {
+ "Authorization": `Bearer ${apiKey}`,
+ "Content-Type": "application/json",
+ "HTTP-Referer": config.openrouter.siteUrl,
+ "X-Title": config.openrouter.appName
+ },
+ body: JSON.stringify({
+ model,
+ messages: [{ role: "system", content: systemPrompt }, ...visionMessages],
+ max_tokens: 2048
+ })
+ });
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`OpenRouter vision error: ${response.status} - ${error}`);
  }
+ const data = await response.json();
+ return { content: data.choices?.[0]?.message?.content || "", model };
  }
+ async function chatWithVisionOllama(messages, screenshot, systemPrompt) {
+ const config = getConfig();
+ const visionModels = ["llava", "llama3.2-vision", "bakllava"];
+ const model = visionModels.find((m) => config.model.includes(m)) || "llava";
+ const lastUserMsg = messages.filter((m) => m.role === "user").pop();
+ const response = await fetch(`${config.ollamaHost}/api/generate`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ model,
+ prompt: `${systemPrompt}

- // src/hooks/useChat.ts
- var WELCOME_MESSAGE = {
- id: "0",
- role: "system",
- content: "Welcome to C-napse! Type your message and press Enter.\n\nShortcuts: Ctrl+H for help, Ctrl+P for provider",
- timestamp: /* @__PURE__ */ new Date()
- };
- function useChat(screenWatch = false) {
- const [messages, setMessages] = useState3([WELCOME_MESSAGE]);
- const [isProcessing, setIsProcessing] = useState3(false);
- const [error, setError] = useState3(null);
- const screenContextRef = useRef(null);
- useEffect2(() => {
- if (!screenWatch) {
- screenContextRef.current = null;
- return;
+ User: ${lastUserMsg?.content || "What do you see?"}`,
+ images: [screenshot],
+ stream: false
+ })
+ });
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`Ollama vision error: ${error}`);
+ }
+ const data = await response.json();
+ return { content: data.response || "", model };
+ }
+ async function chatWithVisionAnthropic(messages, screenshot, systemPrompt) {
+ const apiKey = getApiKey("anthropic");
+ if (!apiKey) throw new Error("Anthropic API key not configured");
+ const chatMessages = messages.filter((m) => m.role !== "system");
+ const lastUserIdx = chatMessages.length - 1;
+ const visionMessages = chatMessages.map((m, i) => {
+ if (i === lastUserIdx && m.role === "user") {
+ return {
+ role: "user",
+ content: [
+ { type: "image", source: { type: "base64", media_type: "image/png", data: screenshot } },
+ { type: "text", text: m.content }
+ ]
+ };
  }
- const checkScreen = async () => {
- const desc = await getScreenDescription();
- if (desc) {
- screenContextRef.current = desc;
- }
- };
- checkScreen();
- const interval = setInterval(checkScreen, 5e3);
- return () => clearInterval(interval);
- }, [screenWatch]);
- const addSystemMessage = useCallback((content) => {
- setMessages((prev) => [
- ...prev,
- {
- id: Date.now().toString(),
- role: "system",
- content,
- timestamp: /* @__PURE__ */ new Date()
- }
- ]);
- }, []);
- const sendMessage = useCallback(async (content) => {
- if (!content.trim() || isProcessing) return;
- setError(null);
- const userMsg = {
- id: Date.now().toString(),
- role: "user",
- content,
- timestamp: /* @__PURE__ */ new Date()
- };
- const assistantId = (Date.now() + 1).toString();
- const assistantMsg = {
- id: assistantId,
- role: "assistant",
- content: "",
- timestamp: /* @__PURE__ */ new Date(),
- isStreaming: true
- };
- setMessages((prev) => [...prev, userMsg, assistantMsg]);
- setIsProcessing(true);
- try {
- const apiMessages = messages.filter((m) => m.role === "user" || m.role === "assistant").slice(-10).map((m) => ({ role: m.role, content: m.content }));
- let finalContent = content;
- if (screenWatch && screenContextRef.current) {
- finalContent = `[Screen context: ${screenContextRef.current}]
-
- ${content}`;
- }
- apiMessages.push({ role: "user", content: finalContent });
- const response = await chat(apiMessages);
- setMessages(
- (prev) => prev.map(
- (m) => m.id === assistantId ? { ...m, content: response.content || "(no response)", isStreaming: false } : m
- )
- );
- } catch (err2) {
- const errorMsg = err2 instanceof Error ? err2.message : "Unknown error";
- setError(errorMsg);
- setMessages(
- (prev) => prev.map(
- (m) => m.id === assistantId ? { ...m, content: `Error: ${errorMsg}`, isStreaming: false } : m
- )
- );
- } finally {
- setIsProcessing(false);
+ return { role: m.role, content: m.content };
+ });
+ const response = await fetch("https://api.anthropic.com/v1/messages", {
+ method: "POST",
+ headers: {
+ "x-api-key": apiKey,
+ "anthropic-version": "2023-06-01",
+ "Content-Type": "application/json"
+ },
+ body: JSON.stringify({
+ model: "claude-3-5-sonnet-20241022",
+ max_tokens: 2048,
+ system: systemPrompt,
+ messages: visionMessages
+ })
+ });
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`Anthropic vision error: ${error}`);
+ }
+ const data = await response.json();
+ return { content: data.content?.[0]?.text || "", model: "claude-3-5-sonnet-20241022" };
+ }
+ async function chatWithVisionOpenAI(messages, screenshot, systemPrompt) {
+ const apiKey = getApiKey("openai");
+ if (!apiKey) throw new Error("OpenAI API key not configured");
+ const lastUserIdx = messages.length - 1;
+ const visionMessages = messages.map((m, i) => {
+ if (i === lastUserIdx && m.role === "user") {
+ return {
+ role: "user",
+ content: [
+ { type: "text", text: m.content },
+ { type: "image_url", image_url: { url: `data:image/png;base64,${screenshot}` } }
+ ]
+ };
  }
- }, [messages, isProcessing, screenWatch]);
- const clearMessages = useCallback(() => {
- setMessages([WELCOME_MESSAGE]);
- setError(null);
- }, []);
- return {
- messages,
- isProcessing,
- error,
- sendMessage,
- addSystemMessage,
- clearMessages
- };
+ return m;
+ });
+ const response = await fetch("https://api.openai.com/v1/chat/completions", {
+ method: "POST",
+ headers: {
+ "Authorization": `Bearer ${apiKey}`,
+ "Content-Type": "application/json"
+ },
+ body: JSON.stringify({
+ model: "gpt-4o",
+ messages: [{ role: "system", content: systemPrompt }, ...visionMessages],
+ max_tokens: 2048
+ })
+ });
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`OpenAI vision error: ${error}`);
+ }
+ const data = await response.json();
+ return { content: data.choices?.[0]?.message?.content || "", model: "gpt-4o" };
  }

- // src/hooks/useVision.ts
- import { useState as useState4, useCallback as useCallback2 } from "react";
-
  // src/lib/vision.ts
  async function describeScreen() {
  const screenshot = await captureScreenshot();
@@ -893,17 +922,17 @@ async function captureScreenshot() {
  }
  }
  async function captureScreenFallback() {
- const { exec: exec7 } = await import("child_process");
- const { promisify: promisify7 } = await import("util");
+ const { exec: exec6 } = await import("child_process");
+ const { promisify: promisify6 } = await import("util");
  const { tmpdir } = await import("os");
  const { join: join2 } = await import("path");
  const { readFile, unlink } = await import("fs/promises");
- const execAsync7 = promisify7(exec7);
+ const execAsync6 = promisify6(exec6);
  const tempFile = join2(tmpdir(), `cnapse-screen-${Date.now()}.png`);
  try {
  const platform = process.platform;
  if (platform === "win32") {
- await execAsync7(`
+ await execAsync6(`
  Add-Type -AssemblyName System.Windows.Forms
  $screen = [System.Windows.Forms.Screen]::PrimaryScreen.Bounds
  $bitmap = New-Object System.Drawing.Bitmap($screen.Width, $screen.Height)
@@ -914,9 +943,9 @@ async function captureScreenFallback() {
  $bitmap.Dispose()
  `, { shell: "powershell.exe" });
  } else if (platform === "darwin") {
- await execAsync7(`screencapture -x "${tempFile}"`);
+ await execAsync6(`screencapture -x "${tempFile}"`);
  } else {
- await execAsync7(`gnome-screenshot -f "${tempFile}" 2>/dev/null || scrot "${tempFile}" 2>/dev/null || import -window root "${tempFile}"`);
+ await execAsync6(`gnome-screenshot -f "${tempFile}" 2>/dev/null || scrot "${tempFile}" 2>/dev/null || import -window root "${tempFile}"`);
  }
  const imageBuffer = await readFile(tempFile);
  await unlink(tempFile).catch(() => {
@@ -1077,7 +1106,98 @@ async function analyzeWithOpenAI(base64Image, prompt) {
  return data.choices?.[0]?.message?.content || "Unable to analyze image";
  }

+ // src/hooks/useChat.ts
+ var WELCOME_MESSAGE = {
+ id: "0",
+ role: "system",
+ content: "Welcome to C-napse! Type your message and press Enter.\n\nShortcuts: Ctrl+H for help, Ctrl+P for provider",
+ timestamp: /* @__PURE__ */ new Date()
+ };
+ function useChat(screenWatch = false) {
+ const [messages, setMessages] = useState3([WELCOME_MESSAGE]);
+ const [isProcessing, setIsProcessing] = useState3(false);
+ const [error, setError] = useState3(null);
+ const screenWatchRef = useRef(screenWatch);
+ useEffect2(() => {
+ screenWatchRef.current = screenWatch;
+ }, [screenWatch]);
+ const addSystemMessage = useCallback((content) => {
+ setMessages((prev) => [
+ ...prev,
+ {
+ id: Date.now().toString(),
+ role: "system",
+ content,
+ timestamp: /* @__PURE__ */ new Date()
+ }
+ ]);
+ }, []);
+ const sendMessage = useCallback(async (content) => {
+ if (!content.trim() || isProcessing) return;
+ setError(null);
+ const userMsg = {
+ id: Date.now().toString(),
+ role: "user",
+ content,
+ timestamp: /* @__PURE__ */ new Date()
+ };
+ const assistantId = (Date.now() + 1).toString();
+ const assistantMsg = {
+ id: assistantId,
+ role: "assistant",
+ content: "",
+ timestamp: /* @__PURE__ */ new Date(),
+ isStreaming: true
+ };
+ setMessages((prev) => [...prev, userMsg, assistantMsg]);
+ setIsProcessing(true);
+ try {
+ const apiMessages = messages.filter((m) => m.role === "user" || m.role === "assistant").slice(-10).map((m) => ({ role: m.role, content: m.content }));
+ apiMessages.push({ role: "user", content });
+ let response;
+ if (screenWatchRef.current) {
+ const screenshot = await captureScreenshot();
+ if (screenshot) {
+ response = await chatWithVision(apiMessages, screenshot);
+ } else {
+ response = await chat(apiMessages);
+ }
+ } else {
+ response = await chat(apiMessages);
+ }
+ setMessages(
+ (prev) => prev.map(
+ (m) => m.id === assistantId ? { ...m, content: response.content || "(no response)", isStreaming: false } : m
+ )
+ );
+ } catch (err2) {
+ const errorMsg = err2 instanceof Error ? err2.message : "Unknown error";
+ setError(errorMsg);
+ setMessages(
+ (prev) => prev.map(
+ (m) => m.id === assistantId ? { ...m, content: `Error: ${errorMsg}`, isStreaming: false } : m
+ )
+ );
+ } finally {
+ setIsProcessing(false);
+ }
+ }, [messages, isProcessing]);
+ const clearMessages = useCallback(() => {
+ setMessages([WELCOME_MESSAGE]);
+ setError(null);
+ }, []);
+ return {
+ messages,
+ isProcessing,
+ error,
+ sendMessage,
+ addSystemMessage,
+ clearMessages
+ };
+ }
+
  // src/hooks/useVision.ts
+ import { useState as useState4, useCallback as useCallback2 } from "react";
  function useVision() {
  const [isAnalyzing, setIsAnalyzing] = useState4(false);
  const [lastDescription, setLastDescription] = useState4(null);
@@ -1115,21 +1235,21 @@ import { useState as useState5, useCallback as useCallback3, useEffect as useEff
  import { EventEmitter } from "events";

  // src/tools/shell.ts
- import { exec as exec6 } from "child_process";
- import { promisify as promisify6 } from "util";
+ import { exec as exec5 } from "child_process";
+ import { promisify as promisify5 } from "util";

  // src/tools/clipboard.ts
  import clipboardy from "clipboardy";

  // src/tools/process.ts
+ import { exec as exec3 } from "child_process";
+ import { promisify as promisify3 } from "util";
+ var execAsync3 = promisify3(exec3);
+
+ // src/tools/computer.ts
  import { exec as exec4 } from "child_process";
  import { promisify as promisify4 } from "util";
  var execAsync4 = promisify4(exec4);
-
- // src/tools/computer.ts
- import { exec as exec5 } from "child_process";
- import { promisify as promisify5 } from "util";
- var execAsync5 = promisify5(exec5);
  async function clickMouse(button = "left") {
  try {
  if (process.platform === "win32") {
@@ -1139,12 +1259,12 @@ Add-Type -MemberDefinition @"
  public static extern void mouse_event(long dwFlags, long dx, long dy, long cButtons, long dwExtraInfo);
  "@ -Name Mouse -Namespace Win32
  ${button === "left" ? "[Win32.Mouse]::mouse_event(0x02, 0, 0, 0, 0); [Win32.Mouse]::mouse_event(0x04, 0, 0, 0, 0)" : button === "right" ? "[Win32.Mouse]::mouse_event(0x08, 0, 0, 0, 0); [Win32.Mouse]::mouse_event(0x10, 0, 0, 0, 0)" : "[Win32.Mouse]::mouse_event(0x20, 0, 0, 0, 0); [Win32.Mouse]::mouse_event(0x40, 0, 0, 0, 0)"}`;
- await execAsync5(`powershell -Command "${script.replace(/\n/g, " ")}"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "${script.replace(/\n/g, " ")}"`, { shell: "cmd.exe" });
  } else if (process.platform === "darwin") {
- await execAsync5(`cliclick c:.`);
+ await execAsync4(`cliclick c:.`);
  } else {
  const btn = button === "left" ? "1" : button === "right" ? "3" : "2";
- await execAsync5(`xdotool click ${btn}`);
+ await execAsync4(`xdotool click ${btn}`);
  }
  return ok(`Clicked ${button} button`);
  } catch (error) {
@@ -1155,13 +1275,13 @@ async function typeText(text) {
  try {
  if (process.platform === "win32") {
  const escapedText = text.replace(/'/g, "''").replace(/[+^%~(){}[\]]/g, "{$&}");
- await execAsync5(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${escapedText}')"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${escapedText}')"`, { shell: "cmd.exe" });
  } else if (process.platform === "darwin") {
  const escaped = text.replace(/'/g, "'\\''");
- await execAsync5(`osascript -e 'tell application "System Events" to keystroke "${escaped}"'`);
+ await execAsync4(`osascript -e 'tell application "System Events" to keystroke "${escaped}"'`);
  } else {
  const escaped = text.replace(/'/g, "'\\''");
- await execAsync5(`xdotool type '${escaped}'`);
+ await execAsync4(`xdotool type '${escaped}'`);
  }
  return ok(`Typed: ${text}`);
  } catch (error) {
@@ -1202,7 +1322,7 @@ async function pressKey(key) {
  "f12": "{F12}"
  };
  const winKey = winKeyMap[key.toLowerCase()] || key;
- await execAsync5(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${winKey}')"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${winKey}')"`, { shell: "cmd.exe" });
  } else if (process.platform === "darwin") {
  const macKeyMap = {
  "return": 36,
@@ -1220,12 +1340,12 @@ async function pressKey(key) {
  };
  const keyCode = macKeyMap[key.toLowerCase()];
  if (keyCode) {
- await execAsync5(`osascript -e 'tell application "System Events" to key code ${keyCode}'`);
+ await execAsync4(`osascript -e 'tell application "System Events" to key code ${keyCode}'`);
  } else {
- await execAsync5(`osascript -e 'tell application "System Events" to keystroke "${key}"'`);
+ await execAsync4(`osascript -e 'tell application "System Events" to keystroke "${key}"'`);
  }
  } else {
- await execAsync5(`xdotool key ${key}`);
+ await execAsync4(`xdotool key ${key}`);
  }
  return ok(`Pressed: ${key}`);
  } catch (error) {
@@ -1238,7 +1358,7 @@ async function keyCombo(keys) {
  const hasWin = keys.some((k) => k.toLowerCase() === "meta" || k.toLowerCase() === "win");
  const hasR = keys.some((k) => k.toLowerCase() === "r");
  if (hasWin && hasR) {
- await execAsync5(`powershell -Command "$shell = New-Object -ComObject WScript.Shell; $shell.Run('explorer shell:::{2559a1f3-21d7-11d4-bdaf-00c04f60b9f0}')"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "$shell = New-Object -ComObject WScript.Shell; $shell.Run('explorer shell:::{2559a1f3-21d7-11d4-bdaf-00c04f60b9f0}')"`, { shell: "cmd.exe" });
  return ok(`Pressed: ${keys.join("+")}`);
  }
  const modifierMap = {
@@ -1258,7 +1378,7 @@ async function keyCombo(keys) {
  }
  }
  combo += regularKeys.join("");
- await execAsync5(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${combo}')"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${combo}')"`, { shell: "cmd.exe" });
  } else if (process.platform === "darwin") {
  const modifiers = keys.filter((k) => ["control", "ctrl", "alt", "shift", "command", "meta"].includes(k.toLowerCase()));
  const regular = keys.filter((k) => !["control", "ctrl", "alt", "shift", "command", "meta"].includes(k.toLowerCase()));
@@ -1274,9 +1394,9 @@ async function keyCombo(keys) {
  };
  cmd += " using {" + modifiers.map((m) => modMap[m.toLowerCase()]).join(", ") + "}";
  }
- await execAsync5(`osascript -e '${cmd}'`);
+ await execAsync4(`osascript -e '${cmd}'`);
  } else {
- await execAsync5(`xdotool key ${keys.join("+")}`);
+ await execAsync4(`xdotool key ${keys.join("+")}`);
  }
  return ok(`Pressed: ${keys.join("+")}`);
  } catch (error) {
@@ -1287,11 +1407,11 @@ async function focusWindow(title) {
  try {
  if (process.platform === "win32") {
  const escaped = title.replace(/'/g, "''");
- await execAsync5(`powershell -Command "$wshell = New-Object -ComObject wscript.shell; $wshell.AppActivate('${escaped}')"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "$wshell = New-Object -ComObject wscript.shell; $wshell.AppActivate('${escaped}')"`, { shell: "cmd.exe" });
  } else if (process.platform === "darwin") {
- await execAsync5(`osascript -e 'tell application "${title}" to activate'`);
+ await execAsync4(`osascript -e 'tell application "${title}" to activate'`);
  } else {
- await execAsync5(`wmctrl -a "${title}"`);
+ await execAsync4(`wmctrl -a "${title}"`);
  }
  return ok(`Focused window: ${title}`);
  } catch (error) {
@@ -1308,13 +1428,13 @@ function err(error) {
  }

  // src/tools/shell.ts
- var execAsync6 = promisify6(exec6);
+ var execAsync5 = promisify5(exec5);
  async function runCommand(cmd, timeout = 3e4) {
  try {
  const isWindows = process.platform === "win32";
  const shell = isWindows ? "cmd.exe" : "/bin/sh";
  const shellArg = isWindows ? "/C" : "-c";
- const { stdout, stderr } = await execAsync6(cmd, {
+ const { stdout, stderr } = await execAsync5(cmd, {
  shell,
  timeout,
  maxBuffer: 10 * 1024 * 1024
@@ -2232,16 +2352,19 @@ async function main() {
  const key = args[2];
  if (!provider || !key) {
  console.log("Usage: cnapse auth <provider> <api-key>");
- console.log("Providers: openrouter, anthropic, openai");
+ console.log("Providers: openrouter, anthropic, openai, telegram");
  process.exit(1);
  }
- if (!["openrouter", "anthropic", "openai"].includes(provider)) {
+ if (!["openrouter", "anthropic", "openai", "telegram"].includes(provider)) {
  console.log(`Invalid provider: ${provider}`);
- console.log("Valid providers: openrouter, anthropic, openai");
+ console.log("Valid providers: openrouter, anthropic, openai, telegram");
  process.exit(1);
  }
  setApiKey(provider, key);
  console.log(`\u2713 ${provider} API key saved`);
+ if (provider === "telegram") {
+ console.log("Start the bot with: cnapse, then /telegram or Ctrl+T");
+ }
  process.exit(0);
  }
  case "config": {
@@ -2288,32 +2411,58 @@ async function main() {
  case "help":
  case "--help":
  case "-h": {
+ const cyan = "\x1B[36m";
+ const green = "\x1B[32m";
+ const yellow = "\x1B[33m";
+ const magenta = "\x1B[35m";
+ const bold = "\x1B[1m";
+ const dim = "\x1B[2m";
+ const reset = "\x1B[0m";
  console.log(`
- C-napse - Autonomous PC Intelligence
-
- Usage:
- cnapse Start interactive chat
- cnapse init Interactive setup wizard
- cnapse config Interactive configuration
- cnapse config show Show current configuration
- cnapse config set <k> <v> Set config value
- cnapse auth <provider> <key> Set API key
- cnapse help Show this help
-
- Providers:
- ollama - Local AI (default, free)
- openrouter - OpenRouter API (many models)
- anthropic - Anthropic Claude
- openai - OpenAI GPT
-
- Quick Start:
- cnapse init # Interactive setup
- cnapse config # Change provider/model
-
- Manual Setup:
- cnapse auth openrouter sk-or-xxxxx
- cnapse config set provider openrouter
- cnapse config set model qwen/qwen-2.5-coder-32b-instruct
+ ${cyan}${bold}\u2554\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2557
+ \u2551 \u2551
+ \u2551 ${magenta}\u2588\u2588\u2588\u2588\u2588\u2588\u2557 ${cyan}\u2588\u2588\u2588\u2557 \u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557${reset}${cyan}${bold} \u2551
+ \u2551 ${magenta}\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255D ${cyan}\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255D\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255D${reset}${cyan}${bold} \u2551
+ \u2551 ${magenta}\u2588\u2588\u2551 \u2588\u2588\u2588\u2588\u2588\u2557${cyan}\u2588\u2588\u2554\u2588\u2588\u2557 \u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255D\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2557${reset}${cyan}${bold} \u2551
+ \u2551 ${magenta}\u2588\u2588\u2551 \u255A\u2550\u2550\u2550\u2550\u255D${cyan}\u2588\u2588\u2551\u255A\u2588\u2588\u2557\u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2550\u2550\u255D \u255A\u2550\u2550\u2550\u2550\u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2550\u255D${reset}${cyan}${bold} \u2551
+ \u2551 ${magenta}\u255A\u2588\u2588\u2588\u2588\u2588\u2588\u2557 ${cyan}\u2588\u2588\u2551 \u255A\u2588\u2588\u2588\u2588\u2551\u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2551 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557${reset}${cyan}${bold} \u2551
+ \u2551 ${magenta} \u255A\u2550\u2550\u2550\u2550\u2550\u255D ${cyan}\u255A\u2550\u255D \u255A\u2550\u2550\u2550\u255D\u255A\u2550\u255D \u255A\u2550\u255D\u255A\u2550\u255D \u255A\u2550\u2550\u2550\u2550\u2550\u2550\u255D\u255A\u2550\u2550\u2550\u2550\u2550\u2550\u255D${reset}${cyan}${bold} \u2551
+ \u2551 \u2551
+ \u2551 ${reset}${dim}Autonomous PC Intelligence${reset}${cyan}${bold} \u2551
+ \u255A\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u255D${reset}
+
+ ${yellow}${bold}USAGE${reset}
+ ${green}cnapse${reset} Start interactive chat
+ ${green}cnapse init${reset} Interactive setup wizard
+ ${green}cnapse config${reset} Interactive configuration
+ ${green}cnapse config show${reset} Show current configuration
+ ${green}cnapse auth <provider> <key>${reset} Set API key
+ ${green}cnapse help${reset} Show this help
+
+ ${yellow}${bold}PROVIDERS${reset}
+ ${cyan}ollama${reset} Local AI ${dim}(default, free, private)${reset}
+ ${cyan}openrouter${reset} OpenRouter API ${dim}(many models, pay-per-use)${reset}
+ ${cyan}anthropic${reset} Anthropic Claude ${dim}(best reasoning)${reset}
+ ${cyan}openai${reset} OpenAI GPT ${dim}(reliable)${reset}
+ ${cyan}telegram${reset} Telegram bot token ${dim}(remote control)${reset}
+
+ ${yellow}${bold}QUICK START${reset}
+ ${dim}# Interactive setup - easiest way${reset}
+ ${green}cnapse init${reset}
+
+ ${dim}# Manual setup with OpenRouter${reset}
+ ${green}cnapse auth openrouter sk-or-v1-xxxxx${reset}
+ ${green}cnapse config set provider openrouter${reset}
+
+ ${dim}# Add Telegram for remote control${reset}
+ ${green}cnapse auth telegram YOUR_BOT_TOKEN${reset}
+
+ ${yellow}${bold}IN-APP SHORTCUTS${reset}
+ ${cyan}Ctrl+H${reset} Help menu ${cyan}Ctrl+P${reset} Change provider
+ ${cyan}Ctrl+E${reset} Screen watch ${cyan}Ctrl+T${reset} Toggle Telegram
+ ${cyan}Ctrl+L${reset} Clear chat ${cyan}Ctrl+C${reset} Exit
+
+ ${dim}GitHub: https://github.com/projectservan8n/C-napse${reset}
  `);
  process.exit(0);
  }
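A note on the two kinds of changes in the bundle above. The many execAsync5 -> execAsync4 style renames are mechanical: deleting src/lib/screen.ts freed the exec3/promisify3 slot, so the bundler renumbered every later tool module (process.ts now takes exec3, computer.ts exec4, shell.ts exec5, the screenshot fallback exec6). The substantive change is the new vision path. A minimal sketch of how it fits together, assuming only the exports visible in this diff (chatWithVision from src/lib/api.ts, captureScreenshot from src/lib/vision.ts); this is illustrative, not code from the package:

    import { chat, chatWithVision, Message } from './lib/api.js';
    import { captureScreenshot } from './lib/vision.js';

    // Ask a question about the current screen. Falls back to plain chat when
    // the screenshot cannot be captured, mirroring the useChat logic above.
    async function askAboutScreen(question: string) {
      const messages: Message[] = [{ role: 'user', content: question }];
      const screenshot = await captureScreenshot(); // base64 PNG, falsy on failure
      return screenshot ? chatWithVision(messages, screenshot) : chat(messages);
    }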
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@projectservan8n/cnapse",
- "version": "0.5.6",
+ "version": "0.5.8",
  "description": "Autonomous PC intelligence - AI assistant for desktop automation",
  "type": "module",
  "main": "dist/index.js",
package/src/hooks/useChat.ts CHANGED
@@ -3,8 +3,8 @@
  */

  import { useState, useCallback, useRef, useEffect } from 'react';
- import { chat, Message } from '../lib/api.js';
- import { getScreenDescription } from '../lib/screen.js';
+ import { chat, chatWithVision, Message } from '../lib/api.js';
+ import { captureScreenshot } from '../lib/vision.js';

  export interface ChatMessage {
  id: string;
@@ -34,25 +34,11 @@ export function useChat(screenWatch: boolean = false): UseChatResult {
  const [messages, setMessages] = useState<ChatMessage[]>([WELCOME_MESSAGE]);
  const [isProcessing, setIsProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
- const screenContextRef = useRef<string | null>(null);
+ const screenWatchRef = useRef(screenWatch);

- // Screen watching effect
+ // Keep ref in sync with prop
  useEffect(() => {
- if (!screenWatch) {
- screenContextRef.current = null;
- return;
- }
-
- const checkScreen = async () => {
- const desc = await getScreenDescription();
- if (desc) {
- screenContextRef.current = desc;
- }
- };
-
- checkScreen();
- const interval = setInterval(checkScreen, 5000);
- return () => clearInterval(interval);
+ screenWatchRef.current = screenWatch;
  }, [screenWatch]);

  const addSystemMessage = useCallback((content: string) => {
@@ -100,16 +86,23 @@ export function useChat(screenWatch: boolean = false): UseChatResult {
  .slice(-10)
  .map(m => ({ role: m.role as 'user' | 'assistant', content: m.content }));

- // Add screen context if watching
- let finalContent = content;
- if (screenWatch && screenContextRef.current) {
- finalContent = `[Screen context: ${screenContextRef.current}]\n\n${content}`;
+ apiMessages.push({ role: 'user', content });
+
+ let response;
+
+ // If screen watching is enabled, capture screenshot and use vision API
+ if (screenWatchRef.current) {
+ const screenshot = await captureScreenshot();
+ if (screenshot) {
+ response = await chatWithVision(apiMessages, screenshot);
+ } else {
+ // Fallback to regular chat if screenshot fails
+ response = await chat(apiMessages);
+ }
+ } else {
+ response = await chat(apiMessages);
  }

- apiMessages.push({ role: 'user', content: finalContent });
-
- const response = await chat(apiMessages);
-
  // Update assistant message
  setMessages(prev =>
  prev.map(m =>
@@ -131,7 +124,7 @@ export function useChat(screenWatch: boolean = false): UseChatResult {
  } finally {
  setIsProcessing(false);
  }
- }, [messages, isProcessing, screenWatch]);
+ }, [messages, isProcessing]);

  const clearMessages = useCallback(() => {
  setMessages([WELCOME_MESSAGE]);
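Since the hook now owns the capture decision, a consumer only passes the screenWatch flag. A hedged sketch of a consumer component (the Ink rendering layer is an assumption here, not confirmed by this diff; only useChat's signature and return shape come from the code above):

    import React from 'react';
    import { Text } from 'ink'; // assumed TUI layer
    import { useChat } from './hooks/useChat.js';

    // Hypothetical component; input wiring is omitted. In practice a
    // text-input handler would call sendMessage(line), which captures a
    // screenshot and routes through chatWithVision when screenWatch is true.
    function ChatPane({ screenWatch }: { screenWatch: boolean }) {
      const { messages, isProcessing, error, sendMessage } = useChat(screenWatch);
      return (
        <>
          {messages.map(m => (
            <Text key={m.id}>{m.role}: {m.content}</Text>
          ))}
          {isProcessing && <Text dimColor>thinking...</Text>}
          {error && <Text color="red">Error: {error}</Text>}
        </>
      );
    }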
package/src/index.tsx CHANGED
@@ -13,23 +13,26 @@ async function main() {

  switch (command) {
  case 'auth': {
- const provider = args[1] as 'openrouter' | 'anthropic' | 'openai';
+ const provider = args[1] as 'openrouter' | 'anthropic' | 'openai' | 'telegram';
  const key = args[2];

  if (!provider || !key) {
  console.log('Usage: cnapse auth <provider> <api-key>');
- console.log('Providers: openrouter, anthropic, openai');
+ console.log('Providers: openrouter, anthropic, openai, telegram');
  process.exit(1);
  }

- if (!['openrouter', 'anthropic', 'openai'].includes(provider)) {
+ if (!['openrouter', 'anthropic', 'openai', 'telegram'].includes(provider)) {
  console.log(`Invalid provider: ${provider}`);
- console.log('Valid providers: openrouter, anthropic, openai');
+ console.log('Valid providers: openrouter, anthropic, openai, telegram');
  process.exit(1);
  }

  setApiKey(provider, key);
  console.log(`✓ ${provider} API key saved`);
+ if (provider === 'telegram') {
+ console.log('Start the bot with: cnapse, then /telegram or Ctrl+T');
+ }
  process.exit(0);
  }

@@ -84,32 +87,60 @@ async function main() {
  case 'help':
  case '--help':
  case '-h': {
+ // Colorful help using ANSI escape codes
+ const cyan = '\x1b[36m';
+ const green = '\x1b[32m';
+ const yellow = '\x1b[33m';
+ const magenta = '\x1b[35m';
+ const bold = '\x1b[1m';
+ const dim = '\x1b[2m';
+ const reset = '\x1b[0m';
+
  console.log(`
- C-napse - Autonomous PC Intelligence
-
- Usage:
- cnapse Start interactive chat
- cnapse init Interactive setup wizard
- cnapse config Interactive configuration
- cnapse config show Show current configuration
- cnapse config set <k> <v> Set config value
- cnapse auth <provider> <key> Set API key
- cnapse help Show this help
-
- Providers:
- ollama - Local AI (default, free)
- openrouter - OpenRouter API (many models)
- anthropic - Anthropic Claude
- openai - OpenAI GPT
-
- Quick Start:
- cnapse init # Interactive setup
- cnapse config # Change provider/model
-
- Manual Setup:
- cnapse auth openrouter sk-or-xxxxx
- cnapse config set provider openrouter
- cnapse config set model qwen/qwen-2.5-coder-32b-instruct
+ ${cyan}${bold}╔═══════════════════════════════════════════════════════════╗
+ ║ ║
+ ║ ${magenta}██████╗ ${cyan}███╗ ██╗ █████╗ ██████╗ ███████╗███████╗${reset}${cyan}${bold} ║
+ ║ ${magenta}██╔════╝ ${cyan}████╗ ██║██╔══██╗██╔══██╗██╔════╝██╔════╝${reset}${cyan}${bold} ║
+ ║ ${magenta}██║ █████╗${cyan}██╔██╗ ██║███████║██████╔╝███████╗█████╗${reset}${cyan}${bold} ║
+ ║ ${magenta}██║ ╚════╝${cyan}██║╚██╗██║██╔══██║██╔═══╝ ╚════██║██╔══╝${reset}${cyan}${bold} ║
+ ║ ${magenta}╚██████╗ ${cyan}██║ ╚████║██║ ██║██║ ███████║███████╗${reset}${cyan}${bold} ║
+ ║ ${magenta} ╚═════╝ ${cyan}╚═╝ ╚═══╝╚═╝ ╚═╝╚═╝ ╚══════╝╚══════╝${reset}${cyan}${bold} ║
+ ║ ║
+ ║ ${reset}${dim}Autonomous PC Intelligence${reset}${cyan}${bold} ║
+ ╚═══════════════════════════════════════════════════════════╝${reset}
+
+ ${yellow}${bold}USAGE${reset}
+ ${green}cnapse${reset} Start interactive chat
+ ${green}cnapse init${reset} Interactive setup wizard
+ ${green}cnapse config${reset} Interactive configuration
+ ${green}cnapse config show${reset} Show current configuration
+ ${green}cnapse auth <provider> <key>${reset} Set API key
+ ${green}cnapse help${reset} Show this help
+
+ ${yellow}${bold}PROVIDERS${reset}
+ ${cyan}ollama${reset} Local AI ${dim}(default, free, private)${reset}
+ ${cyan}openrouter${reset} OpenRouter API ${dim}(many models, pay-per-use)${reset}
+ ${cyan}anthropic${reset} Anthropic Claude ${dim}(best reasoning)${reset}
+ ${cyan}openai${reset} OpenAI GPT ${dim}(reliable)${reset}
+ ${cyan}telegram${reset} Telegram bot token ${dim}(remote control)${reset}
+
+ ${yellow}${bold}QUICK START${reset}
+ ${dim}# Interactive setup - easiest way${reset}
+ ${green}cnapse init${reset}
+
+ ${dim}# Manual setup with OpenRouter${reset}
+ ${green}cnapse auth openrouter sk-or-v1-xxxxx${reset}
+ ${green}cnapse config set provider openrouter${reset}
+
+ ${dim}# Add Telegram for remote control${reset}
+ ${green}cnapse auth telegram YOUR_BOT_TOKEN${reset}
+
+ ${yellow}${bold}IN-APP SHORTCUTS${reset}
+ ${cyan}Ctrl+H${reset} Help menu ${cyan}Ctrl+P${reset} Change provider
+ ${cyan}Ctrl+E${reset} Screen watch ${cyan}Ctrl+T${reset} Toggle Telegram
+ ${cyan}Ctrl+L${reset} Clear chat ${cyan}Ctrl+C${reset} Exit
+
+ ${dim}GitHub: https://github.com/projectservan8n/C-napse${reset}
  `);
  process.exit(0);
  }
package/src/lib/api.ts CHANGED
@@ -184,6 +184,194 @@ async function chatOpenAI(messages: Message[], model: string): Promise<ChatRespo
  return { content, model };
  }

+ /**
+ * Chat with vision - sends screenshot along with messages
+ */
+ export async function chatWithVision(messages: Message[], screenshotBase64: string): Promise<ChatResponse> {
+ const config = getConfig();
+ const systemPrompt = await getSystemPrompt();
+
+ // Add vision context to system prompt
+ const visionPrompt = systemPrompt + '\n\nYou can see the user\'s screen. Describe what you see and help them with their request.';
+
+ switch (config.provider) {
+ case 'openrouter':
+ return chatWithVisionOpenRouter(messages, screenshotBase64, visionPrompt);
+ case 'ollama':
+ return chatWithVisionOllama(messages, screenshotBase64, visionPrompt);
+ case 'anthropic':
+ return chatWithVisionAnthropic(messages, screenshotBase64, visionPrompt);
+ case 'openai':
+ return chatWithVisionOpenAI(messages, screenshotBase64, visionPrompt);
+ default:
+ throw new Error(`Vision not supported for provider: ${config.provider}`);
+ }
+ }
+
+ async function chatWithVisionOpenRouter(messages: Message[], screenshot: string, systemPrompt: string): Promise<ChatResponse> {
+ const apiKey = getApiKey('openrouter');
+ if (!apiKey) throw new Error('OpenRouter API key not configured');
+
+ const config = getConfig();
+
+ // Use vision-capable model - prefer GPT-5 Nano or Claude
+ let model = config.model;
+ if (!model.includes('gpt-5') && !model.includes('claude') && !model.includes('gemini')) {
+ model = 'openai/gpt-5-nano'; // Default to GPT-5 Nano for vision
+ }
+
+ // Build messages with image in the last user message
+ const lastUserIdx = messages.length - 1;
+ const visionMessages = messages.map((m, i) => {
+ if (i === lastUserIdx && m.role === 'user') {
+ return {
+ role: 'user',
+ content: [
+ { type: 'text', text: m.content },
+ { type: 'image_url', image_url: { url: `data:image/png;base64,${screenshot}` } },
+ ],
+ };
+ }
+ return m;
+ });
+
+ const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
+ method: 'POST',
+ headers: {
+ 'Authorization': `Bearer ${apiKey}`,
+ 'Content-Type': 'application/json',
+ 'HTTP-Referer': config.openrouter.siteUrl,
+ 'X-Title': config.openrouter.appName,
+ },
+ body: JSON.stringify({
+ model,
+ messages: [{ role: 'system', content: systemPrompt }, ...visionMessages],
+ max_tokens: 2048,
+ }),
+ });
+
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`OpenRouter vision error: ${response.status} - ${error}`);
+ }
+
+ const data = await response.json() as any;
+ return { content: data.choices?.[0]?.message?.content || '', model };
+ }
+
+ async function chatWithVisionOllama(messages: Message[], screenshot: string, systemPrompt: string): Promise<ChatResponse> {
+ const config = getConfig();
+
+ // Use vision model
+ const visionModels = ['llava', 'llama3.2-vision', 'bakllava'];
+ const model = visionModels.find(m => config.model.includes(m)) || 'llava';
+
+ const lastUserMsg = messages.filter(m => m.role === 'user').pop();
+
+ const response = await fetch(`${config.ollamaHost}/api/generate`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ model,
+ prompt: `${systemPrompt}\n\nUser: ${lastUserMsg?.content || 'What do you see?'}`,
+ images: [screenshot],
+ stream: false,
+ }),
+ });
+
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`Ollama vision error: ${error}`);
+ }
+
+ const data = await response.json() as any;
+ return { content: data.response || '', model };
+ }
+
+ async function chatWithVisionAnthropic(messages: Message[], screenshot: string, systemPrompt: string): Promise<ChatResponse> {
+ const apiKey = getApiKey('anthropic');
+ if (!apiKey) throw new Error('Anthropic API key not configured');
+
+ const chatMessages = messages.filter(m => m.role !== 'system');
+ const lastUserIdx = chatMessages.length - 1;
+
+ const visionMessages = chatMessages.map((m, i) => {
+ if (i === lastUserIdx && m.role === 'user') {
+ return {
+ role: 'user',
+ content: [
+ { type: 'image', source: { type: 'base64', media_type: 'image/png', data: screenshot } },
+ { type: 'text', text: m.content },
+ ],
+ };
+ }
+ return { role: m.role, content: m.content };
+ });
+
+ const response = await fetch('https://api.anthropic.com/v1/messages', {
+ method: 'POST',
+ headers: {
+ 'x-api-key': apiKey,
+ 'anthropic-version': '2023-06-01',
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ model: 'claude-3-5-sonnet-20241022',
+ max_tokens: 2048,
+ system: systemPrompt,
+ messages: visionMessages,
+ }),
+ });
+
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`Anthropic vision error: ${error}`);
+ }
+
+ const data = await response.json() as any;
+ return { content: data.content?.[0]?.text || '', model: 'claude-3-5-sonnet-20241022' };
+ }
+
+ async function chatWithVisionOpenAI(messages: Message[], screenshot: string, systemPrompt: string): Promise<ChatResponse> {
+ const apiKey = getApiKey('openai');
+ if (!apiKey) throw new Error('OpenAI API key not configured');
+
+ const lastUserIdx = messages.length - 1;
+ const visionMessages = messages.map((m, i) => {
+ if (i === lastUserIdx && m.role === 'user') {
+ return {
+ role: 'user',
+ content: [
+ { type: 'text', text: m.content },
+ { type: 'image_url', image_url: { url: `data:image/png;base64,${screenshot}` } },
+ ],
+ };
+ }
+ return m;
+ });
+
+ const response = await fetch('https://api.openai.com/v1/chat/completions', {
+ method: 'POST',
+ headers: {
+ 'Authorization': `Bearer ${apiKey}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ model: 'gpt-4o',
+ messages: [{ role: 'system', content: systemPrompt }, ...visionMessages],
+ max_tokens: 2048,
+ }),
+ });
+
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`OpenAI vision error: ${error}`);
+ }
+
+ const data = await response.json() as any;
+ return { content: data.choices?.[0]?.message?.content || '', model: 'gpt-4o' };
+ }
+
  export async function testConnection(): Promise<boolean> {
  try {
  await chat([{ role: 'user', content: 'hi' }]);
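One detail worth noting in the provider branches above: chatWithVision() expects the screenshot as raw base64, with no data: URL prefix. Each branch adds its own wrapper (a data:image/png;base64 URL for OpenRouter and OpenAI, a base64 source block for Anthropic, a bare entry in Ollama's images array). A sketch of a direct call, with a hypothetical input path:

    import { readFile } from 'fs/promises';
    import { chatWithVision } from './lib/api.js';

    const png = await readFile('/tmp/screen.png'); // hypothetical PNG on disk
    const reply = await chatWithVision(
      [{ role: 'user', content: 'Describe this screen.' }],
      png.toString('base64'), // raw base64; the provider branch adds the wrapper
    );
    console.log(`[${reply.model}] ${reply.content}`);

Top-level await works in this sketch because package.json declares "type": "module".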