@projectservan8n/cnapse 0.5.6 → 0.5.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -745,134 +745,163 @@ async function chatOpenAI(messages, model) {
  const content = data.choices?.[0]?.message?.content || "";
  return { content, model };
  }
-
- // src/lib/screen.ts
- import { exec as exec3 } from "child_process";
- import { promisify as promisify3 } from "util";
- var execAsync3 = promisify3(exec3);
- async function getScreenDescription() {
- try {
- const platform = process.platform;
- if (platform === "win32") {
- const { stdout } = await execAsync3(`
- Add-Type -AssemblyName System.Windows.Forms
- $screen = [System.Windows.Forms.Screen]::PrimaryScreen.Bounds
- Write-Output "$($screen.Width)x$($screen.Height)"
- `, { shell: "powershell.exe" });
- return `Screen ${stdout.trim()} captured`;
- } else if (platform === "darwin") {
- const { stdout } = await execAsync3(`system_profiler SPDisplaysDataType | grep Resolution | head -1`);
- return `Screen ${stdout.trim()}`;
- } else {
- const { stdout } = await execAsync3(`xdpyinfo | grep dimensions | awk '{print $2}'`);
- return `Screen ${stdout.trim()} captured`;
+ async function chatWithVision(messages, screenshotBase64) {
+ const config = getConfig();
+ const systemPrompt = await getSystemPrompt();
+ const visionPrompt = systemPrompt + "\n\nYou can see the user's screen. Describe what you see and help them with their request.";
+ switch (config.provider) {
+ case "openrouter":
+ return chatWithVisionOpenRouter(messages, screenshotBase64, visionPrompt);
+ case "ollama":
+ return chatWithVisionOllama(messages, screenshotBase64, visionPrompt);
+ case "anthropic":
+ return chatWithVisionAnthropic(messages, screenshotBase64, visionPrompt);
+ case "openai":
+ return chatWithVisionOpenAI(messages, screenshotBase64, visionPrompt);
+ default:
+ throw new Error(`Vision not supported for provider: ${config.provider}`);
+ }
+ }
+ async function chatWithVisionOpenRouter(messages, screenshot, systemPrompt) {
+ const apiKey = getApiKey("openrouter");
+ if (!apiKey) throw new Error("OpenRouter API key not configured");
+ const config = getConfig();
+ let model = config.model;
+ if (!model.includes("gpt-5") && !model.includes("claude") && !model.includes("gemini")) {
+ model = "openai/gpt-5-nano";
+ }
+ const lastUserIdx = messages.length - 1;
+ const visionMessages = messages.map((m, i) => {
+ if (i === lastUserIdx && m.role === "user") {
+ return {
+ role: "user",
+ content: [
+ { type: "text", text: m.content },
+ { type: "image_url", image_url: { url: `data:image/png;base64,${screenshot}` } }
+ ]
+ };
  }
- } catch {
- return null;
+ return m;
+ });
+ const response = await fetch("https://openrouter.ai/api/v1/chat/completions", {
+ method: "POST",
+ headers: {
+ "Authorization": `Bearer ${apiKey}`,
+ "Content-Type": "application/json",
+ "HTTP-Referer": config.openrouter.siteUrl,
+ "X-Title": config.openrouter.appName
+ },
+ body: JSON.stringify({
+ model,
+ messages: [{ role: "system", content: systemPrompt }, ...visionMessages],
+ max_tokens: 2048
+ })
+ });
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`OpenRouter vision error: ${response.status} - ${error}`);
  }
+ const data = await response.json();
+ return { content: data.choices?.[0]?.message?.content || "", model };
  }
+ async function chatWithVisionOllama(messages, screenshot, systemPrompt) {
+ const config = getConfig();
+ const visionModels = ["llava", "llama3.2-vision", "bakllava"];
+ const model = visionModels.find((m) => config.model.includes(m)) || "llava";
+ const lastUserMsg = messages.filter((m) => m.role === "user").pop();
+ const response = await fetch(`${config.ollamaHost}/api/generate`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ model,
+ prompt: `${systemPrompt}

- // src/hooks/useChat.ts
- var WELCOME_MESSAGE = {
- id: "0",
- role: "system",
- content: "Welcome to C-napse! Type your message and press Enter.\n\nShortcuts: Ctrl+H for help, Ctrl+P for provider",
- timestamp: /* @__PURE__ */ new Date()
- };
- function useChat(screenWatch = false) {
- const [messages, setMessages] = useState3([WELCOME_MESSAGE]);
- const [isProcessing, setIsProcessing] = useState3(false);
- const [error, setError] = useState3(null);
- const screenContextRef = useRef(null);
- useEffect2(() => {
- if (!screenWatch) {
- screenContextRef.current = null;
- return;
+ User: ${lastUserMsg?.content || "What do you see?"}`,
+ images: [screenshot],
+ stream: false
+ })
+ });
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`Ollama vision error: ${error}`);
+ }
+ const data = await response.json();
+ return { content: data.response || "", model };
+ }
+ async function chatWithVisionAnthropic(messages, screenshot, systemPrompt) {
+ const apiKey = getApiKey("anthropic");
+ if (!apiKey) throw new Error("Anthropic API key not configured");
+ const chatMessages = messages.filter((m) => m.role !== "system");
+ const lastUserIdx = chatMessages.length - 1;
+ const visionMessages = chatMessages.map((m, i) => {
+ if (i === lastUserIdx && m.role === "user") {
+ return {
+ role: "user",
+ content: [
+ { type: "image", source: { type: "base64", media_type: "image/png", data: screenshot } },
+ { type: "text", text: m.content }
+ ]
+ };
  }
- const checkScreen = async () => {
- const desc = await getScreenDescription();
- if (desc) {
- screenContextRef.current = desc;
- }
- };
- checkScreen();
- const interval = setInterval(checkScreen, 5e3);
- return () => clearInterval(interval);
- }, [screenWatch]);
- const addSystemMessage = useCallback((content) => {
- setMessages((prev) => [
- ...prev,
- {
- id: Date.now().toString(),
- role: "system",
- content,
- timestamp: /* @__PURE__ */ new Date()
- }
- ]);
- }, []);
- const sendMessage = useCallback(async (content) => {
- if (!content.trim() || isProcessing) return;
- setError(null);
- const userMsg = {
- id: Date.now().toString(),
- role: "user",
- content,
- timestamp: /* @__PURE__ */ new Date()
- };
- const assistantId = (Date.now() + 1).toString();
- const assistantMsg = {
- id: assistantId,
- role: "assistant",
- content: "",
- timestamp: /* @__PURE__ */ new Date(),
- isStreaming: true
- };
- setMessages((prev) => [...prev, userMsg, assistantMsg]);
- setIsProcessing(true);
- try {
- const apiMessages = messages.filter((m) => m.role === "user" || m.role === "assistant").slice(-10).map((m) => ({ role: m.role, content: m.content }));
- let finalContent = content;
- if (screenWatch && screenContextRef.current) {
- finalContent = `[Screen context: ${screenContextRef.current}]
-
- ${content}`;
- }
- apiMessages.push({ role: "user", content: finalContent });
- const response = await chat(apiMessages);
- setMessages(
- (prev) => prev.map(
- (m) => m.id === assistantId ? { ...m, content: response.content || "(no response)", isStreaming: false } : m
- )
- );
- } catch (err2) {
- const errorMsg = err2 instanceof Error ? err2.message : "Unknown error";
- setError(errorMsg);
- setMessages(
- (prev) => prev.map(
- (m) => m.id === assistantId ? { ...m, content: `Error: ${errorMsg}`, isStreaming: false } : m
- )
- );
- } finally {
- setIsProcessing(false);
+ return { role: m.role, content: m.content };
+ });
+ const response = await fetch("https://api.anthropic.com/v1/messages", {
+ method: "POST",
+ headers: {
+ "x-api-key": apiKey,
+ "anthropic-version": "2023-06-01",
+ "Content-Type": "application/json"
+ },
+ body: JSON.stringify({
+ model: "claude-3-5-sonnet-20241022",
+ max_tokens: 2048,
+ system: systemPrompt,
+ messages: visionMessages
+ })
+ });
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`Anthropic vision error: ${error}`);
+ }
+ const data = await response.json();
+ return { content: data.content?.[0]?.text || "", model: "claude-3-5-sonnet-20241022" };
+ }
+ async function chatWithVisionOpenAI(messages, screenshot, systemPrompt) {
+ const apiKey = getApiKey("openai");
+ if (!apiKey) throw new Error("OpenAI API key not configured");
+ const lastUserIdx = messages.length - 1;
+ const visionMessages = messages.map((m, i) => {
+ if (i === lastUserIdx && m.role === "user") {
+ return {
+ role: "user",
+ content: [
+ { type: "text", text: m.content },
+ { type: "image_url", image_url: { url: `data:image/png;base64,${screenshot}` } }
+ ]
+ };
  }
- }, [messages, isProcessing, screenWatch]);
- const clearMessages = useCallback(() => {
- setMessages([WELCOME_MESSAGE]);
- setError(null);
- }, []);
- return {
- messages,
- isProcessing,
- error,
- sendMessage,
- addSystemMessage,
- clearMessages
- };
+ return m;
+ });
+ const response = await fetch("https://api.openai.com/v1/chat/completions", {
+ method: "POST",
+ headers: {
+ "Authorization": `Bearer ${apiKey}`,
+ "Content-Type": "application/json"
+ },
+ body: JSON.stringify({
+ model: "gpt-4o",
+ messages: [{ role: "system", content: systemPrompt }, ...visionMessages],
+ max_tokens: 2048
+ })
+ });
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`OpenAI vision error: ${error}`);
+ }
+ const data = await response.json();
+ return { content: data.choices?.[0]?.message?.content || "", model: "gpt-4o" };
  }

- // src/hooks/useVision.ts
- import { useState as useState4, useCallback as useCallback2 } from "react";
-
  // src/lib/vision.ts
  async function describeScreen() {
  const screenshot = await captureScreenshot();
@@ -893,17 +922,17 @@ async function captureScreenshot() {
  }
  }
  async function captureScreenFallback() {
- const { exec: exec7 } = await import("child_process");
- const { promisify: promisify7 } = await import("util");
+ const { exec: exec6 } = await import("child_process");
+ const { promisify: promisify6 } = await import("util");
  const { tmpdir } = await import("os");
  const { join: join2 } = await import("path");
  const { readFile, unlink } = await import("fs/promises");
- const execAsync7 = promisify7(exec7);
+ const execAsync6 = promisify6(exec6);
  const tempFile = join2(tmpdir(), `cnapse-screen-${Date.now()}.png`);
  try {
  const platform = process.platform;
  if (platform === "win32") {
- await execAsync7(`
+ await execAsync6(`
  Add-Type -AssemblyName System.Windows.Forms
  $screen = [System.Windows.Forms.Screen]::PrimaryScreen.Bounds
  $bitmap = New-Object System.Drawing.Bitmap($screen.Width, $screen.Height)
@@ -914,9 +943,9 @@ async function captureScreenFallback() {
  $bitmap.Dispose()
  `, { shell: "powershell.exe" });
  } else if (platform === "darwin") {
- await execAsync7(`screencapture -x "${tempFile}"`);
+ await execAsync6(`screencapture -x "${tempFile}"`);
  } else {
- await execAsync7(`gnome-screenshot -f "${tempFile}" 2>/dev/null || scrot "${tempFile}" 2>/dev/null || import -window root "${tempFile}"`);
+ await execAsync6(`gnome-screenshot -f "${tempFile}" 2>/dev/null || scrot "${tempFile}" 2>/dev/null || import -window root "${tempFile}"`);
  }
  const imageBuffer = await readFile(tempFile);
  await unlink(tempFile).catch(() => {
@@ -1077,7 +1106,98 @@ async function analyzeWithOpenAI(base64Image, prompt) {
  return data.choices?.[0]?.message?.content || "Unable to analyze image";
  }

+ // src/hooks/useChat.ts
+ var WELCOME_MESSAGE = {
+ id: "0",
+ role: "system",
+ content: "Welcome to C-napse! Type your message and press Enter.\n\nShortcuts: Ctrl+H for help, Ctrl+P for provider",
+ timestamp: /* @__PURE__ */ new Date()
+ };
+ function useChat(screenWatch = false) {
+ const [messages, setMessages] = useState3([WELCOME_MESSAGE]);
+ const [isProcessing, setIsProcessing] = useState3(false);
+ const [error, setError] = useState3(null);
+ const screenWatchRef = useRef(screenWatch);
+ useEffect2(() => {
+ screenWatchRef.current = screenWatch;
+ }, [screenWatch]);
+ const addSystemMessage = useCallback((content) => {
+ setMessages((prev) => [
+ ...prev,
+ {
+ id: Date.now().toString(),
+ role: "system",
+ content,
+ timestamp: /* @__PURE__ */ new Date()
+ }
+ ]);
+ }, []);
+ const sendMessage = useCallback(async (content) => {
+ if (!content.trim() || isProcessing) return;
+ setError(null);
+ const userMsg = {
+ id: Date.now().toString(),
+ role: "user",
+ content,
+ timestamp: /* @__PURE__ */ new Date()
+ };
+ const assistantId = (Date.now() + 1).toString();
+ const assistantMsg = {
+ id: assistantId,
+ role: "assistant",
+ content: "",
+ timestamp: /* @__PURE__ */ new Date(),
+ isStreaming: true
+ };
+ setMessages((prev) => [...prev, userMsg, assistantMsg]);
+ setIsProcessing(true);
+ try {
+ const apiMessages = messages.filter((m) => m.role === "user" || m.role === "assistant").slice(-10).map((m) => ({ role: m.role, content: m.content }));
+ apiMessages.push({ role: "user", content });
+ let response;
+ if (screenWatchRef.current) {
+ const screenshot = await captureScreenshot();
+ if (screenshot) {
+ response = await chatWithVision(apiMessages, screenshot);
+ } else {
+ response = await chat(apiMessages);
+ }
+ } else {
+ response = await chat(apiMessages);
+ }
+ setMessages(
+ (prev) => prev.map(
+ (m) => m.id === assistantId ? { ...m, content: response.content || "(no response)", isStreaming: false } : m
+ )
+ );
+ } catch (err2) {
+ const errorMsg = err2 instanceof Error ? err2.message : "Unknown error";
+ setError(errorMsg);
+ setMessages(
+ (prev) => prev.map(
+ (m) => m.id === assistantId ? { ...m, content: `Error: ${errorMsg}`, isStreaming: false } : m
+ )
+ );
+ } finally {
+ setIsProcessing(false);
+ }
+ }, [messages, isProcessing]);
+ const clearMessages = useCallback(() => {
+ setMessages([WELCOME_MESSAGE]);
+ setError(null);
+ }, []);
+ return {
+ messages,
+ isProcessing,
+ error,
+ sendMessage,
+ addSystemMessage,
+ clearMessages
+ };
+ }
+
  // src/hooks/useVision.ts
+ import { useState as useState4, useCallback as useCallback2 } from "react";
  function useVision() {
  const [isAnalyzing, setIsAnalyzing] = useState4(false);
  const [lastDescription, setLastDescription] = useState4(null);
@@ -1115,21 +1235,21 @@ import { useState as useState5, useCallback as useCallback3, useEffect as useEff
  import { EventEmitter } from "events";

  // src/tools/shell.ts
- import { exec as exec6 } from "child_process";
- import { promisify as promisify6 } from "util";
+ import { exec as exec5 } from "child_process";
+ import { promisify as promisify5 } from "util";

  // src/tools/clipboard.ts
  import clipboardy from "clipboardy";

  // src/tools/process.ts
+ import { exec as exec3 } from "child_process";
+ import { promisify as promisify3 } from "util";
+ var execAsync3 = promisify3(exec3);
+
+ // src/tools/computer.ts
  import { exec as exec4 } from "child_process";
  import { promisify as promisify4 } from "util";
  var execAsync4 = promisify4(exec4);
-
- // src/tools/computer.ts
- import { exec as exec5 } from "child_process";
- import { promisify as promisify5 } from "util";
- var execAsync5 = promisify5(exec5);
  async function clickMouse(button = "left") {
  try {
  if (process.platform === "win32") {
@@ -1139,12 +1259,12 @@ Add-Type -MemberDefinition @"
  public static extern void mouse_event(long dwFlags, long dx, long dy, long cButtons, long dwExtraInfo);
  "@ -Name Mouse -Namespace Win32
  ${button === "left" ? "[Win32.Mouse]::mouse_event(0x02, 0, 0, 0, 0); [Win32.Mouse]::mouse_event(0x04, 0, 0, 0, 0)" : button === "right" ? "[Win32.Mouse]::mouse_event(0x08, 0, 0, 0, 0); [Win32.Mouse]::mouse_event(0x10, 0, 0, 0, 0)" : "[Win32.Mouse]::mouse_event(0x20, 0, 0, 0, 0); [Win32.Mouse]::mouse_event(0x40, 0, 0, 0, 0)"}`;
- await execAsync5(`powershell -Command "${script.replace(/\n/g, " ")}"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "${script.replace(/\n/g, " ")}"`, { shell: "cmd.exe" });
  } else if (process.platform === "darwin") {
- await execAsync5(`cliclick c:.`);
+ await execAsync4(`cliclick c:.`);
  } else {
  const btn = button === "left" ? "1" : button === "right" ? "3" : "2";
- await execAsync5(`xdotool click ${btn}`);
+ await execAsync4(`xdotool click ${btn}`);
  }
  return ok(`Clicked ${button} button`);
  } catch (error) {
@@ -1155,13 +1275,13 @@ async function typeText(text) {
  try {
  if (process.platform === "win32") {
  const escapedText = text.replace(/'/g, "''").replace(/[+^%~(){}[\]]/g, "{$&}");
- await execAsync5(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${escapedText}')"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${escapedText}')"`, { shell: "cmd.exe" });
  } else if (process.platform === "darwin") {
  const escaped = text.replace(/'/g, "'\\''");
- await execAsync5(`osascript -e 'tell application "System Events" to keystroke "${escaped}"'`);
+ await execAsync4(`osascript -e 'tell application "System Events" to keystroke "${escaped}"'`);
  } else {
  const escaped = text.replace(/'/g, "'\\''");
- await execAsync5(`xdotool type '${escaped}'`);
+ await execAsync4(`xdotool type '${escaped}'`);
  }
  return ok(`Typed: ${text}`);
  } catch (error) {
@@ -1202,7 +1322,7 @@ async function pressKey(key) {
  "f12": "{F12}"
  };
  const winKey = winKeyMap[key.toLowerCase()] || key;
- await execAsync5(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${winKey}')"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${winKey}')"`, { shell: "cmd.exe" });
  } else if (process.platform === "darwin") {
  const macKeyMap = {
  "return": 36,
@@ -1220,12 +1340,12 @@ async function pressKey(key) {
  };
  const keyCode = macKeyMap[key.toLowerCase()];
  if (keyCode) {
- await execAsync5(`osascript -e 'tell application "System Events" to key code ${keyCode}'`);
+ await execAsync4(`osascript -e 'tell application "System Events" to key code ${keyCode}'`);
  } else {
- await execAsync5(`osascript -e 'tell application "System Events" to keystroke "${key}"'`);
+ await execAsync4(`osascript -e 'tell application "System Events" to keystroke "${key}"'`);
  }
  } else {
- await execAsync5(`xdotool key ${key}`);
+ await execAsync4(`xdotool key ${key}`);
  }
  return ok(`Pressed: ${key}`);
  } catch (error) {
@@ -1238,7 +1358,7 @@ async function keyCombo(keys) {
  const hasWin = keys.some((k) => k.toLowerCase() === "meta" || k.toLowerCase() === "win");
  const hasR = keys.some((k) => k.toLowerCase() === "r");
  if (hasWin && hasR) {
- await execAsync5(`powershell -Command "$shell = New-Object -ComObject WScript.Shell; $shell.Run('explorer shell:::{2559a1f3-21d7-11d4-bdaf-00c04f60b9f0}')"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "$shell = New-Object -ComObject WScript.Shell; $shell.Run('explorer shell:::{2559a1f3-21d7-11d4-bdaf-00c04f60b9f0}')"`, { shell: "cmd.exe" });
  return ok(`Pressed: ${keys.join("+")}`);
  }
  const modifierMap = {
@@ -1258,7 +1378,7 @@ async function keyCombo(keys) {
  }
  }
  combo += regularKeys.join("");
- await execAsync5(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${combo}')"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${combo}')"`, { shell: "cmd.exe" });
  } else if (process.platform === "darwin") {
  const modifiers = keys.filter((k) => ["control", "ctrl", "alt", "shift", "command", "meta"].includes(k.toLowerCase()));
  const regular = keys.filter((k) => !["control", "ctrl", "alt", "shift", "command", "meta"].includes(k.toLowerCase()));
@@ -1274,9 +1394,9 @@ async function keyCombo(keys) {
  };
  cmd += " using {" + modifiers.map((m) => modMap[m.toLowerCase()]).join(", ") + "}";
  }
- await execAsync5(`osascript -e '${cmd}'`);
+ await execAsync4(`osascript -e '${cmd}'`);
  } else {
- await execAsync5(`xdotool key ${keys.join("+")}`);
+ await execAsync4(`xdotool key ${keys.join("+")}`);
  }
  return ok(`Pressed: ${keys.join("+")}`);
  } catch (error) {
@@ -1287,11 +1407,11 @@ async function focusWindow(title) {
  try {
  if (process.platform === "win32") {
  const escaped = title.replace(/'/g, "''");
- await execAsync5(`powershell -Command "$wshell = New-Object -ComObject wscript.shell; $wshell.AppActivate('${escaped}')"`, { shell: "cmd.exe" });
+ await execAsync4(`powershell -Command "$wshell = New-Object -ComObject wscript.shell; $wshell.AppActivate('${escaped}')"`, { shell: "cmd.exe" });
  } else if (process.platform === "darwin") {
- await execAsync5(`osascript -e 'tell application "${title}" to activate'`);
+ await execAsync4(`osascript -e 'tell application "${title}" to activate'`);
  } else {
- await execAsync5(`wmctrl -a "${title}"`);
+ await execAsync4(`wmctrl -a "${title}"`);
  }
  return ok(`Focused window: ${title}`);
  } catch (error) {
@@ -1308,13 +1428,13 @@ function err(error) {
  }

  // src/tools/shell.ts
- var execAsync6 = promisify6(exec6);
+ var execAsync5 = promisify5(exec5);
  async function runCommand(cmd, timeout = 3e4) {
  try {
  const isWindows = process.platform === "win32";
  const shell = isWindows ? "cmd.exe" : "/bin/sh";
  const shellArg = isWindows ? "/C" : "-c";
- const { stdout, stderr } = await execAsync6(cmd, {
+ const { stdout, stderr } = await execAsync5(cmd, {
  shell,
  timeout,
  maxBuffer: 10 * 1024 * 1024
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@projectservan8n/cnapse",
- "version": "0.5.6",
+ "version": "0.5.7",
  "description": "Autonomous PC intelligence - AI assistant for desktop automation",
  "type": "module",
  "main": "dist/index.js",
package/src/hooks/useChat.ts CHANGED
@@ -3,8 +3,8 @@
  */

  import { useState, useCallback, useRef, useEffect } from 'react';
- import { chat, Message } from '../lib/api.js';
- import { getScreenDescription } from '../lib/screen.js';
+ import { chat, chatWithVision, Message } from '../lib/api.js';
+ import { captureScreenshot } from '../lib/vision.js';

  export interface ChatMessage {
  id: string;
@@ -34,25 +34,11 @@ export function useChat(screenWatch: boolean = false): UseChatResult {
  const [messages, setMessages] = useState<ChatMessage[]>([WELCOME_MESSAGE]);
  const [isProcessing, setIsProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
- const screenContextRef = useRef<string | null>(null);
+ const screenWatchRef = useRef(screenWatch);

- // Screen watching effect
+ // Keep ref in sync with prop
  useEffect(() => {
- if (!screenWatch) {
- screenContextRef.current = null;
- return;
- }
-
- const checkScreen = async () => {
- const desc = await getScreenDescription();
- if (desc) {
- screenContextRef.current = desc;
- }
- };
-
- checkScreen();
- const interval = setInterval(checkScreen, 5000);
- return () => clearInterval(interval);
+ screenWatchRef.current = screenWatch;
  }, [screenWatch]);

  const addSystemMessage = useCallback((content: string) => {
@@ -100,16 +86,23 @@ export function useChat(screenWatch: boolean = false): UseChatResult {
  .slice(-10)
  .map(m => ({ role: m.role as 'user' | 'assistant', content: m.content }));

- // Add screen context if watching
- let finalContent = content;
- if (screenWatch && screenContextRef.current) {
- finalContent = `[Screen context: ${screenContextRef.current}]\n\n${content}`;
+ apiMessages.push({ role: 'user', content });
+
+ let response;
+
+ // If screen watching is enabled, capture screenshot and use vision API
+ if (screenWatchRef.current) {
+ const screenshot = await captureScreenshot();
+ if (screenshot) {
+ response = await chatWithVision(apiMessages, screenshot);
+ } else {
+ // Fallback to regular chat if screenshot fails
+ response = await chat(apiMessages);
+ }
+ } else {
+ response = await chat(apiMessages);
  }

- apiMessages.push({ role: 'user', content: finalContent });
-
- const response = await chat(apiMessages);
-
  // Update assistant message
  setMessages(prev =>
  prev.map(m =>
@@ -131,7 +124,7 @@ export function useChat(screenWatch: boolean = false): UseChatResult {
  } finally {
  setIsProcessing(false);
  }
- }, [messages, isProcessing, screenWatch]);
+ }, [messages, isProcessing]);

  const clearMessages = useCallback(() => {
  setMessages([WELCOME_MESSAGE]);
package/src/lib/api.ts CHANGED
@@ -184,6 +184,194 @@ async function chatOpenAI(messages: Message[], model: string): Promise<ChatRespo
  return { content, model };
  }

+ /**
+ * Chat with vision - sends screenshot along with messages
+ */
+ export async function chatWithVision(messages: Message[], screenshotBase64: string): Promise<ChatResponse> {
+ const config = getConfig();
+ const systemPrompt = await getSystemPrompt();
+
+ // Add vision context to system prompt
+ const visionPrompt = systemPrompt + '\n\nYou can see the user\'s screen. Describe what you see and help them with their request.';
+
+ switch (config.provider) {
+ case 'openrouter':
+ return chatWithVisionOpenRouter(messages, screenshotBase64, visionPrompt);
+ case 'ollama':
+ return chatWithVisionOllama(messages, screenshotBase64, visionPrompt);
+ case 'anthropic':
+ return chatWithVisionAnthropic(messages, screenshotBase64, visionPrompt);
+ case 'openai':
+ return chatWithVisionOpenAI(messages, screenshotBase64, visionPrompt);
+ default:
+ throw new Error(`Vision not supported for provider: ${config.provider}`);
+ }
+ }
+
+ async function chatWithVisionOpenRouter(messages: Message[], screenshot: string, systemPrompt: string): Promise<ChatResponse> {
+ const apiKey = getApiKey('openrouter');
+ if (!apiKey) throw new Error('OpenRouter API key not configured');
+
+ const config = getConfig();
+
+ // Use vision-capable model - prefer GPT-5 Nano or Claude
+ let model = config.model;
+ if (!model.includes('gpt-5') && !model.includes('claude') && !model.includes('gemini')) {
+ model = 'openai/gpt-5-nano'; // Default to GPT-5 Nano for vision
+ }
+
+ // Build messages with image in the last user message
+ const lastUserIdx = messages.length - 1;
+ const visionMessages = messages.map((m, i) => {
+ if (i === lastUserIdx && m.role === 'user') {
+ return {
+ role: 'user',
+ content: [
+ { type: 'text', text: m.content },
+ { type: 'image_url', image_url: { url: `data:image/png;base64,${screenshot}` } },
+ ],
+ };
+ }
+ return m;
+ });
+
+ const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
+ method: 'POST',
+ headers: {
+ 'Authorization': `Bearer ${apiKey}`,
+ 'Content-Type': 'application/json',
+ 'HTTP-Referer': config.openrouter.siteUrl,
+ 'X-Title': config.openrouter.appName,
+ },
+ body: JSON.stringify({
+ model,
+ messages: [{ role: 'system', content: systemPrompt }, ...visionMessages],
+ max_tokens: 2048,
+ }),
+ });
+
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`OpenRouter vision error: ${response.status} - ${error}`);
+ }
+
+ const data = await response.json() as any;
+ return { content: data.choices?.[0]?.message?.content || '', model };
+ }
+
+ async function chatWithVisionOllama(messages: Message[], screenshot: string, systemPrompt: string): Promise<ChatResponse> {
+ const config = getConfig();
+
+ // Use vision model
+ const visionModels = ['llava', 'llama3.2-vision', 'bakllava'];
+ const model = visionModels.find(m => config.model.includes(m)) || 'llava';
+
+ const lastUserMsg = messages.filter(m => m.role === 'user').pop();
+
+ const response = await fetch(`${config.ollamaHost}/api/generate`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ model,
+ prompt: `${systemPrompt}\n\nUser: ${lastUserMsg?.content || 'What do you see?'}`,
+ images: [screenshot],
+ stream: false,
+ }),
+ });
+
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`Ollama vision error: ${error}`);
+ }
+
+ const data = await response.json() as any;
+ return { content: data.response || '', model };
+ }
+
+ async function chatWithVisionAnthropic(messages: Message[], screenshot: string, systemPrompt: string): Promise<ChatResponse> {
+ const apiKey = getApiKey('anthropic');
+ if (!apiKey) throw new Error('Anthropic API key not configured');
+
+ const chatMessages = messages.filter(m => m.role !== 'system');
+ const lastUserIdx = chatMessages.length - 1;
+
+ const visionMessages = chatMessages.map((m, i) => {
+ if (i === lastUserIdx && m.role === 'user') {
+ return {
+ role: 'user',
+ content: [
+ { type: 'image', source: { type: 'base64', media_type: 'image/png', data: screenshot } },
+ { type: 'text', text: m.content },
+ ],
+ };
+ }
+ return { role: m.role, content: m.content };
+ });
+
+ const response = await fetch('https://api.anthropic.com/v1/messages', {
+ method: 'POST',
+ headers: {
+ 'x-api-key': apiKey,
+ 'anthropic-version': '2023-06-01',
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ model: 'claude-3-5-sonnet-20241022',
+ max_tokens: 2048,
+ system: systemPrompt,
+ messages: visionMessages,
+ }),
+ });
+
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`Anthropic vision error: ${error}`);
+ }
+
+ const data = await response.json() as any;
+ return { content: data.content?.[0]?.text || '', model: 'claude-3-5-sonnet-20241022' };
+ }
+
+ async function chatWithVisionOpenAI(messages: Message[], screenshot: string, systemPrompt: string): Promise<ChatResponse> {
+ const apiKey = getApiKey('openai');
+ if (!apiKey) throw new Error('OpenAI API key not configured');
+
+ const lastUserIdx = messages.length - 1;
+ const visionMessages = messages.map((m, i) => {
+ if (i === lastUserIdx && m.role === 'user') {
+ return {
+ role: 'user',
+ content: [
+ { type: 'text', text: m.content },
+ { type: 'image_url', image_url: { url: `data:image/png;base64,${screenshot}` } },
+ ],
+ };
+ }
+ return m;
+ });
+
+ const response = await fetch('https://api.openai.com/v1/chat/completions', {
+ method: 'POST',
+ headers: {
+ 'Authorization': `Bearer ${apiKey}`,
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ model: 'gpt-4o',
+ messages: [{ role: 'system', content: systemPrompt }, ...visionMessages],
+ max_tokens: 2048,
+ }),
+ });
+
+ if (!response.ok) {
+ const error = await response.text();
+ throw new Error(`OpenAI vision error: ${error}`);
+ }
+
+ const data = await response.json() as any;
+ return { content: data.choices?.[0]?.message?.content || '', model: 'gpt-4o' };
+ }
+
  export async function testConnection(): Promise<boolean> {
  try {
  await chat([{ role: 'user', content: 'hi' }]);
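
For context: a minimal sketch of how the new vision path might be driven from calling code, using only signatures visible in this diff - chatWithVision(messages, screenshotBase64) and chat(messages) from src/lib/api.ts, and captureScreenshot() from src/lib/vision. The helper name, import paths, and logging below are illustrative assumptions, not part of the package.

import { chat, chatWithVision, Message } from './lib/api.js';
import { captureScreenshot } from './lib/vision.js';

// Hypothetical helper: ask a question about the current screen, degrading to
// text-only chat when the screenshot cannot be captured (the same
// screenshot-or-fallback pattern useChat now uses per message).
async function askAboutScreen(question: string): Promise<void> {
  const messages: Message[] = [{ role: 'user', content: question }];
  const screenshot = await captureScreenshot(); // base64 PNG; falsy on failure
  const response = screenshot
    ? await chatWithVision(messages, screenshot)
    : await chat(messages);
  console.log(`[${response.model}] ${response.content}`);
}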