nothumanallowed 9.8.1 → 9.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nothumanallowed",
3
- "version": "9.8.1",
3
+ "version": "9.8.2",
4
4
  "description": "NotHumanAllowed — 38 AI agents, 53 tools. Email, calendar, browser automation, screen capture, canvas, cron/heartbeat, GitHub, Notion, Slack, voice chat, 28 languages. Zero-dependency CLI.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -718,12 +718,31 @@ export async function cmdChat(args) {
718
718
  const result = await executeTool(action, params, config);
719
719
  process.stdout.write('\r' + ' '.repeat(80) + '\r');
720
720
 
721
- // Show action-specific result header
722
- const resultHeader = formatToolResult(action, params, result);
723
- console.log(` ${resultHeader}`);
724
- console.log(` ${result.split('\n').join('\n ')}\n`);
725
-
726
- addMessages(conv, input, response + `\n\n[Tool ${action} executed. Result: ${result}]`);
721
+ // Handle screen capture vision result
722
+ if (result && typeof result === 'object' && result.__screenshot) {
723
+ console.log(` ${G}Screenshot captured${NC} — analyzing with vision...\n`);
724
+ try {
725
+ const { callLLMVision } = await import('../services/llm.mjs');
726
+ const visionResponse = await callLLMVision(config,
727
+ 'Describe EXACTLY and ONLY what you see in this screenshot. NEVER invent or fabricate details.',
728
+ `The user said: "${input}"\n\n${result.question}`,
729
+ { base64: result.base64, mimeType: 'image/png' }
730
+ );
731
+ console.log(` ${visionResponse.split('\n').join('\n ')}\n`);
732
+ addMessages(conv, input, response + `\n\n[Screenshot: ${result.path}]\n${visionResponse}`);
733
+ } catch (visionErr) {
734
+ console.log(` ${R}Vision failed: ${visionErr.message}${NC}\n`);
735
+ addMessages(conv, input, response + `\n\n[Screenshot captured but vision failed: ${visionErr.message}]`);
736
+ }
737
+ } else {
738
+ // Show action-specific result header
739
+ const resultStr = typeof result === 'string' ? result : JSON.stringify(result);
740
+ const resultHeader = formatToolResult(action, params, resultStr);
741
+ console.log(` ${resultHeader}`);
742
+ console.log(` ${resultStr.split('\n').join('\n ')}\n`);
743
+
744
+ addMessages(conv, input, response + `\n\n[Tool ${action} executed. Result: ${resultStr}]`);
745
+ }
727
746
  } catch (err) {
728
747
  process.stdout.write('\r' + ' '.repeat(80) + '\r');
729
748
  console.log(` ${R}Error executing ${action}: ${err.message}${NC}\n`);
@@ -1053,18 +1053,49 @@ export async function cmdUI(args) {
1053
1053
 
1054
1054
  // Execute ALL tool actions and collect results
1055
1055
  const toolResults = [];
1056
+ let screenshotData = null; // For vision: { base64, path, question }
1056
1057
  for (const { action, params } of actions) {
1057
1058
  try {
1058
1059
  const result = await executeTool(action, params, config);
1059
- toolResults.push({ action, result: typeof result === 'object' ? JSON.stringify(result) : String(result) });
1060
+ // Intercept structured screenshot result for vision flow
1061
+ if (result && typeof result === 'object' && result.__screenshot) {
1062
+ screenshotData = result;
1063
+ toolResults.push({ action, result: 'Screenshot captured. Analyzing with vision...' });
1064
+ } else {
1065
+ toolResults.push({ action, result: typeof result === 'object' ? JSON.stringify(result) : String(result) });
1066
+ }
1060
1067
  } catch (e) {
1061
1068
  toolResults.push({ action, result: `Error: ${e.message}` });
1062
1069
  }
1063
1070
  }
1064
1071
 
1065
1072
  let fullResponse;
1066
- if (toolResults.length > 0) {
1067
- // Second LLM call with real tool results — forces the LLM to use actual data
1073
+ if (screenshotData && screenshotData.base64) {
1074
+ // VISION FLOW: send screenshot to LLM as image via callLLMVision
1075
+ try {
1076
+ const { callLLMVision } = await import('../services/llm.mjs');
1077
+ const visionPrompt = enrichedSystemPrompt + '\n\nIMPORTANT: You are looking at a REAL screenshot from the user\'s screen. Describe ONLY what you ACTUALLY see. NEVER invent, guess, or fabricate details. If something is unclear, say so.';
1078
+ const question = `The user said: "${body.message}"\n\n${screenshotData.question}\n\nDescribe ONLY what you see. NEVER make up information.`;
1079
+ fullResponse = await callLLMVision(config, visionPrompt, question, {
1080
+ base64: screenshotData.base64,
1081
+ mimeType: 'image/png',
1082
+ });
1083
+ } catch (visionErr) {
1084
+ fullResponse = `I captured a screenshot but vision analysis failed: ${visionErr.message}. To use screen analysis, configure a vision-capable provider (Claude, GPT-4, Gemini).`;
1085
+ }
1086
+ // Prepend screenshot file marker for the UI to display inline
1087
+ if (screenshotData.path) {
1088
+ const fname = screenshotData.path.split('/').pop();
1089
+ // Copy to NHA screenshots dir for persistence
1090
+ try {
1091
+ const ssDir = path.join(NHA_DIR, 'screenshots');
1092
+ fs.mkdirSync(ssDir, { recursive: true });
1093
+ fs.copyFileSync(screenshotData.path, path.join(ssDir, fname));
1094
+ } catch {}
1095
+ fullResponse = `![Screenshot](/api/screenshots/${fname})\n\n${fullResponse}`;
1096
+ }
1097
+ } else if (toolResults.length > 0) {
1098
+ // Standard tool results flow
1068
1099
  const toolContext = toolResults.map(t => {
1069
1100
  let clean = t.result.replace(/\[Screenshot[^\]]*\]/g, '').replace(/!\[.*?\]\(data:image[^)]+\)/g, '').slice(0, 3000);
1070
1101
  return `[${t.action} result]: ${clean.trim()}`;
@@ -1073,7 +1104,6 @@ export async function cmdUI(args) {
1073
1104
  try {
1074
1105
  fullResponse = await callLLM(config, enrichedSystemPrompt, followUp);
1075
1106
  } catch {
1076
- // Fallback: show raw results
1077
1107
  fullResponse = toolResults.map(t => `${t.action}: ${t.result}`).join('\n\n');
1078
1108
  }
1079
1109
  } else {
@@ -1325,6 +1355,40 @@ export async function cmdUI(args) {
1325
1355
  }
1326
1356
 
1327
1357
  const result = await executeTool(action, params, config);
1358
+
1359
+ // Intercept screen capture vision result
1360
+ if (result && typeof result === 'object' && result.__screenshot) {
1361
+ sendSSE('tool', { action, status: 'analyzing_screen' });
1362
+ try {
1363
+ const { callLLMVision } = await import('../services/llm.mjs');
1364
+ const visionPrompt = enrichedPrompt + '\\n\\nIMPORTANT: You are looking at a REAL screenshot. Describe ONLY what you ACTUALLY see. NEVER invent or fabricate.';
1365
+ const question = `The user said: "${msg}"\\n\\n${result.question}`;
1366
+ const visionResponse = await callLLMVision(config, visionPrompt, question, { base64: result.base64, mimeType: 'image/png' });
1367
+
1368
+ // Save screenshot for display
1369
+ const fname = result.path.split('/').pop();
1370
+ const ssDir = path.join(NHA_DIR, 'screenshots');
1371
+ fs.mkdirSync(ssDir, { recursive: true });
1372
+ try { fs.copyFileSync(result.path, path.join(ssDir, fname)); } catch {}
1373
+
1374
+ // Send screenshot to client
1375
+ sendSSE('screenshot', { filename: fname, format: 'png' });
1376
+ if (!res._screenshotFiles) res._screenshotFiles = [];
1377
+ res._screenshotFiles.push(fname);
1378
+
1379
+ // Send vision analysis as tokens
1380
+ sendSSE('tool_synthesis', {});
1381
+ sendSSE('token', { content: visionResponse });
1382
+
1383
+ toolResults.push({ action, result: `[Screen analyzed] ${visionResponse.slice(0, 500)}` });
1384
+ sendSSE('tool', { action, status: 'done', result: 'Screen analyzed' });
1385
+ } catch (visionErr) {
1386
+ toolResults.push({ action, result: `Vision failed: ${visionErr.message}` });
1387
+ sendSSE('tool', { action, status: 'error', error: visionErr.message });
1388
+ }
1389
+ continue;
1390
+ }
1391
+
1328
1392
  const resultStr = typeof result === 'object' ? JSON.stringify(result) : String(result);
1329
1393
  toolResults.push({ action, result: resultStr });
1330
1394
  sendSSE('tool', { action, status: 'done', result: typeof resultStr === 'string' ? resultStr.slice(0, 500) : '' });
package/src/constants.mjs CHANGED
@@ -5,7 +5,7 @@ import { fileURLToPath } from 'url';
5
5
  const __filename = fileURLToPath(import.meta.url);
6
6
  const __dirname = path.dirname(__filename);
7
7
 
8
- export const VERSION = '9.8.1';
8
+ export const VERSION = '9.8.2';
9
9
  export const BASE_URL = 'https://nothumanallowed.com/cli';
10
10
  export const API_BASE = 'https://nothumanallowed.com/api/v1';
11
11
 
@@ -275,6 +275,96 @@ export async function callLLM(config, systemPrompt, userMessage, opts = {}) {
275
275
  return callFn(apiKey, model, systemPrompt, userMessage, false);
276
276
  }
277
277
 
278
/**
 * Call an LLM with a multimodal (vision) message: one media attachment plus a
 * text question about it. Shared entry point for every vision call.
 *
 * Provider routing (taken from `config.llm.provider`, default `anthropic`):
 *  - `anthropic`  → Messages API; PDFs are sent as `document` blocks, anything
 *                   else as `image` blocks.
 *  - `openai`, `deepseek`, `grok`, `mistral` → OpenAI-style chat completions
 *                   with the media embedded as a `data:` URI in `image_url`.
 *  - `gemini`     → `generateContent` with the media as `inline_data`.
 *
 * @param {object} config — must expose `config.llm.provider` / `config.llm.model`;
 *   the API key is resolved through `getApiKey(config, provider)`.
 * @param {string} systemPrompt — system instruction sent to the provider.
 * @param {string} userMessage — text question about the attached media.
 * @param {{ base64: string, mimeType: string }} media — base64 payload and its
 *   MIME type (e.g. image/png, image/jpeg, application/pdf).
 * @returns {Promise<string>} concatenated text of the reply ('' when the
 *   provider returned no text).
 * @throws {Error} when no API key is available, `media` is incomplete, no model
 *   can be resolved for the provider, the provider is unsupported, or the HTTP
 *   response is not OK (message carries status and up to 300 chars of the body).
 */
export async function callLLMVision(config, systemPrompt, userMessage, media) {
  const provider = config.llm.provider || 'anthropic';
  const model = config.llm.model || null;
  const apiKey = getApiKey(config, provider);
  if (!apiKey) throw new Error(`No API key for ${provider}. Vision requires Claude, GPT-4, or Gemini.`);

  // `media ?? {}` so a missing argument reaches the validation error below
  // instead of failing with an opaque destructuring TypeError.
  const { base64, mimeType } = media ?? {};
  if (!base64 || !mimeType) throw new Error('media.base64 and media.mimeType are required');

  // POST a JSON body and return the parsed JSON reply; `label` prefixes the
  // error message so callers can tell which provider failed.
  const postJson = async (label, url, authHeaders, payload) => {
    const res = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', ...authHeaders },
      body: JSON.stringify(payload),
    });
    if (!res.ok) throw new Error(`${label} vision ${res.status}: ${(await res.text()).slice(0, 300)}`);
    return res.json();
  };

  // Concatenate the `text` of every block that carries one. Replies may hold
  // several blocks, and the first one is not guaranteed to be text.
  const joinText = (blocks) =>
    (Array.isArray(blocks) ? blocks : [])
      .map((block) => (typeof block?.text === 'string' ? block.text : ''))
      .join('');

  if (provider === 'anthropic') {
    const source = { type: 'base64', media_type: mimeType, data: base64 };
    const contentBlock = mimeType === 'application/pdf'
      ? { type: 'document', source }
      : { type: 'image', source };

    const data = await postJson(
      'Anthropic',
      'https://api.anthropic.com/v1/messages',
      { 'x-api-key': apiKey, 'anthropic-version': '2023-06-01' },
      {
        model: model || 'claude-sonnet-4-20250514',
        max_tokens: 4096,
        system: systemPrompt,
        messages: [{ role: 'user', content: [contentBlock, { type: 'text', text: userMessage }] }],
      },
    );
    return joinText(data.content);
  }

  const chatEndpoints = new Map([
    ['openai', 'https://api.openai.com/v1/chat/completions'],
    ['deepseek', 'https://api.deepseek.com/v1/chat/completions'],
    ['grok', 'https://api.x.ai/v1/chat/completions'],
    ['mistral', 'https://api.mistral.ai/v1/chat/completions'],
  ]);

  if (chatEndpoints.has(provider)) {
    // Only OpenAI has a built-in default; for the other providers a missing
    // model used to be sent as `model: null`, which the API rejects with an
    // unhelpful error. Fail early with an actionable message instead.
    const chatModel = model || (provider === 'openai' ? 'gpt-4o' : null);
    if (!chatModel) {
      throw new Error(`No model configured for ${provider}. Set config.llm.model to a vision-capable model.`);
    }

    // NOTE(review): non-image media (e.g. application/pdf) is still sent as an
    // image_url data URI here — confirm the target provider accepts that.
    const data = await postJson(
      provider,
      chatEndpoints.get(provider),
      { 'Authorization': `Bearer ${apiKey}` },
      {
        model: chatModel,
        max_tokens: 4096,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: [
            { type: 'image_url', image_url: { url: `data:${mimeType};base64,${base64}` } },
            { type: 'text', text: userMessage },
          ] },
        ],
      },
    );
    return data.choices?.[0]?.message?.content || '';
  }

  if (provider === 'gemini') {
    const geminiModel = model || 'gemini-2.0-flash';
    // The key travels in a header rather than the query string so it does not
    // end up in URLs captured by logs, proxies or error traces.
    const data = await postJson(
      'Gemini',
      `https://generativelanguage.googleapis.com/v1beta/models/${geminiModel}:generateContent`,
      { 'x-goog-api-key': apiKey },
      {
        contents: [{ parts: [
          { inline_data: { mime_type: mimeType, data: base64 } },
          { text: userMessage },
        ] }],
        systemInstruction: { parts: [{ text: systemPrompt }] },
        generationConfig: { maxOutputTokens: 4096 },
      },
    );
    const parts = data.candidates?.[0]?.content?.parts;
    // Skip parts flagged as model "thought" so only the answer text is returned.
    return joinText(Array.isArray(parts) ? parts.filter((part) => part?.thought !== true) : []);
  }

  throw new Error(`Vision not supported for provider: ${provider}. Use Claude, GPT-4, or Gemini.`);
}
367
+
278
368
  /**
279
369
  * Call an LLM provider with streaming enabled.
280
370
  * Calls onToken(chunk) for each token, returns full text at the end.