nothumanallowed 9.6.0 → 9.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nothumanallowed",
3
- "version": "9.6.0",
3
+ "version": "9.7.0",
4
4
  "description": "NotHumanAllowed — 38 AI agents, 53 tools. Email, calendar, browser automation, screen capture, canvas, cron/heartbeat, GitHub, Notion, Slack, voice chat, 28 languages. Zero-dependency CLI.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -15,7 +15,7 @@ import fs from 'fs';
15
15
  import path from 'path';
16
16
  import { loadConfig } from '../config.mjs';
17
17
  import { detectMailProvider, hasMailProvider, getProviderStatus } from '../services/mail-router.mjs';
18
- import { callLLM, callAgent, parseAgentFile } from '../services/llm.mjs';
18
+ import { callLLM, callLLMVision, callAgent, parseAgentFile } from '../services/llm.mjs';
19
19
  import { getUnreadImportant, getMessage, listMessages, sendEmail, createDraft } from '../services/mail-router.mjs';
20
20
  import { getTodayEvents, getUpcomingEvents, createEvent, updateEvent, getEventsForDate } from '../services/mail-router.mjs';
21
21
  import {
@@ -158,7 +158,12 @@ export async function cmdUI(args) {
158
158
  const UI_PERSONA = `You are NHA Chat, a personal operations assistant inside the NotHumanAllowed web UI. ` +
159
159
  `You help the user manage their emails, calendar, tasks, GitHub issues, Notion pages, and Slack channels through natural conversation. ` +
160
160
  `Be concise, helpful, and proactive. When presenting data, format it clearly. ` +
161
- `Never output raw JSON to the user.`;
161
+ `Never output raw JSON to the user.\n\n` +
162
+ `ABSOLUTE RULE — NEVER LIE: You MUST ALWAYS tell the truth. NEVER fabricate, invent, or guess information. ` +
163
+ `If you don't know something, say "I don't know." If a tool fails, say it failed. If you cannot see something, say you cannot see it. ` +
164
+ `If you receive a screenshot but cannot analyze it (no vision support), say so honestly. ` +
165
+ `NEVER describe things you haven't actually seen or data you haven't actually received. ` +
166
+ `Honesty is MORE important than being helpful. A truthful "I don't know" is ALWAYS better than a fabricated answer.`;
162
167
  const chatSystemPrompt = buildSystemPrompt('NHA UI', UI_PERSONA, config);
163
168
 
164
169
  // ── Route Handlers ──────────────────────────────────────────────────────
@@ -675,24 +680,49 @@ export async function cmdUI(args) {
675
680
 
676
681
  // Execute ALL tool actions and collect results
677
682
  const toolResults = [];
683
+ let screenshotData = null; // For vision: { base64, path, question }
684
+ let screenshotFiles = []; // For displaying inline
678
685
  for (const { action, params } of actions) {
679
686
  try {
680
687
  const result = await executeTool(action, params, config);
681
- toolResults.push({ action, result: typeof result === 'object' ? JSON.stringify(result) : String(result) });
688
+ // Check if result is a structured screenshot object
689
+ if (result && typeof result === 'object' && result.__screenshot) {
690
+ screenshotData = result;
691
+ screenshotFiles.push(result.path);
692
+ toolResults.push({ action, result: 'Screenshot captured. Analyzing with vision...' });
693
+ } else {
694
+ toolResults.push({ action, result: typeof result === 'object' ? JSON.stringify(result) : String(result) });
695
+ }
682
696
  } catch (e) {
683
697
  toolResults.push({ action, result: `Error: ${e.message}` });
684
698
  }
685
699
  }
686
700
 
687
701
  let fullResponse;
688
- if (toolResults.length > 0) {
689
- // Second LLM call with real tool results — forces the LLM to use actual data
702
+ if (screenshotData && screenshotData.base64) {
703
+ // VISION FLOW: send screenshot image to LLM as multimodal content
704
+ try {
705
+ const visionMessages = [
706
+ { role: 'system', content: enrichedSystemPrompt + '\n\nIMPORTANT: You are looking at a REAL screenshot from the user\'s screen. Describe ONLY what you ACTUALLY see. NEVER invent, guess, or fabricate details. If something is unclear, say so. Be specific about windows, text, UI elements you can identify.' },
707
+ { role: 'user', content: [
708
+ { type: 'image_url', image_url: { url: `data:image/png;base64,${screenshotData.base64}` } },
709
+ { type: 'text', text: `The user said: "${body.message}"\n\n${screenshotData.question}\n\nDescribe ONLY what you see. NEVER make up information.` },
710
+ ] },
711
+ ];
712
+ fullResponse = await callLLMVision(config, visionMessages);
713
+ } catch (visionErr) {
714
+ // Fallback: try regular call explaining we can't do vision
715
+ fullResponse = `I captured a screenshot but your current LLM provider doesn't support vision/image analysis. The screenshot is saved at: ${screenshotData.path}\n\nTo use screen analysis, configure a vision-capable provider (Claude, GPT-4, Gemini).`;
716
+ }
717
+ // Prepend screenshot file marker for the UI to display
718
+ fullResponse = `[SCREENSHOT_FILE]${screenshotData.path}[/SCREENSHOT_FILE]\n${fullResponse}`;
719
+ } else if (toolResults.length > 0) {
720
+ // Standard tool results flow
690
721
  const toolContext = toolResults.map(t => `[${t.action} result]: ${t.result}`).join('\n\n');
691
722
  const followUp = `The user asked: "${body.message}"\n\nI executed these tools and got REAL results:\n\n${toolContext}\n\nNow respond to the user based ONLY on the REAL data above. Do NOT invent or fabricate any information. Present the actual results clearly.`;
692
723
  try {
693
724
  fullResponse = await callLLM(config, enrichedSystemPrompt, followUp);
694
725
  } catch {
695
- // Fallback: show raw results
696
726
  fullResponse = toolResults.map(t => `${t.action}: ${t.result}`).join('\n\n');
697
727
  }
698
728
  } else {
@@ -708,7 +738,7 @@ export async function cmdUI(args) {
708
738
  } catch { /* non-critical */ }
709
739
  try { extractMemory('chat', body.message, fullResponse); } catch { /* non-critical */ }
710
740
 
711
- sendJSON(res, 200, { response: fullResponse, toolResults, actions });
741
+ sendJSON(res, 200, { response: fullResponse, toolResults, actions, screenshotFiles });
712
742
  } catch (e) {
713
743
  sendJSON(res, 200, { response: null, error: e.message });
714
744
  }
package/src/constants.mjs CHANGED
@@ -5,7 +5,7 @@ import { fileURLToPath } from 'url';
5
5
  const __filename = fileURLToPath(import.meta.url);
6
6
  const __dirname = path.dirname(__filename);
7
7
 
8
- export const VERSION = '9.6.0';
8
+ export const VERSION = '9.7.0';
9
9
  export const BASE_URL = 'https://nothumanallowed.com/cli';
10
10
  export const API_BASE = 'https://nothumanallowed.com/api/v1';
11
11
 
@@ -275,6 +275,113 @@ export async function callLLM(config, systemPrompt, userMessage, opts = {}) {
275
275
  return callFn(apiKey, model, systemPrompt, userMessage, false);
276
276
  }
277
277
 
/**
 * Call the configured LLM with multimodal (vision) messages and return its text reply.
 *
 * Messages use the OpenAI chat shape: `content` is either a string or an array of
 * blocks (`{ type: 'text', text }` or `{ type: 'image_url', image_url: { url } }`).
 * They are forwarded unchanged to OpenAI-compatible providers and converted to the
 * native request shape for Anthropic and Gemini.
 *
 * @param {object} config - Reads `config.llm.provider` (defaults to 'anthropic') and `config.llm.model`.
 * @param {Array<{role: string, content: (string|Array<object>)}>} messages - Chat messages; a `system` role entry is optional.
 * @returns {Promise<string>} Concatenated text of the reply, or '' when the reply carries no text.
 * @throws {Error} When no API key is available, the provider has no vision branch,
 *   no model is configured for a provider without a built-in default, or the HTTP call fails.
 */
export async function callLLMVision(config, messages) {
  const provider = config.llm.provider || 'anthropic';
  const model = config.llm.model || null;
  const apiKey = getApiKey(config, provider);
  if (!apiKey) throw new Error(`No API key for ${provider}. Vision requires Claude, GPT-4, or Gemini.`);

  if (provider === 'anthropic') return visionViaAnthropic(apiKey, model, messages);
  if (VISION_CHAT_COMPLETIONS_URLS.has(provider)) {
    return visionViaOpenAICompatible(provider, apiKey, model, messages);
  }
  if (provider === 'gemini') return visionViaGemini(apiKey, model, messages);

  throw new Error(`Vision not supported for provider: ${provider}. Use Claude, GPT-4, or Gemini.`);
}

/** Chat-completions endpoints of the providers that accept OpenAI-style vision messages. */
const VISION_CHAT_COMPLETIONS_URLS = new Map([
  ['openai', 'https://api.openai.com/v1/chat/completions'],
  ['deepseek', 'https://api.deepseek.com/chat/completions'],
  ['grok', 'https://api.x.ai/v1/chat/completions'],
  ['mistral', 'https://api.mistral.ai/v1/chat/completions'],
]);

/** POST a JSON body and return the parsed JSON reply; throws `<label> <status>: <body>` on a non-2xx status. */
async function visionPostJSON(label, url, headers, body) {
  const res = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', ...headers },
    body: JSON.stringify(body),
  });
  if (!res.ok) throw new Error(`${label} ${res.status}: ${await res.text()}`);
  return res.json();
}

/** Separate the first `system` message from the remaining conversation messages. */
function visionSplitSystem(messages) {
  return {
    systemMsg: messages.find((m) => m.role === 'system'),
    otherMsgs: messages.filter((m) => m.role !== 'system'),
  };
}

/** Concatenate the `text` of every block/part that has one, so multi-part replies are not truncated. */
function visionJoinText(blocks) {
  if (!Array.isArray(blocks)) return '';
  return blocks.map((b) => (typeof b?.text === 'string' ? b.text : '')).join('');
}

/** Convert one OpenAI-style content block to Anthropic's shape; blocks that need no conversion pass through. */
function visionToAnthropicBlock(block) {
  if (block.type !== 'image_url') return block;
  const match = block.image_url.url.match(/^data:image\/(png|jpeg|gif|webp);base64,(.+)$/);
  if (!match) return block;
  return { type: 'image', source: { type: 'base64', media_type: `image/${match[1]}`, data: match[2] } };
}

/** Anthropic Messages API: the system prompt is a top-level field and images are base64 `image` blocks. */
async function visionViaAnthropic(apiKey, model, messages) {
  const { systemMsg, otherMsgs } = visionSplitSystem(messages);
  const body = {
    model: model || 'claude-sonnet-4-20250514',
    max_tokens: 4096,
    messages: otherMsgs.map((m) =>
      typeof m.content === 'string' ? m : { role: m.role, content: m.content.map(visionToAnthropicBlock) },
    ),
  };
  // Only send `system` when there is one, instead of an empty string.
  if (systemMsg?.content) body.system = systemMsg.content;

  const data = await visionPostJSON(
    'Anthropic vision',
    'https://api.anthropic.com/v1/messages',
    { 'x-api-key': apiKey, 'anthropic-version': '2023-06-01' },
    body,
  );
  return visionJoinText(data.content);
}

/** OpenAI-compatible chat completions: the messages are forwarded as given. */
async function visionViaOpenAICompatible(provider, apiKey, model, messages) {
  // Only OpenAI has a built-in default here; for the other providers a missing
  // model used to be sent as `model: null`, so fail early with a clear message.
  const visionModel = model || (provider === 'openai' ? 'gpt-4o' : null);
  if (!visionModel) {
    throw new Error(`No vision model configured for ${provider}. Set llm.model to a vision-capable model.`);
  }
  const data = await visionPostJSON(
    `${provider} vision`,
    VISION_CHAT_COMPLETIONS_URLS.get(provider),
    { Authorization: `Bearer ${apiKey}` },
    { model: visionModel, max_tokens: 4096, messages },
  );
  return data.choices?.[0]?.message?.content || '';
}

/** Gemini generateContent: all non-system messages are flattened into one list of parts. */
async function visionViaGemini(apiKey, model, messages) {
  const { systemMsg, otherMsgs } = visionSplitSystem(messages);
  const parts = [];
  for (const msg of otherMsgs) {
    if (typeof msg.content === 'string') {
      parts.push({ text: msg.content });
      continue;
    }
    for (const block of msg.content) {
      if (block.type === 'text') parts.push({ text: block.text });
      if (block.type === 'image_url') {
        const match = block.image_url.url.match(/^data:image\/(.*?);base64,(.+)$/);
        if (match) parts.push({ inline_data: { mime_type: `image/${match[1]}`, data: match[2] } });
      }
    }
  }

  const geminiModel = model || 'gemini-2.0-flash';
  const body = { contents: [{ role: 'user', parts }] };
  if (systemMsg) body.systemInstruction = { parts: [{ text: systemMsg.content }] };

  // The key travels in a header rather than the query string so it does not
  // end up in URLs that proxies or error reports may log.
  const data = await visionPostJSON(
    'Gemini vision',
    `https://generativelanguage.googleapis.com/v1beta/models/${geminiModel}:generateContent`,
    { 'x-goog-api-key': apiKey },
    body,
  );
  return visionJoinText(data.candidates?.[0]?.content?.parts);
}
384
+
278
385
  /**
279
386
  * Call an agent by name — loads the agent file, calls LLM, returns response.
280
387
  * No streaming. Used by PAO pipeline for batch agent calls.
@@ -1060,28 +1060,21 @@ export async function executeTool(action, params, config) {
1060
1060
  }
1061
1061
 
1062
1062
  // ── Screen Capture + Vision ────────────────────────────────────────
1063
- case 'screen_capture': {
1064
- const { captureScreen } = await import('./screen-capture.mjs');
1065
- const result = captureScreen({ monitor: params.monitor || 1 });
1066
- if (!result.ok) return `Screen capture failed: ${result.error}`;
1067
-
1068
- // Save screenshot to file for the UI to display
1069
- const screenshotPath = result.path;
1070
-
1071
- // Return a compact description + mark that we have an image
1072
- // The calling layer (chat/ui) will handle displaying the image
1073
- return `[SCREENSHOT]${screenshotPath}[/SCREENSHOT]\nScreenshot captured successfully. I can see your screen. What would you like me to analyze?`;
1074
- }
1075
-
1063
+ case 'screen_capture':
1076
1064
  case 'screen_analyze': {
1077
1065
  const { captureScreen } = await import('./screen-capture.mjs');
1078
1066
  const result = captureScreen({ monitor: params.monitor || 1 });
1079
1067
  if (!result.ok) return `Screen capture failed: ${result.error}`;
1080
1068
 
1081
- // For vision analysis, we need to send the image to the LLM
1082
- // Return the base64 with a marker so the chat layer sends it as a vision message
1083
- const question = params.question || 'Describe what you see on screen.';
1084
- return `[VISION_REQUEST]${result.base64}[/VISION_REQUEST]\n[VISION_QUESTION]${question}[/VISION_QUESTION]`;
1069
+ // Return structured result the chat layer intercepts __screenshot to send as vision
1070
+ const question = params.question || 'Describe EXACTLY and ONLY what you see in this screenshot. Do NOT invent or guess anything.';
1071
+ return {
1072
+ __screenshot: true,
1073
+ path: result.path,
1074
+ base64: result.base64,
1075
+ question,
1076
+ displayText: `[SCREENSHOT_FILE]${result.path}[/SCREENSHOT_FILE]`,
1077
+ };
1085
1078
  }
1086
1079
 
1087
1080
  // ── Canvas ───────────────────────────────────────────────────────────
@@ -330,30 +330,78 @@ function renderMessages(){
330
330
  el.innerHTML='<div class="chat__empty"><div class="chat__empty-title">NHA Chat</div><div>Personal Operations Assistant</div><div class="chat__empty-hint">Try: Show my unread emails / What is on my calendar? / Add a task</div></div>';
331
331
  return;
332
332
  }
333
- var h='';chatHistory.forEach(function(m){
333
+ var h='';chatHistory.forEach(function(m,idx){
334
334
  var content = m.content || '';
335
- // Handle canvas render markers
336
- var canvasMatch = content.match(/\\[CANVAS_RENDER\\](.*?)\\[\\/CANVAS_RENDER\\]/s);
337
- if (canvasMatch) {
338
- try { var cd = JSON.parse(canvasMatch[1]); showCanvas(cd.html, cd.title); } catch(e){}
339
- content = content.replace(/\\[CANVAS_RENDER\\].*?\\[\\/CANVAS_RENDER\\]/s, '').trim();
340
- }
341
- if (content.indexOf('[CANVAS_CLEAR]') !== -1) {
342
- closeCanvas();
343
- content = content.replace(/\\[CANVAS_CLEAR\\].*?\\[\\/CANVAS_CLEAR\\]/s, '').trim();
335
+ var isAssistant = m.role === 'assistant';
336
+ var extraHtml = '';
337
+
338
+ if (isAssistant) {
339
+ // Handle canvas render markers
340
+ var canvasMatch = content.match(/\[CANVAS_RENDER\]([\s\S]*?)\[\/CANVAS_RENDER\]/);
341
+ if (canvasMatch) {
342
+ try { var cd = JSON.parse(canvasMatch[1]); showCanvas(cd.html, cd.title); } catch(e){}
343
+ content = content.replace(/\[CANVAS_RENDER\][\s\S]*?\[\/CANVAS_RENDER\]/, '').trim();
344
+ }
345
+ if (content.indexOf('[CANVAS_CLEAR]') !== -1) {
346
+ closeCanvas();
347
+ content = content.replace(/\[CANVAS_CLEAR\][\s\S]*?\[\/CANVAS_CLEAR\]/, '').trim();
348
+ }
349
+ // Handle screenshot file markers
350
+ var ssMatch = content.match(/\[SCREENSHOT_FILE\](.*?)\[\/SCREENSHOT_FILE\]/);
351
+ if (ssMatch) {
352
+ var fname = ssMatch[1].split('/').pop();
353
+ extraHtml = '<img src="/api/screenshots/' + encodeURIComponent(fname) + '" style="max-width:100%;border-radius:8px;margin:8px 0;border:1px solid rgba(0,255,65,0.2)" />';
354
+ content = content.replace(/\[SCREENSHOT_FILE\].*?\[\/SCREENSHOT_FILE\]/, '').trim();
355
+ }
344
356
  }
345
- // Handle screenshot markers
346
- var screenshotMatch = content.match(/\\[SCREENSHOT\\](.*?)\\[\\/SCREENSHOT\\]/);
347
- if (screenshotMatch) {
348
- var imgPath = screenshotMatch[1];
349
- content = content.replace(/\\[SCREENSHOT\\].*?\\[\\/SCREENSHOT\\]/, '');
350
- content = '<img src="/api/screenshots/' + encodeURIComponent(imgPath.split('/').pop()) + '" style="max-width:100%;border-radius:8px;margin:8px 0" />' + content;
357
+
358
+ var bubbleContent = isAssistant ? extraHtml + esc(content).replace(/\n/g, '<br>') : esc(content).replace(/\n/g, '<br>');
359
+
360
+ // Action buttons for each message
361
+ var actions = '<div class="msg__actions" style="display:flex;gap:6px;margin-top:4px;opacity:0.3">';
362
+ actions += '<button onclick="copyMessage('+idx+')" style="background:none;border:none;color:var(--dim);cursor:pointer;font-size:10px;font-family:var(--mono)" title="Copy">Copy</button>';
363
+ if (isAssistant) {
364
+ actions += '<button onclick="retryMessage('+idx+')" style="background:none;border:none;color:var(--dim);cursor:pointer;font-size:10px;font-family:var(--mono)" title="Retry">Retry</button>';
365
+ } else {
366
+ actions += '<button onclick="editMessage('+idx+')" style="background:none;border:none;color:var(--dim);cursor:pointer;font-size:10px;font-family:var(--mono)" title="Edit">Edit</button>';
351
367
  }
352
- var bubbleContent = m.role === 'assistant' ? content : esc(content);
353
- h+='<div class="msg msg--'+esc(m.role)+'"><div class="msg__label">'+esc(m.role==='user'?'You':'NHA')+'</div><div class="msg__bubble">'+bubbleContent+'</div></div>';
368
+ actions += '</div>';
369
+
370
+ h+='<div class="msg msg--'+esc(m.role)+'" onmouseenter="this.querySelector(\'.msg__actions\').style.opacity=1" onmouseleave="this.querySelector(\'.msg__actions\').style.opacity=0.3"><div class="msg__label">'+esc(m.role==='user'?'You':'NHA')+'</div><div class="msg__bubble">'+bubbleContent+'</div>'+actions+'</div>';
354
371
  });
355
372
  el.innerHTML=h;el.scrollTop=el.scrollHeight;
356
373
  }
// Copy the visible text of chat message `idx` to the clipboard.
// UI markers (screenshot file, canvas render, canvas clear) are removed first so
// the copied text matches what renderMessages() shows. Does nothing for an
// unknown index; shows a toast on success and on failure.
// NOTE(review): an earlier revision double-escaped these marker patterns;
// confirm the escaping still matches how this script is embedded.
function copyMessage(idx){
  var m=chatHistory[idx];if(!m)return;
  // content may be missing on a message; treat it as empty instead of throwing.
  var text=String(m.content||'')
    .replace(/\[SCREENSHOT_FILE\].*?\[\/SCREENSHOT_FILE\]/g,'')
    .replace(/\[CANVAS_RENDER\][\s\S]*?\[\/CANVAS_RENDER\]/g,'')
    .replace(/\[CANVAS_CLEAR\][\s\S]*?\[\/CANVAS_CLEAR\]/g,'')
    .trim();
  // The async clipboard API is absent outside secure contexts; report it
  // instead of throwing from the click handler.
  if(!navigator.clipboard||typeof navigator.clipboard.writeText!=='function'){
    showToast('copy','Copy failed','Clipboard is not available in this context',3000);
    return;
  }
  navigator.clipboard.writeText(text).then(function(){
    showToast('copy','Copied','Message copied to clipboard',2000);
  }).catch(function(){
    showToast('copy','Copy failed','Could not write to the clipboard',3000);
  });
}
// Retry the assistant reply at `idx`: re-send the user message right before it
// and put the new reply in the same position.
// Does nothing unless `idx` points at an assistant message that directly
// follows a user message.
function retryMessage(idx){
  var target=chatHistory[idx];
  if(idx<1||!target||target.role!=='assistant')return;
  var userMsg=chatHistory[idx-1];
  if(!userMsg||userMsg.role!=='user')return;
  // Drop the old reply and persist before the temporary placeholder is added,
  // so the placeholder itself is never saved by this step.
  chatHistory.splice(idx,1);
  saveChatToStorage();renderMessages();
  var placeholder={role:'assistant',content:'Thinking...'};
  chatHistory.splice(idx,0,placeholder);renderMessages();

  // Swap the placeholder for the final content. It is located by identity so
  // that messages added while the request was in flight are left alone.
  function settle(content){
    var reply={role:'assistant',content:content};
    var at=chatHistory.indexOf(placeholder);
    if(at===-1){chatHistory.push(reply)}else{chatHistory.splice(at,1,reply)}
    saveChatToStorage();renderMessages();
  }

  // History is everything before the placeholder; for a retry of the last
  // reply this is the same payload as before.
  apiPost('/api/chat',{message:userMsg.content,history:chatHistory.slice(0,idx)}).then(function(r){
    if(r&&r.response){settle(r.response)}
    else if(r&&r.error){settle('Error: '+r.error)}
    else{settle('Error: no response from server')}
  },function(e){
    // A rejected request must not leave the placeholder on screen forever.
    settle('Error: '+(e&&e.message?e.message:'request failed'));
  });
}
// Put user message `idx` back into the chat input for editing and truncate the
// conversation from that message onwards.
// Does nothing for an unknown index, a non-user message, or a missing input element.
function editMessage(idx){
  var m=chatHistory[idx];
  if(!m||m.role!=='user')return;
  var inp=document.getElementById('chatInput');if(!inp)return;
  // Missing content becomes an empty input rather than the text "undefined".
  inp.value=m.content||'';
  inp.focus();
  // Remove this message and all subsequent messages
  chatHistory.splice(idx);
  saveChatToStorage();renderMessages();
}
357
405
  function sendChat(){
358
406
  var inp=document.getElementById('chatInput');if(!inp)return;
359
407
  var msg=inp.value.trim();if(!msg)return;