npm - nothumanallowed - Versions diffs - 9.8.0 → 9.8.2 - Mend

nothumanallowed 9.8.0 → 9.8.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "nothumanallowed",
-  "version": "9.8.0",
+  "version": "9.8.2",
   "description": "NotHumanAllowed — 38 AI agents, 53 tools. Email, calendar, browser automation, screen capture, canvas, cron/heartbeat, GitHub, Notion, Slack, voice chat, 28 languages. Zero-dependency CLI.",
   "type": "module",
   "bin": {

package/src/commands/chat.mjs CHANGED Viewed

@@ -718,12 +718,31 @@ export async function cmdChat(args) {
           const result = await executeTool(action, params, config);
           process.stdout.write('\r' + ' '.repeat(80) + '\r');
-          // Show action-specific result header
-          const resultHeader = formatToolResult(action, params, result);
-          console.log(`  ${resultHeader}`);
-          console.log(`  ${result.split('\n').join('\n  ')}\n`);
-          addMessages(conv, input, response + `\n\n[Tool ${action} executed. Result: ${result}]`);
+          // Handle screen capture vision result
+          if (result && typeof result === 'object' && result.__screenshot) {
+            console.log(`  ${G}Screenshot captured${NC} — analyzing with vision...\n`);
+            try {
+              const { callLLMVision } = await import('../services/llm.mjs');
+              const visionResponse = await callLLMVision(config,
+                'Describe EXACTLY and ONLY what you see in this screenshot. NEVER invent or fabricate details.',
+                `The user said: "${input}"\n\n${result.question}`,
+                { base64: result.base64, mimeType: 'image/png' }
+              );
+              console.log(`  ${visionResponse.split('\n').join('\n  ')}\n`);
+              addMessages(conv, input, response + `\n\n[Screenshot: ${result.path}]\n${visionResponse}`);
+            } catch (visionErr) {
+              console.log(`  ${R}Vision failed: ${visionErr.message}${NC}\n`);
+              addMessages(conv, input, response + `\n\n[Screenshot captured but vision failed: ${visionErr.message}]`);
+            }
+          } else {
+            // Show action-specific result header
+            const resultStr = typeof result === 'string' ? result : JSON.stringify(result);
+            const resultHeader = formatToolResult(action, params, resultStr);
+            console.log(`  ${resultHeader}`);
+            console.log(`  ${resultStr.split('\n').join('\n  ')}\n`);
+            addMessages(conv, input, response + `\n\n[Tool ${action} executed. Result: ${resultStr}]`);
+          }
         } catch (err) {
           process.stdout.write('\r' + ' '.repeat(80) + '\r');
           console.log(`  ${R}Error executing ${action}: ${err.message}${NC}\n`);

package/src/commands/ui.mjs CHANGED Viewed

@@ -1053,18 +1053,49 @@ export async function cmdUI(args) {
           // Execute ALL tool actions and collect results
           const toolResults = [];
+          let screenshotData = null; // For vision: { base64, path, question }
           for (const { action, params } of actions) {
             try {
               const result = await executeTool(action, params, config);
-              toolResults.push({ action, result: typeof result === 'object' ? JSON.stringify(result) : String(result) });
+              // Intercept structured screenshot result for vision flow
+              if (result && typeof result === 'object' && result.__screenshot) {
+                screenshotData = result;
+                toolResults.push({ action, result: 'Screenshot captured. Analyzing with vision...' });
+              } else {
+                toolResults.push({ action, result: typeof result === 'object' ? JSON.stringify(result) : String(result) });
+              }
             } catch (e) {
               toolResults.push({ action, result: `Error: ${e.message}` });
             }
           }
           let fullResponse;
-          if (toolResults.length > 0) {
-            // Second LLM call with real tool results — forces the LLM to use actual data
+          if (screenshotData && screenshotData.base64) {
+            // VISION FLOW: send screenshot to LLM as image via callLLMVision
+            try {
+              const { callLLMVision } = await import('../services/llm.mjs');
+              const visionPrompt = enrichedSystemPrompt + '\n\nIMPORTANT: You are looking at a REAL screenshot from the user\'s screen. Describe ONLY what you ACTUALLY see. NEVER invent, guess, or fabricate details. If something is unclear, say so.';
+              const question = `The user said: "${body.message}"\n\n${screenshotData.question}\n\nDescribe ONLY what you see. NEVER make up information.`;
+              fullResponse = await callLLMVision(config, visionPrompt, question, {
+                base64: screenshotData.base64,
+                mimeType: 'image/png',
+              });
+            } catch (visionErr) {
+              fullResponse = `I captured a screenshot but vision analysis failed: ${visionErr.message}. To use screen analysis, configure a vision-capable provider (Claude, GPT-4, Gemini).`;
+            }
+            // Prepend screenshot file marker for the UI to display inline
+            if (screenshotData.path) {
+              const fname = screenshotData.path.split('/').pop();
+              // Copy to NHA screenshots dir for persistence
+              try {
+                const ssDir = path.join(NHA_DIR, 'screenshots');
+                fs.mkdirSync(ssDir, { recursive: true });
+                fs.copyFileSync(screenshotData.path, path.join(ssDir, fname));
+              } catch {}
+              fullResponse = `![Screenshot](/api/screenshots/${fname})\n\n${fullResponse}`;
+            }
+          } else if (toolResults.length > 0) {
+            // Standard tool results flow
             const toolContext = toolResults.map(t => {
               let clean = t.result.replace(/\[Screenshot[^\]]*\]/g, '').replace(/!\[.*?\]\(data:image[^)]+\)/g, '').slice(0, 3000);
               return `[${t.action} result]: ${clean.trim()}`;
@@ -1073,7 +1104,6 @@ export async function cmdUI(args) {
             try {
               fullResponse = await callLLM(config, enrichedSystemPrompt, followUp);
             } catch {
-              // Fallback: show raw results
               fullResponse = toolResults.map(t => `${t.action}: ${t.result}`).join('\n\n');
             }
           } else {
@@ -1325,6 +1355,40 @@ export async function cmdUI(args) {
               }
               const result = await executeTool(action, params, config);
+              // Intercept screen capture vision result
+              if (result && typeof result === 'object' && result.__screenshot) {
+                sendSSE('tool', { action, status: 'analyzing_screen' });
+                try {
+                  const { callLLMVision } = await import('../services/llm.mjs');
+                  const visionPrompt = enrichedPrompt + '\\n\\nIMPORTANT: You are looking at a REAL screenshot. Describe ONLY what you ACTUALLY see. NEVER invent or fabricate.';
+                  const question = `The user said: "${msg}"\\n\\n${result.question}`;
+                  const visionResponse = await callLLMVision(config, visionPrompt, question, { base64: result.base64, mimeType: 'image/png' });
+                  // Save screenshot for display
+                  const fname = result.path.split('/').pop();
+                  const ssDir = path.join(NHA_DIR, 'screenshots');
+                  fs.mkdirSync(ssDir, { recursive: true });
+                  try { fs.copyFileSync(result.path, path.join(ssDir, fname)); } catch {}
+                  // Send screenshot to client
+                  sendSSE('screenshot', { filename: fname, format: 'png' });
+                  if (!res._screenshotFiles) res._screenshotFiles = [];
+                  res._screenshotFiles.push(fname);
+                  // Send vision analysis as tokens
+                  sendSSE('tool_synthesis', {});
+                  sendSSE('token', { content: visionResponse });
+                  toolResults.push({ action, result: `[Screen analyzed] ${visionResponse.slice(0, 500)}` });
+                  sendSSE('tool', { action, status: 'done', result: 'Screen analyzed' });
+                } catch (visionErr) {
+                  toolResults.push({ action, result: `Vision failed: ${visionErr.message}` });
+                  sendSSE('tool', { action, status: 'error', error: visionErr.message });
+                }
+                continue;
+              }
               const resultStr = typeof result === 'object' ? JSON.stringify(result) : String(result);
               toolResults.push({ action, result: resultStr });
               sendSSE('tool', { action, status: 'done', result: typeof resultStr === 'string' ? resultStr.slice(0, 500) : '' });

package/src/constants.mjs CHANGED Viewed

@@ -5,7 +5,7 @@ import { fileURLToPath } from 'url';
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
-export const VERSION = '9.8.0';
+export const VERSION = '9.8.2';
 export const BASE_URL = 'https://nothumanallowed.com/cli';
 export const API_BASE = 'https://nothumanallowed.com/api/v1';

package/src/services/llm.mjs CHANGED Viewed

@@ -276,105 +276,88 @@ export async function callLLM(config, systemPrompt, userMessage, opts = {}) {
 }
 /**
- * Call LLM with multimodal (vision) messages — supports image content.
- * Uses the provider's native vision format.
+ * Call LLM with multimodal (vision) messages — supports image + PDF content.
+ * Unified function for ALL vision calls (web UI, CLI, screen capture).
  * @param {object} config
- * @param {Array} messages - Array of { role, content } where content can be string or array of content blocks
+ * @param {string} systemPrompt
+ * @param {string} userMessage — text question about the image
+ * @param {object} media — { base64, mimeType } (image/png, image/jpeg, application/pdf)
  * @returns {Promise<string>}
  */
-export async function callLLMVision(config, messages) {
+export async function callLLMVision(config, systemPrompt, userMessage, media) {
   const provider = config.llm.provider || 'anthropic';
   const model = config.llm.model || null;
   const apiKey = getApiKey(config, provider);
   if (!apiKey) throw new Error(`No API key for ${provider}. Vision requires Claude, GPT-4, or Gemini.`);
+  const { base64, mimeType } = media;
+  if (!base64 || !mimeType) throw new Error('media.base64 and media.mimeType are required');
   if (provider === 'anthropic') {
-    // Anthropic format: system separate, messages with content blocks
-    const systemMsg = messages.find(m => m.role === 'system');
-    const userMsgs = messages.filter(m => m.role !== 'system');
-    // Convert OpenAI-style image_url to Anthropic format
-    const anthropicMessages = userMsgs.map(m => {
-      if (typeof m.content === 'string') return m;
-      const blocks = m.content.map(block => {
-        if (block.type === 'text') return block;
-        if (block.type === 'image_url') {
-          const url = block.image_url.url;
-          const match = url.match(/^data:image\/(png|jpeg|gif|webp);base64,(.+)$/);
-          if (match) {
-            return { type: 'image', source: { type: 'base64', media_type: `image/${match[1]}`, data: match[2] } };
-          }
-        }
-        return block;
-      });
-      return { role: m.role, content: blocks };
-    });
+    const isPdf = mimeType === 'application/pdf';
+    const contentBlock = isPdf
+      ? { type: 'document', source: { type: 'base64', media_type: mimeType, data: base64 } }
+      : { type: 'image', source: { type: 'base64', media_type: mimeType, data: base64 } };
     const body = {
       model: model || 'claude-sonnet-4-20250514',
       max_tokens: 4096,
-      system: systemMsg?.content || '',
-      messages: anthropicMessages,
+      system: systemPrompt,
+      messages: [{ role: 'user', content: [contentBlock, { type: 'text', text: userMessage }] }],
     };
     const res = await fetch('https://api.anthropic.com/v1/messages', {
       method: 'POST',
       headers: { 'Content-Type': 'application/json', 'x-api-key': apiKey, 'anthropic-version': '2023-06-01' },
       body: JSON.stringify(body),
     });
-    if (!res.ok) throw new Error(`Anthropic vision ${res.status}: ${await res.text()}`);
+    if (!res.ok) throw new Error(`Anthropic vision ${res.status}: ${(await res.text()).slice(0, 300)}`);
     const data = await res.json();
     return data.content?.[0]?.text || '';
   }
   if (provider === 'openai' || provider === 'deepseek' || provider === 'grok' || provider === 'mistral') {
-    // OpenAI-compatible format — works with GPT-4V, DeepSeek VL, etc.
     const url = provider === 'openai' ? 'https://api.openai.com/v1/chat/completions'
-              : provider === 'deepseek' ? 'https://api.deepseek.com/chat/completions'
+              : provider === 'deepseek' ? 'https://api.deepseek.com/v1/chat/completions'
               : provider === 'grok' ? 'https://api.x.ai/v1/chat/completions'
               : 'https://api.mistral.ai/v1/chat/completions';
-    const visionModel = model || (provider === 'openai' ? 'gpt-4o' : model);
-    const body = { model: visionModel, max_tokens: 4096, messages };
+    const body = {
+      model: model || (provider === 'openai' ? 'gpt-4o' : model),
+      max_tokens: 4096,
+      messages: [
+        { role: 'system', content: systemPrompt },
+        { role: 'user', content: [
+          { type: 'image_url', image_url: { url: `data:${mimeType};base64,${base64}` } },
+          { type: 'text', text: userMessage },
+        ] },
+      ],
+    };
     const res = await fetch(url, {
       method: 'POST',
       headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${apiKey}` },
       body: JSON.stringify(body),
     });
-    if (!res.ok) throw new Error(`${provider} vision ${res.status}: ${await res.text()}`);
+    if (!res.ok) throw new Error(`${provider} vision ${res.status}: ${(await res.text()).slice(0, 300)}`);
     const data = await res.json();
     return data.choices?.[0]?.message?.content || '';
   }
   if (provider === 'gemini') {
-    // Gemini format — inline_data with base64
-    const systemMsg = messages.find(m => m.role === 'system');
-    const userMsgs = messages.filter(m => m.role !== 'system');
-    const parts = [];
-    for (const msg of userMsgs) {
-      if (typeof msg.content === 'string') {
-        parts.push({ text: msg.content });
-      } else {
-        for (const block of msg.content) {
-          if (block.type === 'text') parts.push({ text: block.text });
-          if (block.type === 'image_url') {
-            const match = block.image_url.url.match(/^data:image\/(.*?);base64,(.+)$/);
-            if (match) parts.push({ inline_data: { mime_type: `image/${match[1]}`, data: match[2] } });
-          }
-        }
-      }
-    }
     const geminiModel = model || 'gemini-2.0-flash';
     const body = {
-      contents: [{ parts }],
-      systemInstruction: systemMsg ? { parts: [{ text: systemMsg.content }] } : undefined,
+      contents: [{ parts: [
+        { inline_data: { mime_type: mimeType, data: base64 } },
+        { text: userMessage },
+      ] }],
+      systemInstruction: { parts: [{ text: systemPrompt }] },
+      generationConfig: { maxOutputTokens: 4096 },
     };
     const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${geminiModel}:generateContent?key=${apiKey}`, {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
       body: JSON.stringify(body),
     });
-    if (!res.ok) throw new Error(`Gemini vision ${res.status}: ${await res.text()}`);
+    if (!res.ok) throw new Error(`Gemini vision ${res.status}: ${(await res.text()).slice(0, 300)}`);
     const data = await res.json();
     return data.candidates?.[0]?.content?.parts?.[0]?.text || '';
   }
@@ -382,6 +365,144 @@ export async function callLLMVision(config, messages) {
   throw new Error(`Vision not supported for provider: ${provider}. Use Claude, GPT-4, or Gemini.`);
 }
+/**
+ * Call an LLM provider with streaming enabled.
+ * Calls onToken(chunk) for each token, returns full text at the end.
+ * @returns {Promise<string>} The full LLM response text.
+ */
+export async function callLLMStream(config, systemPrompt, userMessage, onToken, opts = {}) {
+  const provider = opts.provider || config.llm.provider || 'anthropic';
+  const model = opts.model || config.llm.model || null;
+  const apiKey = getApiKey(config, provider);
+  if (!apiKey) throw new Error(`No API key for ${provider}`);
+  const callFn = getProviderCall(provider);
+  if (!callFn) throw new Error(`Unknown provider: ${provider}`);
+  // Gemini and Cohere don't support streaming — fall back to non-streaming
+  if (provider === 'gemini' || provider === 'cohere') {
+    const text = await callFn(apiKey, model, systemPrompt, userMessage, false);
+    if (onToken) onToken(text);
+    return text;
+  }
+  const format = provider === 'anthropic' ? 'anthropic' : 'openai';
+  const body = buildRequestBody(provider, model, systemPrompt, userMessage, true);
+  const url = getProviderUrl(provider, model, apiKey);
+  const headers = getProviderHeaders(provider, apiKey);
+  const res = await fetch(url, {
+    method: 'POST',
+    headers,
+    body: JSON.stringify(body),
+  });
+  if (!res.ok) {
+    const err = await res.text();
+    throw new Error(`${provider} ${res.status}: ${err}`);
+  }
+  return streamSSEWithCallback(res, format, onToken);
+}
+/** Build request body for a provider */
+function buildRequestBody(provider, model, systemPrompt, userMessage, stream) {
+  if (provider === 'anthropic') {
+    return {
+      model: model || 'claude-sonnet-4-20250514',
+      max_tokens: 8192,
+      system: systemPrompt,
+      messages: [{ role: 'user', content: userMessage }],
+      stream,
+    };
+  }
+  // OpenAI-compatible format (OpenAI, DeepSeek, Grok, Mistral)
+  const modelDefaults = {
+    openai: 'gpt-4o',
+    deepseek: 'deepseek-chat',
+    grok: 'grok-3-latest',
+    mistral: 'mistral-large-latest',
+  };
+  return {
+    model: model || modelDefaults[provider] || 'gpt-4o',
+    max_tokens: 8192,
+    messages: [
+      { role: 'system', content: systemPrompt },
+      { role: 'user', content: userMessage },
+    ],
+    stream,
+  };
+}
+/** Get provider API URL */
+function getProviderUrl(provider, model, apiKey) {
+  const urls = {
+    anthropic: 'https://api.anthropic.com/v1/messages',
+    openai: 'https://api.openai.com/v1/chat/completions',
+    deepseek: 'https://api.deepseek.com/v1/chat/completions',
+    grok: 'https://api.x.ai/v1/chat/completions',
+    mistral: 'https://api.mistral.ai/v1/chat/completions',
+  };
+  return urls[provider] || urls.openai;
+}
+/** Get provider request headers */
+function getProviderHeaders(provider, apiKey) {
+  if (provider === 'anthropic') {
+    return {
+      'Content-Type': 'application/json',
+      'x-api-key': apiKey,
+      'anthropic-version': '2023-06-01',
+    };
+  }
+  return {
+    'Content-Type': 'application/json',
+    'Authorization': `Bearer ${apiKey}`,
+  };
+}
+/** SSE stream parser with onToken callback (does NOT write to stdout directly) */
+async function streamSSEWithCallback(res, format, onToken) {
+  const reader = res.body.getReader();
+  const decoder = new TextDecoder();
+  let buffer = '';
+  let fullText = '';
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+    buffer += decoder.decode(value, { stream: true });
+    const lines = buffer.split('\n');
+    buffer = lines.pop() || '';
+    for (const line of lines) {
+      if (!line.startsWith('data: ')) continue;
+      const data = line.slice(6).trim();
+      if (data === '[DONE]') continue;
+      try {
+        const json = JSON.parse(data);
+        let chunk = '';
+        if (format === 'anthropic') {
+          if (json.type === 'content_block_delta') {
+            chunk = json.delta?.text || '';
+          }
+        } else {
+          chunk = json.choices?.[0]?.delta?.content || '';
+        }
+        if (chunk) {
+          fullText += chunk;
+          if (onToken) onToken(chunk);
+        }
+      } catch {}
+    }
+  }
+  return fullText;
+}
 /**
  * Call an agent by name — loads the agent file, calls LLM, returns response.
  * No streaming. Used by PAO pipeline for batch agent calls.