nothumanallowed 9.8.0 → 9.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nothumanallowed",
3
- "version": "9.8.0",
3
+ "version": "9.8.2",
4
4
  "description": "NotHumanAllowed — 38 AI agents, 53 tools. Email, calendar, browser automation, screen capture, canvas, cron/heartbeat, GitHub, Notion, Slack, voice chat, 28 languages. Zero-dependency CLI.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -718,12 +718,31 @@ export async function cmdChat(args) {
718
718
  const result = await executeTool(action, params, config);
719
719
  process.stdout.write('\r' + ' '.repeat(80) + '\r');
720
720
 
721
- // Show action-specific result header
722
- const resultHeader = formatToolResult(action, params, result);
723
- console.log(` ${resultHeader}`);
724
- console.log(` ${result.split('\n').join('\n ')}\n`);
725
-
726
- addMessages(conv, input, response + `\n\n[Tool ${action} executed. Result: ${result}]`);
721
+ // Handle screen capture vision result
722
+ if (result && typeof result === 'object' && result.__screenshot) {
723
+ console.log(` ${G}Screenshot captured${NC} — analyzing with vision...\n`);
724
+ try {
725
+ const { callLLMVision } = await import('../services/llm.mjs');
726
+ const visionResponse = await callLLMVision(config,
727
+ 'Describe EXACTLY and ONLY what you see in this screenshot. NEVER invent or fabricate details.',
728
+ `The user said: "${input}"\n\n${result.question}`,
729
+ { base64: result.base64, mimeType: 'image/png' }
730
+ );
731
+ console.log(` ${visionResponse.split('\n').join('\n ')}\n`);
732
+ addMessages(conv, input, response + `\n\n[Screenshot: ${result.path}]\n${visionResponse}`);
733
+ } catch (visionErr) {
734
+ console.log(` ${R}Vision failed: ${visionErr.message}${NC}\n`);
735
+ addMessages(conv, input, response + `\n\n[Screenshot captured but vision failed: ${visionErr.message}]`);
736
+ }
737
+ } else {
738
+ // Show action-specific result header
739
+ const resultStr = typeof result === 'string' ? result : JSON.stringify(result);
740
+ const resultHeader = formatToolResult(action, params, resultStr);
741
+ console.log(` ${resultHeader}`);
742
+ console.log(` ${resultStr.split('\n').join('\n ')}\n`);
743
+
744
+ addMessages(conv, input, response + `\n\n[Tool ${action} executed. Result: ${resultStr}]`);
745
+ }
727
746
  } catch (err) {
728
747
  process.stdout.write('\r' + ' '.repeat(80) + '\r');
729
748
  console.log(` ${R}Error executing ${action}: ${err.message}${NC}\n`);
@@ -1053,18 +1053,49 @@ export async function cmdUI(args) {
1053
1053
 
1054
1054
  // Execute ALL tool actions and collect results
1055
1055
  const toolResults = [];
1056
+ let screenshotData = null; // For vision: { base64, path, question }
1056
1057
  for (const { action, params } of actions) {
1057
1058
  try {
1058
1059
  const result = await executeTool(action, params, config);
1059
- toolResults.push({ action, result: typeof result === 'object' ? JSON.stringify(result) : String(result) });
1060
+ // Intercept structured screenshot result for vision flow
1061
+ if (result && typeof result === 'object' && result.__screenshot) {
1062
+ screenshotData = result;
1063
+ toolResults.push({ action, result: 'Screenshot captured. Analyzing with vision...' });
1064
+ } else {
1065
+ toolResults.push({ action, result: typeof result === 'object' ? JSON.stringify(result) : String(result) });
1066
+ }
1060
1067
  } catch (e) {
1061
1068
  toolResults.push({ action, result: `Error: ${e.message}` });
1062
1069
  }
1063
1070
  }
1064
1071
 
1065
1072
  let fullResponse;
1066
- if (toolResults.length > 0) {
1067
- // Second LLM call with real tool results — forces the LLM to use actual data
1073
+ if (screenshotData && screenshotData.base64) {
1074
+ // VISION FLOW: send screenshot to LLM as image via callLLMVision
1075
+ try {
1076
+ const { callLLMVision } = await import('../services/llm.mjs');
1077
+ const visionPrompt = enrichedSystemPrompt + '\n\nIMPORTANT: You are looking at a REAL screenshot from the user\'s screen. Describe ONLY what you ACTUALLY see. NEVER invent, guess, or fabricate details. If something is unclear, say so.';
1078
+ const question = `The user said: "${body.message}"\n\n${screenshotData.question}\n\nDescribe ONLY what you see. NEVER make up information.`;
1079
+ fullResponse = await callLLMVision(config, visionPrompt, question, {
1080
+ base64: screenshotData.base64,
1081
+ mimeType: 'image/png',
1082
+ });
1083
+ } catch (visionErr) {
1084
+ fullResponse = `I captured a screenshot but vision analysis failed: ${visionErr.message}. To use screen analysis, configure a vision-capable provider (Claude, GPT-4, Gemini).`;
1085
+ }
1086
+ // Prepend screenshot file marker for the UI to display inline
1087
+ if (screenshotData.path) {
1088
+ const fname = screenshotData.path.split('/').pop();
1089
+ // Copy to NHA screenshots dir for persistence
1090
+ try {
1091
+ const ssDir = path.join(NHA_DIR, 'screenshots');
1092
+ fs.mkdirSync(ssDir, { recursive: true });
1093
+ fs.copyFileSync(screenshotData.path, path.join(ssDir, fname));
1094
+ } catch {}
1095
+ fullResponse = `![Screenshot](/api/screenshots/${fname})\n\n${fullResponse}`;
1096
+ }
1097
+ } else if (toolResults.length > 0) {
1098
+ // Standard tool results flow
1068
1099
  const toolContext = toolResults.map(t => {
1069
1100
  let clean = t.result.replace(/\[Screenshot[^\]]*\]/g, '').replace(/!\[.*?\]\(data:image[^)]+\)/g, '').slice(0, 3000);
1070
1101
  return `[${t.action} result]: ${clean.trim()}`;
@@ -1073,7 +1104,6 @@ export async function cmdUI(args) {
1073
1104
  try {
1074
1105
  fullResponse = await callLLM(config, enrichedSystemPrompt, followUp);
1075
1106
  } catch {
1076
- // Fallback: show raw results
1077
1107
  fullResponse = toolResults.map(t => `${t.action}: ${t.result}`).join('\n\n');
1078
1108
  }
1079
1109
  } else {
@@ -1325,6 +1355,40 @@ export async function cmdUI(args) {
1325
1355
  }
1326
1356
 
1327
1357
  const result = await executeTool(action, params, config);
1358
+
1359
+ // Intercept screen capture vision result
1360
+ if (result && typeof result === 'object' && result.__screenshot) {
1361
+ sendSSE('tool', { action, status: 'analyzing_screen' });
1362
+ try {
1363
+ const { callLLMVision } = await import('../services/llm.mjs');
1364
+ const visionPrompt = enrichedPrompt + '\\n\\nIMPORTANT: You are looking at a REAL screenshot. Describe ONLY what you ACTUALLY see. NEVER invent or fabricate.';
1365
+ const question = `The user said: "${msg}"\\n\\n${result.question}`;
1366
+ const visionResponse = await callLLMVision(config, visionPrompt, question, { base64: result.base64, mimeType: 'image/png' });
1367
+
1368
+ // Save screenshot for display
1369
+ const fname = result.path.split('/').pop();
1370
+ const ssDir = path.join(NHA_DIR, 'screenshots');
1371
+ fs.mkdirSync(ssDir, { recursive: true });
1372
+ try { fs.copyFileSync(result.path, path.join(ssDir, fname)); } catch {}
1373
+
1374
+ // Send screenshot to client
1375
+ sendSSE('screenshot', { filename: fname, format: 'png' });
1376
+ if (!res._screenshotFiles) res._screenshotFiles = [];
1377
+ res._screenshotFiles.push(fname);
1378
+
1379
+ // Send vision analysis as tokens
1380
+ sendSSE('tool_synthesis', {});
1381
+ sendSSE('token', { content: visionResponse });
1382
+
1383
+ toolResults.push({ action, result: `[Screen analyzed] ${visionResponse.slice(0, 500)}` });
1384
+ sendSSE('tool', { action, status: 'done', result: 'Screen analyzed' });
1385
+ } catch (visionErr) {
1386
+ toolResults.push({ action, result: `Vision failed: ${visionErr.message}` });
1387
+ sendSSE('tool', { action, status: 'error', error: visionErr.message });
1388
+ }
1389
+ continue;
1390
+ }
1391
+
1328
1392
  const resultStr = typeof result === 'object' ? JSON.stringify(result) : String(result);
1329
1393
  toolResults.push({ action, result: resultStr });
1330
1394
  sendSSE('tool', { action, status: 'done', result: typeof resultStr === 'string' ? resultStr.slice(0, 500) : '' });
package/src/constants.mjs CHANGED
@@ -5,7 +5,7 @@ import { fileURLToPath } from 'url';
5
5
  const __filename = fileURLToPath(import.meta.url);
6
6
  const __dirname = path.dirname(__filename);
7
7
 
8
- export const VERSION = '9.8.0';
8
+ export const VERSION = '9.8.2';
9
9
  export const BASE_URL = 'https://nothumanallowed.com/cli';
10
10
  export const API_BASE = 'https://nothumanallowed.com/api/v1';
11
11
 
@@ -276,105 +276,88 @@ export async function callLLM(config, systemPrompt, userMessage, opts = {}) {
276
276
  }
277
277
 
278
278
  /**
279
- * Call LLM with multimodal (vision) messages — supports image content.
280
- * Uses the provider's native vision format.
279
+ * Call LLM with multimodal (vision) messages — supports image + PDF content.
280
+ * Unified function for ALL vision calls (web UI, CLI, screen capture).
281
281
  * @param {object} config
282
- * @param {Array} messages - Array of { role, content } where content can be string or array of content blocks
282
+ * @param {string} systemPrompt
283
+ * @param {string} userMessage — text question about the image
284
+ * @param {object} media — { base64, mimeType } (image/png, image/jpeg, application/pdf)
283
285
  * @returns {Promise<string>}
284
286
  */
285
- export async function callLLMVision(config, messages) {
287
+ export async function callLLMVision(config, systemPrompt, userMessage, media) {
286
288
  const provider = config.llm.provider || 'anthropic';
287
289
  const model = config.llm.model || null;
288
290
  const apiKey = getApiKey(config, provider);
289
291
  if (!apiKey) throw new Error(`No API key for ${provider}. Vision requires Claude, GPT-4, or Gemini.`);
290
292
 
293
+ const { base64, mimeType } = media;
294
+ if (!base64 || !mimeType) throw new Error('media.base64 and media.mimeType are required');
295
+
291
296
  if (provider === 'anthropic') {
292
- // Anthropic format: system separate, messages with content blocks
293
- const systemMsg = messages.find(m => m.role === 'system');
294
- const userMsgs = messages.filter(m => m.role !== 'system');
295
-
296
- // Convert OpenAI-style image_url to Anthropic format
297
- const anthropicMessages = userMsgs.map(m => {
298
- if (typeof m.content === 'string') return m;
299
- const blocks = m.content.map(block => {
300
- if (block.type === 'text') return block;
301
- if (block.type === 'image_url') {
302
- const url = block.image_url.url;
303
- const match = url.match(/^data:image\/(png|jpeg|gif|webp);base64,(.+)$/);
304
- if (match) {
305
- return { type: 'image', source: { type: 'base64', media_type: `image/${match[1]}`, data: match[2] } };
306
- }
307
- }
308
- return block;
309
- });
310
- return { role: m.role, content: blocks };
311
- });
297
+ const isPdf = mimeType === 'application/pdf';
298
+ const contentBlock = isPdf
299
+ ? { type: 'document', source: { type: 'base64', media_type: mimeType, data: base64 } }
300
+ : { type: 'image', source: { type: 'base64', media_type: mimeType, data: base64 } };
312
301
 
313
302
  const body = {
314
303
  model: model || 'claude-sonnet-4-20250514',
315
304
  max_tokens: 4096,
316
- system: systemMsg?.content || '',
317
- messages: anthropicMessages,
305
+ system: systemPrompt,
306
+ messages: [{ role: 'user', content: [contentBlock, { type: 'text', text: userMessage }] }],
318
307
  };
319
308
  const res = await fetch('https://api.anthropic.com/v1/messages', {
320
309
  method: 'POST',
321
310
  headers: { 'Content-Type': 'application/json', 'x-api-key': apiKey, 'anthropic-version': '2023-06-01' },
322
311
  body: JSON.stringify(body),
323
312
  });
324
- if (!res.ok) throw new Error(`Anthropic vision ${res.status}: ${await res.text()}`);
313
+ if (!res.ok) throw new Error(`Anthropic vision ${res.status}: ${(await res.text()).slice(0, 300)}`);
325
314
  const data = await res.json();
326
315
  return data.content?.[0]?.text || '';
327
316
  }
328
317
 
329
318
  if (provider === 'openai' || provider === 'deepseek' || provider === 'grok' || provider === 'mistral') {
330
- // OpenAI-compatible format — works with GPT-4V, DeepSeek VL, etc.
331
319
  const url = provider === 'openai' ? 'https://api.openai.com/v1/chat/completions'
332
- : provider === 'deepseek' ? 'https://api.deepseek.com/chat/completions'
320
+ : provider === 'deepseek' ? 'https://api.deepseek.com/v1/chat/completions'
333
321
  : provider === 'grok' ? 'https://api.x.ai/v1/chat/completions'
334
322
  : 'https://api.mistral.ai/v1/chat/completions';
335
323
 
336
- const visionModel = model || (provider === 'openai' ? 'gpt-4o' : model);
337
- const body = { model: visionModel, max_tokens: 4096, messages };
324
+ const body = {
325
+ model: model || (provider === 'openai' ? 'gpt-4o' : model),
326
+ max_tokens: 4096,
327
+ messages: [
328
+ { role: 'system', content: systemPrompt },
329
+ { role: 'user', content: [
330
+ { type: 'image_url', image_url: { url: `data:${mimeType};base64,${base64}` } },
331
+ { type: 'text', text: userMessage },
332
+ ] },
333
+ ],
334
+ };
338
335
  const res = await fetch(url, {
339
336
  method: 'POST',
340
337
  headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${apiKey}` },
341
338
  body: JSON.stringify(body),
342
339
  });
343
- if (!res.ok) throw new Error(`${provider} vision ${res.status}: ${await res.text()}`);
340
+ if (!res.ok) throw new Error(`${provider} vision ${res.status}: ${(await res.text()).slice(0, 300)}`);
344
341
  const data = await res.json();
345
342
  return data.choices?.[0]?.message?.content || '';
346
343
  }
347
344
 
348
345
  if (provider === 'gemini') {
349
- // Gemini format — inline_data with base64
350
- const systemMsg = messages.find(m => m.role === 'system');
351
- const userMsgs = messages.filter(m => m.role !== 'system');
352
- const parts = [];
353
- for (const msg of userMsgs) {
354
- if (typeof msg.content === 'string') {
355
- parts.push({ text: msg.content });
356
- } else {
357
- for (const block of msg.content) {
358
- if (block.type === 'text') parts.push({ text: block.text });
359
- if (block.type === 'image_url') {
360
- const match = block.image_url.url.match(/^data:image\/(.*?);base64,(.+)$/);
361
- if (match) parts.push({ inline_data: { mime_type: `image/${match[1]}`, data: match[2] } });
362
- }
363
- }
364
- }
365
- }
366
-
367
346
  const geminiModel = model || 'gemini-2.0-flash';
368
347
  const body = {
369
- contents: [{ parts }],
370
- systemInstruction: systemMsg ? { parts: [{ text: systemMsg.content }] } : undefined,
348
+ contents: [{ parts: [
349
+ { inline_data: { mime_type: mimeType, data: base64 } },
350
+ { text: userMessage },
351
+ ] }],
352
+ systemInstruction: { parts: [{ text: systemPrompt }] },
353
+ generationConfig: { maxOutputTokens: 4096 },
371
354
  };
372
355
  const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${geminiModel}:generateContent?key=${apiKey}`, {
373
356
  method: 'POST',
374
357
  headers: { 'Content-Type': 'application/json' },
375
358
  body: JSON.stringify(body),
376
359
  });
377
- if (!res.ok) throw new Error(`Gemini vision ${res.status}: ${await res.text()}`);
360
+ if (!res.ok) throw new Error(`Gemini vision ${res.status}: ${(await res.text()).slice(0, 300)}`);
378
361
  const data = await res.json();
379
362
  return data.candidates?.[0]?.content?.parts?.[0]?.text || '';
380
363
  }
@@ -382,6 +365,144 @@ export async function callLLMVision(config, messages) {
382
365
  throw new Error(`Vision not supported for provider: ${provider}. Use Claude, GPT-4, or Gemini.`);
383
366
  }
384
367
 
368
/**
 * Call an LLM provider with streaming enabled.
 *
 * Invokes `onToken(chunk)` for every text chunk as it arrives and resolves
 * with the full concatenated response text once the stream has ended.
 *
 * @param {object} config - CLI configuration; `config.llm.provider` and
 *   `config.llm.model` are read as defaults.
 * @param {string} systemPrompt - System prompt sent to the provider.
 * @param {string} userMessage - User message sent to the provider.
 * @param {(chunk: string) => void} [onToken] - Optional chunk consumer.
 * @param {{ provider?: string, model?: string }} [opts] - Per-call overrides
 *   for the configured provider/model.
 * @returns {Promise<string>} The full LLM response text.
 * @throws {Error} When no API key is available, the provider is unknown, or
 *   the provider answers with a non-2xx status.
 */
export async function callLLMStream(config, systemPrompt, userMessage, onToken, opts = {}) {
  // `opts = {}` only covers `undefined`; tolerate an explicit `null` as well.
  const provider = opts?.provider || config.llm.provider || 'anthropic';
  const model = opts?.model || config.llm.model || null;
  const apiKey = getApiKey(config, provider);
  if (!apiKey) throw new Error(`No API key for ${provider}`);

  const callFn = getProviderCall(provider);
  if (!callFn) throw new Error(`Unknown provider: ${provider}`);

  const emit = typeof onToken === 'function' ? onToken : null;

  // Gemini and Cohere are not streamed here: use the regular provider call
  // and deliver the whole answer as a single chunk.
  // NOTE(review): the trailing `false` is presumably the provider call's
  // stream flag — confirm against getProviderCall's implementations.
  if (provider === 'gemini' || provider === 'cohere') {
    const text = await callFn(apiKey, model, systemPrompt, userMessage, false);
    if (emit) emit(text);
    return text;
  }

  const format = provider === 'anthropic' ? 'anthropic' : 'openai';
  const body = buildRequestBody(provider, model, systemPrompt, userMessage, true);
  const url = getProviderUrl(provider, model, apiKey);
  const headers = getProviderHeaders(provider, apiKey);

  const res = await fetch(url, {
    method: 'POST',
    headers,
    body: JSON.stringify(body),
  });
  if (!res.ok) {
    // Cap the echoed body (same 300-char limit callLLMVision uses) so an
    // HTML error page cannot flood the terminal or the chat history.
    const detail = (await res.text()).slice(0, 300);
    throw new Error(`${provider} ${res.status}: ${detail}`);
  }

  return streamSSEWithCallback(res, format, emit);
}
406
+
407
/**
 * Build the JSON request body for a provider's chat endpoint.
 *
 * Anthropic gets its native shape (top-level `system`); every other provider
 * gets the OpenAI-compatible shape (system prompt as the first message).
 *
 * @param {string} provider - Provider id (`anthropic`, `openai`, `deepseek`, `grok`, `mistral`, ...).
 * @param {string|null} model - Explicit model name, or a falsy value to use the provider default.
 * @param {string} systemPrompt - System prompt text.
 * @param {string} userMessage - User message text.
 * @param {boolean} stream - Value of the `stream` flag in the request.
 * @param {number} [maxTokens=8192] - Value of `max_tokens` in the request.
 * @returns {object} Request body ready for `JSON.stringify`.
 */
function buildRequestBody(provider, model, systemPrompt, userMessage, stream, maxTokens = 8192) {
  if (provider === 'anthropic') {
    return {
      model: model || 'claude-sonnet-4-20250514',
      max_tokens: maxTokens,
      system: systemPrompt,
      messages: [{ role: 'user', content: userMessage }],
      stream,
    };
  }

  // OpenAI-compatible format (OpenAI, DeepSeek, Grok, Mistral)
  const modelDefaults = {
    openai: 'gpt-4o',
    deepseek: 'deepseek-chat',
    grok: 'grok-3-latest',
    mistral: 'mistral-large-latest',
  };
  // Own-property check: a provider id such as "constructor" must not resolve
  // to an inherited Object.prototype member (which JSON.stringify would drop,
  // producing a request without a model).
  const defaultModel = Object.hasOwn(modelDefaults, provider) ? modelDefaults[provider] : 'gpt-4o';

  return {
    model: model || defaultModel,
    max_tokens: maxTokens,
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: userMessage },
    ],
    stream,
  };
}
435
+
436
/**
 * Get the chat endpoint URL for a provider.
 *
 * Providers without an entry in the table fall back to the OpenAI endpoint.
 *
 * @param {string} provider - Provider id.
 * @param {string|null} model - Not used by this function; kept for call-site compatibility.
 * @param {string} apiKey - Not used by this function; kept for call-site compatibility.
 * @returns {string} Absolute HTTPS URL of the provider's chat endpoint.
 */
function getProviderUrl(provider, model, apiKey) {
  const urls = {
    anthropic: 'https://api.anthropic.com/v1/messages',
    openai: 'https://api.openai.com/v1/chat/completions',
    deepseek: 'https://api.deepseek.com/v1/chat/completions',
    grok: 'https://api.x.ai/v1/chat/completions',
    mistral: 'https://api.mistral.ai/v1/chat/completions',
  };
  // Own-property check so ids like "constructor" or "toString" cannot resolve
  // to inherited Object.prototype members instead of a URL string.
  // NOTE(review): the fallback sends unknown providers' requests to OpenAI;
  // confirm this is intended rather than throwing for unknown ids.
  return Object.hasOwn(urls, provider) ? urls[provider] : urls.openai;
}
447
+
448
/**
 * Build the HTTP request headers for a provider.
 *
 * Anthropic authenticates with `x-api-key` plus a pinned `anthropic-version`;
 * every other provider uses a Bearer token in `Authorization`.
 *
 * @param {string} provider - Provider id.
 * @param {string} apiKey - API key for that provider.
 * @returns {Record<string, string>} Header map for `fetch`.
 */
function getProviderHeaders(provider, apiKey) {
  const headers = { 'Content-Type': 'application/json' };

  if (provider !== 'anthropic') {
    headers['Authorization'] = `Bearer ${apiKey}`;
    return headers;
  }

  headers['x-api-key'] = apiKey;
  headers['anthropic-version'] = '2023-06-01';
  return headers;
}
462
+
463
/**
 * SSE stream parser with onToken callback (does NOT write to stdout directly).
 *
 * Reads the response body incrementally, extracts the text delta from every
 * `data:` line and forwards it to `onToken`. Malformed or non-text payloads
 * are skipped.
 *
 * @param {{ body: ReadableStream<Uint8Array> }} res - Fetch response (or any
 *   object exposing a readable `body` stream).
 * @param {string} format - `'anthropic'` reads `content_block_delta` events;
 *   any other value reads the OpenAI-style `choices[0].delta.content`.
 * @param {(chunk: string) => void} [onToken] - Optional chunk consumer.
 * @returns {Promise<string>} Concatenation of all text chunks received.
 */
async function streamSSEWithCallback(res, format, onToken) {
  const reader = res.body.getReader();
  const decoder = new TextDecoder();
  const emit = typeof onToken === 'function' ? onToken : null;
  let buffer = '';
  let fullText = '';

  /** Parse one SSE line and, if it carries a text delta, record and emit it. */
  const handleLine = (line) => {
    // The space after the colon is optional in the SSE wire format.
    if (!line.startsWith('data:')) return;
    const data = line.slice(5).trim();
    if (data === '' || data === '[DONE]') return;

    let json;
    try {
      json = JSON.parse(data);
    } catch {
      // Best effort: keep-alives and malformed payloads are skipped.
      return;
    }

    const chunk = format === 'anthropic'
      ? (json?.type === 'content_block_delta' ? json.delta?.text : undefined)
      : json?.choices?.[0]?.delta?.content;
    if (typeof chunk !== 'string' || chunk === '') return;

    fullText += chunk;
    if (emit) {
      try {
        emit(chunk);
      } catch {
        // A failing consumer must not abort the stream or lose text that was
        // already received; this keeps the previous non-fatal behavior.
      }
    }
  };

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      // The last element is an incomplete line; keep it for the next read.
      buffer = lines.pop() ?? '';
      for (const line of lines) handleLine(line);
    }

    // Flush bytes still held by the decoder and process a final line that
    // was not terminated by a newline — otherwise the last delta is lost.
    buffer += decoder.decode();
    if (buffer !== '') handleLine(buffer);
  } finally {
    reader.releaseLock();
  }

  return fullText;
}
505
+
385
506
  /**
386
507
  * Call an agent by name — loads the agent file, calls LLM, returns response.
387
508
  * No streaming. Used by PAO pipeline for batch agent calls.