@ducci/jarvis 1.0.67 → 1.0.69

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@ducci/jarvis",
-  "version": "1.0.67",
+  "version": "1.0.69",
   "description": "A fully automated agent system that lives on a server.",
   "main": "./src/index.js",
   "type": "module",
@@ -6,6 +6,7 @@ import { handleChat } from '../../server/agent.js';
 import { loadSession } from '../../server/sessions.js';
 import { PATHS } from '../../server/config.js';
 import { load, save } from './sessions.js';
+import { describeImage } from '../../server/vision.js';
 
 function getTelegramChatLogPath(chatId, sessionId) {
   const prefix = sessionId ? String(sessionId).slice(0, 8) : 'unknown';
@@ -123,11 +124,39 @@ export async function startTelegramChannel(config) {
     const combinedText = batch.length === 1
       ? batch[0].text
       : batch.map(m => m.text).join('\n\n');
-    const allAttachments = batch.flatMap(m => m.attachments);
+    const rawAttachments = batch.flatMap(m => m.attachments);
+
+    // If a vision model is configured and the batch contains images, run a one-shot
+    // image analysis first and inject the result as text into the main agent turn.
+    let userText = combinedText;
+    let allAttachments = rawAttachments;
+    if (config.visionModel && config.visionApiKey && rawAttachments.length > 0) {
+      try {
+        const results = await Promise.allSettled(
+          rawAttachments.map(a => describeImage(a, combinedText, config))
+        );
+        const successfulDescs = results
+          .filter(r => r.status === 'fulfilled')
+          .map(r => `[Image analysis: ${r.value}]`);
+        const failedCount = results.filter(r => r.status === 'rejected').length;
+        if (failedCount > 0) {
+          console.error(`[telegram] vision error for ${failedCount}/${results.length} image(s), those will be sent directly to main agent`);
+        }
+        if (successfulDescs.length > 0) {
+          const descBlock = successfulDescs.join('\n\n');
+          userText = combinedText ? `${descBlock}\n\n${combinedText}` : descBlock;
+        }
+        // Only clear attachments for images that were successfully described
+        allAttachments = rawAttachments.filter((_, i) => results[i].status === 'rejected');
+      } catch (e) {
+        console.error(`[telegram] vision error, falling back to direct image: ${e.message}`);
+        // allAttachments and userText stay unchanged — main agent gets the raw image
+      }
+    }
 
     let result;
     try {
-      result = await handleChat(config, sessionId, combinedText, allAttachments);
+      result = await handleChat(config, sessionId, userText, allAttachments);
     } catch (e) {
       console.error(`[telegram] agent error chat_id=${chatId}: ${e.message}`);
       const errText = e.message
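
For illustration, if a user sends two photos with the caption "check these" and both vision calls succeed, the hunk above hands the main agent a single text-only turn shaped roughly like this (a sketch, not actual package output; the bracketed descriptions come from the vision model):

    userText = '[Image analysis: a bar chart of monthly revenue by region ...]\n\n'
             + '[Image analysis: a screenshot of a Node.js stack trace ...]\n\n'
             + 'check these';
    allAttachments = [];  // both images were described, so none are forwarded raw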
@@ -389,6 +389,203 @@ async function run() {
   saveSettings(settings);
   console.log(chalk.green(`\nModel ${chalk.bold(selectedModel)} saved to settings.`));
 
+  // --- VISION MODEL STEP (OPTIONAL) ---
+  let skipVision = false;
+  if (settings.visionModel) {
+    const { visionAction } = await inquirer.prompt([
+      {
+        type: 'list',
+        name: 'visionAction',
+        message: `Vision model is configured (${chalk.yellow(settings.visionModel)}). What do you want to do?`,
+        choices: [
+          { name: 'Keep current vision model', value: 'keep' },
+          { name: 'Change vision model', value: 'change' },
+          { name: 'Disable vision', value: 'disable' },
+        ],
+      }
+    ]);
+    if (visionAction === 'keep') {
+      skipVision = true;
+    } else if (visionAction === 'disable') {
+      delete settings.visionProvider;
+      delete settings.visionModel;
+      saveSettings(settings);
+      console.log(chalk.yellow('Vision model disabled.'));
+      skipVision = true;
+    }
+  } else {
+    const { configureVision } = await inquirer.prompt([
+      {
+        type: 'confirm',
+        name: 'configureVision',
+        message: 'Do you want to configure a separate vision model for image analysis (e.g. for Telegram photos)?',
+        default: false,
+      }
+    ]);
+    if (!configureVision) skipVision = true;
+  }
+
+  if (!skipVision) {
+    const { visionProvider } = await inquirer.prompt([
+      {
+        type: 'list',
+        name: 'visionProvider',
+        message: 'Which provider for the vision model?',
+        choices: [
+          { name: 'OpenRouter (access many models via one key)', value: 'openrouter' },
+          { name: 'Z.AI Direct (GLM models)', value: 'z-ai' },
+        ],
+        default: settings.visionProvider || provider,
+      }
+    ]);
+
+    // Ensure the key for the vision provider is available
+    const visionKeyVar = visionProvider === 'z-ai' ? 'ZAI_API_KEY' : 'OPENROUTER_API_KEY';
+    let visionApiKey = loadEnvVar(visionKeyVar);
+    if (visionApiKey) {
+      const { keepVisionKey } = await inquirer.prompt([
+        {
+          type: 'confirm',
+          name: 'keepVisionKey',
+          message: `A ${visionKeyVar} is already configured. Use it for the vision model?`,
+          default: true,
+        }
+      ]);
+      if (!keepVisionKey) visionApiKey = null;
+    }
+    if (!visionApiKey) {
+      console.log(chalk.yellow(`No ${visionKeyVar} found — needed for the vision model.`));
+      const { newVisionKey } = await inquirer.prompt([
+        {
+          type: 'password',
+          name: 'newVisionKey',
+          message: `Enter your ${visionProvider === 'z-ai' ? 'Z.AI' : 'OpenRouter'} API key:`,
+          validate: (input) => input.length >= 10 || 'API key must be at least 10 characters long.',
+        }
+      ]);
+      visionApiKey = newVisionKey;
+      saveEnvVar(visionKeyVar, visionApiKey);
+      console.log(chalk.green(`${visionKeyVar} saved.`));
+    }
+
+    // Model selection for vision provider
+    let visionModel = settings.visionProvider === visionProvider ? settings.visionModel : null;
+
+    if (visionModel) {
+      const { keepVisionModel } = await inquirer.prompt([
+        {
+          type: 'list',
+          name: 'keepVisionModel',
+          message: `Current vision model is ${chalk.yellow(visionModel)}. Keep it or change it?`,
+          choices: [
+            { name: 'Keep current vision model', value: true },
+            { name: 'Change vision model', value: false },
+          ],
+        }
+      ]);
+      if (!keepVisionModel) visionModel = null;
+    }
+
+    if (!visionModel) {
+      if (visionProvider === 'z-ai') {
+        const models = await fetchZaiModels(visionApiKey);
+        const choices = models.length > 0
+          ? models.map(m => ({ name: m.id, value: m.id }))
+          : [];
+        choices.push({ name: 'Enter model ID manually', value: '__manual__' });
+
+        const { browsedVisionModel } = await inquirer.prompt([
+          {
+            type: 'list',
+            name: 'browsedVisionModel',
+            message: 'Select a Z.AI vision model:',
+            choices,
+            pageSize: 20,
+          }
+        ]);
+        if (browsedVisionModel === '__manual__') {
+          const { manualVisionModel } = await inquirer.prompt([
+            {
+              type: 'input',
+              name: 'manualVisionModel',
+              message: 'Enter Z.AI model ID (e.g., glm-4v):',
+              validate: (input) => input.trim().length > 0 || 'Model ID cannot be empty.',
+            }
+          ]);
+          visionModel = manualVisionModel.trim();
+        } else {
+          visionModel = browsedVisionModel;
+        }
+      } else {
+        // OpenRouter
+        const { visionModelSelectionMethod } = await inquirer.prompt([
+          {
+            type: 'list',
+            name: 'visionModelSelectionMethod',
+            message: 'How would you like to select the vision model?',
+            choices: [
+              { name: 'Browse OpenRouter models', value: 'browse' },
+              { name: 'Enter model ID manually', value: 'manual' },
+            ],
+          }
+        ]);
+
+        if (visionModelSelectionMethod === 'manual') {
+          const { manualVisionModel } = await inquirer.prompt([
+            {
+              type: 'input',
+              name: 'manualVisionModel',
+              message: 'Enter OpenRouter model ID (e.g., openai/gpt-4o):',
+              validate: (input) => input.trim().length > 0 || 'Model ID cannot be empty.',
+            }
+          ]);
+          visionModel = manualVisionModel.trim();
+        } else {
+          const models = await fetchOpenRouterModels(visionApiKey);
+          if (models.length === 0) {
+            console.log(chalk.yellow('Falling back to manual entry due to fetch failure.'));
+            const { manualVisionModel } = await inquirer.prompt([
+              {
+                type: 'input',
+                name: 'manualVisionModel',
+                message: 'Enter OpenRouter model ID:',
+                validate: (input) => input.trim().length > 0 || 'Model ID cannot be empty.',
+              }
+            ]);
+            visionModel = manualVisionModel.trim();
+          } else {
+            models.sort((a, b) => {
+              const isFreeA = a.pricing && parseFloat(a.pricing.prompt) === 0 && parseFloat(a.pricing.completion) === 0;
+              const isFreeB = b.pricing && parseFloat(b.pricing.prompt) === 0 && parseFloat(b.pricing.completion) === 0;
+              if (isFreeA && !isFreeB) return -1;
+              if (!isFreeA && isFreeB) return 1;
+              return a.id.localeCompare(b.id);
+            });
+            const choices = models.map(m => {
+              const isFree = m.pricing && parseFloat(m.pricing.prompt) === 0 && parseFloat(m.pricing.completion) === 0;
+              return { name: `${m.id} ${isFree ? chalk.green('(Free)') : ''}`, value: m.id };
+            });
+            const { browsedVisionModel } = await inquirer.prompt([
+              {
+                type: 'list',
+                name: 'browsedVisionModel',
+                message: 'Select a vision model:',
+                choices,
+                pageSize: 20,
+              }
+            ]);
+            visionModel = browsedVisionModel;
+          }
+        }
+      }
+    }
+
+    settings.visionProvider = visionProvider;
+    settings.visionModel = visionModel;
+    saveSettings(settings);
+    console.log(chalk.green(`Vision model ${chalk.bold(visionModel)} saved.`));
+  }
+
   // --- TELEGRAM CHANNEL STEP (OPTIONAL) ---
   const { configureTelegram } = await inquirer.prompt([
     {
@@ -464,44 +661,6 @@ async function run() {
     }
   }
 
-  // --- PERPLEXITY STEP (OPTIONAL) ---
-  const existingPerplexityKey = loadEnvVar('PERPLEXITY_API_KEY');
-  const { configurePerplexity } = await inquirer.prompt([
-    {
-      type: 'confirm',
-      name: 'configurePerplexity',
-      message: 'Do you want to configure Perplexity web search?',
-      default: !!existingPerplexityKey
-    }
-  ]);
-
-  if (configurePerplexity) {
-    let keepPerplexityKey = false;
-    if (existingPerplexityKey) {
-      const { keep } = await inquirer.prompt([
-        {
-          type: 'confirm',
-          name: 'keep',
-          message: 'A PERPLEXITY_API_KEY is already configured. Do you want to keep it?',
-          default: true
-        }
-      ]);
-      keepPerplexityKey = keep;
-    }
-    if (!keepPerplexityKey) {
-      const { perplexityKey } = await inquirer.prompt([
-        {
-          type: 'password',
-          name: 'perplexityKey',
-          message: 'Enter your Perplexity API key (from perplexity.ai/settings/api):',
-          validate: (input) => input.trim().length > 0 || 'API key cannot be empty.'
-        }
-      ]);
-      saveEnvVar('PERPLEXITY_API_KEY', perplexityKey.trim());
-      console.log(chalk.green('Perplexity API key saved.'));
-    }
-  }
-
   // --- PM2 + LOG ROTATION STEP ---
   const pm2Check = spawnSync('pm2', ['--version'], { stdio: 'pipe' });
   if (pm2Check.status !== 0) {
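
Once the vision step finishes, the two new fields live alongside the existing settings and are read back by loadConfig() in the next hunk. A sketch of the relevant settings.json fragment (the path ~/.jarvis/data/config/settings.json appears in the analyze_image tool below; the model ID is illustrative, not a default):

    {
      "visionProvider": "openrouter",
      "visionModel": "openai/gpt-4o"
    }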
@@ -54,6 +54,16 @@ export function loadConfig() {
     if (!apiKey) throw new Error('OPENROUTER_API_KEY not found. Run `jarvis setup` first.');
   }
 
+  // Vision model (optional) — separate provider/model for one-shot image analysis
+  const visionProvider = settings.visionProvider || null;
+  const visionModel = settings.visionModel || null;
+  let visionApiKey = null;
+  if (visionProvider === 'z-ai') {
+    visionApiKey = process.env.ZAI_API_KEY || null;
+  } else if (visionProvider === 'openrouter') {
+    visionApiKey = process.env.OPENROUTER_API_KEY || null;
+  }
+
   return {
     provider,
     apiKey,
@@ -67,6 +77,9 @@ export function loadConfig() {
       token: process.env.TELEGRAM_BOT_TOKEN || null,
      allowedUserIds: settings.channels?.telegram?.allowedUserIds || [],
     },
+    visionProvider,
+    visionModel,
+    visionApiKey,
   };
 }
 
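A minimal consumption sketch for the new config fields, mirroring the guard used in the Telegram hunk above (nothing here is new API; it only restates the contract):

    const config = loadConfig();
    if (config.visionModel && config.visionApiKey) {
      // vision available: images can be routed through describeImage()
    } else {
      // not configured: attachments are passed to the main agent unchanged
    }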
@@ -656,6 +656,71 @@ const SEED_TOOLS = {
     return { status: 'ok', name: args.name, content };
     `,
   },
+  analyze_image: {
+    definition: {
+      type: 'function',
+      function: {
+        name: 'analyze_image',
+        description: 'Fetch an image from a URL and analyze it using the configured vision model. Returns a detailed description of the image. Use this whenever a user shares an image URL and asks about its content.',
+        parameters: {
+          type: 'object',
+          properties: {
+            url: {
+              type: 'string',
+              description: 'The URL of the image to analyze (http or https).',
+            },
+            prompt: {
+              type: 'string',
+              description: 'Optional question or instruction for the vision model, e.g. "What text is visible?" or "Describe the chart". Defaults to a general description.',
+            },
+          },
+          required: ['url'],
+        },
+      },
+    },
+    code: `
+      const settingsPath = path.join(process.env.HOME, '.jarvis/data/config/settings.json');
+      const settings = JSON.parse(await fs.promises.readFile(settingsPath, 'utf8').catch(() => '{}'));
+      const visionModel = settings.visionModel;
+      const visionProvider = settings.visionProvider;
+      if (!visionModel || !visionProvider) {
+        return { status: 'error', message: 'No vision model configured. Set visionModel and visionProvider in settings.' };
+      }
+      let apiKey, baseURL;
+      if (visionProvider === 'z-ai') {
+        apiKey = process.env.ZAI_API_KEY;
+        baseURL = 'https://api.z.ai/api/coding/paas/v4/';
+      } else {
+        apiKey = process.env.OPENROUTER_API_KEY;
+        baseURL = 'https://openrouter.ai/api/v1';
+      }
+      if (!apiKey) return { status: 'error', message: 'No API key found for vision provider: ' + visionProvider };
+      const imgResponse = await fetch(args.url);
+      if (!imgResponse.ok) return { status: 'error', message: 'Failed to fetch image: HTTP ' + imgResponse.status };
+      const buffer = await imgResponse.arrayBuffer();
+      const base64 = Buffer.from(buffer).toString('base64');
+      const contentType = imgResponse.headers.get('content-type') || 'image/jpeg';
+      const dataUrl = 'data:' + contentType + ';base64,' + base64;
+      const textPrompt = args.prompt?.trim()
+        ? 'The user shared this image with the following question/context: "' + args.prompt.trim() + '"\\n\\nPlease describe what you see, paying special attention to anything relevant to their message.'
+        : 'Please describe this image in detail. Include all visible text, objects, colors, layout, and any other relevant details.';
+      const apiResponse = await fetch(baseURL + (baseURL.endsWith('/') ? '' : '/') + 'chat/completions', {
+        method: 'POST',
+        headers: { 'Authorization': 'Bearer ' + apiKey, 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          model: visionModel,
+          messages: [{ role: 'user', content: [
+            { type: 'image_url', image_url: { url: dataUrl } },
+            { type: 'text', text: textPrompt },
+          ]}],
+        }),
+      });
+      const result = await apiResponse.json();
+      if (!apiResponse.ok) return { status: 'error', message: result.error?.message || 'Vision API error' };
+      const description = result.choices?.[0]?.message?.content?.trim() || '(no description returned)';
+      return { status: 'ok', description };
+    `,
+  },
 };
 
 export function seedTools() {
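
For reference, a hypothetical invocation of the new seed tool, with the result shapes taken from its code above (the URL and prompt are illustrative):

    // Tool call emitted by the agent (hypothetical):
    // analyze_image({ url: 'https://example.com/receipt.jpg', prompt: 'What is the total amount?' })
    //
    // Success:  { status: 'ok', description: '...' }
    // Failure:  { status: 'error', message: 'Failed to fetch image: HTTP 404' }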
@@ -0,0 +1,40 @@
+import { createClient } from './provider.js';
+
+const DEFAULT_PROMPT = 'Please describe this image in detail. Include all visible text, objects, colors, layout, and any other relevant details.';
+
+/**
+ * Sends a one-shot image analysis request to the configured vision model.
+ * No chat history is included — only the image and an optional caption-based prompt.
+ *
+ * @param {object} attachment - { url: 'data:image/jpeg;base64,...' }
+ * @param {string} caption - optional user-provided caption / question
+ * @param {object} config - full app config (must include visionProvider, visionModel, visionApiKey)
+ * @returns {Promise<string>} - the vision model's description
+ */
+export async function describeImage(attachment, caption, config) {
+  const visionConfig = {
+    provider: config.visionProvider,
+    apiKey: config.visionApiKey,
+  };
+
+  const client = createClient(visionConfig);
+
+  const textPrompt = caption?.trim()
+    ? `The user sent this image with the following message: "${caption.trim()}"\n\nPlease describe what you see in the image in detail, paying special attention to anything relevant to their message.`
+    : DEFAULT_PROMPT;
+
+  const response = await client.chat.completions.create({
+    model: config.visionModel,
+    messages: [
+      {
+        role: 'user',
+        content: [
+          { type: 'image_url', image_url: { url: attachment.url } },
+          { type: 'text', text: textPrompt },
+        ],
+      },
+    ],
+  });
+
+  return response.choices[0]?.message?.content?.trim() || '(no description returned)';
+}
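
Finally, a minimal usage sketch for the new helper (the import paths are assumptions inferred from this diff; the data URL is truncated):

    import { describeImage } from './src/server/vision.js';
    import { loadConfig } from './src/server/config.js';

    const config = loadConfig(); // carries visionProvider, visionModel, visionApiKey
    const attachment = { url: 'data:image/jpeg;base64,/9j/4AAQSkZJRg...' };
    const description = await describeImage(attachment, 'What does the sign say?', config);
    console.log(description);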