@ducci/jarvis 1.0.66 → 1.0.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/channels/telegram/index.js +31 -2
- package/src/scripts/onboarding.js +176 -0
- package/src/server/agent.js +41 -6
- package/src/server/config.js +13 -0
- package/src/server/vision.js +40 -0
package/package.json
CHANGED
|
(hunk not shown in this extract)
package/src/channels/telegram/index.js
CHANGED
|
@@ -6,6 +6,7 @@ import { handleChat } from '../../server/agent.js';
|
|
|
6
6
|
import { loadSession } from '../../server/sessions.js';
|
|
7
7
|
import { PATHS } from '../../server/config.js';
|
|
8
8
|
import { load, save } from './sessions.js';
|
|
9
|
+
import { describeImage } from '../../server/vision.js';
|
|
9
10
|
|
|
10
11
|
function getTelegramChatLogPath(chatId, sessionId) {
|
|
11
12
|
const prefix = sessionId ? String(sessionId).slice(0, 8) : 'unknown';
|
|
@@ -123,11 +124,39 @@ export async function startTelegramChannel(config) {
|
|
|
123
124
|
const combinedText = batch.length === 1
|
|
124
125
|
? batch[0].text
|
|
125
126
|
: batch.map(m => m.text).join('\n\n');
|
|
126
|
-
const
|
|
127
|
+
const rawAttachments = batch.flatMap(m => m.attachments);
|
|
128
|
+
|
|
129
|
+
// If a vision model is configured and the batch contains images, run a one-shot
|
|
130
|
+
// image analysis first and inject the result as text into the main agent turn.
|
|
131
|
+
let userText = combinedText;
|
|
132
|
+
let allAttachments = rawAttachments;
|
|
133
|
+
if (config.visionModel && config.visionApiKey && rawAttachments.length > 0) {
|
|
134
|
+
try {
|
|
135
|
+
const results = await Promise.allSettled(
|
|
136
|
+
rawAttachments.map(a => describeImage(a, combinedText, config))
|
|
137
|
+
);
|
|
138
|
+
const successfulDescs = results
|
|
139
|
+
.filter(r => r.status === 'fulfilled')
|
|
140
|
+
.map(r => `[Image analysis: ${r.value}]`);
|
|
141
|
+
const failedCount = results.filter(r => r.status === 'rejected').length;
|
|
142
|
+
if (failedCount > 0) {
|
|
143
|
+
console.error(`[telegram] vision error for ${failedCount}/${results.length} image(s), those will be sent directly to main agent`);
|
|
144
|
+
}
|
|
145
|
+
if (successfulDescs.length > 0) {
|
|
146
|
+
const descBlock = successfulDescs.join('\n\n');
|
|
147
|
+
userText = combinedText ? `${descBlock}\n\n${combinedText}` : descBlock;
|
|
148
|
+
}
|
|
149
|
+
// Only clear attachments for images that were successfully described
|
|
150
|
+
allAttachments = rawAttachments.filter((_, i) => results[i].status === 'rejected');
|
|
151
|
+
} catch (e) {
|
|
152
|
+
console.error(`[telegram] vision error, falling back to direct image: ${e.message}`);
|
|
153
|
+
// allAttachments and userText stay unchanged — main agent gets the raw image
|
|
154
|
+
}
|
|
155
|
+
}
|
|
127
156
|
|
|
128
157
|
let result;
|
|
129
158
|
try {
|
|
130
|
-
result = await handleChat(config, sessionId,
|
|
159
|
+
result = await handleChat(config, sessionId, userText, allAttachments);
|
|
131
160
|
} catch (e) {
|
|
132
161
|
console.error(`[telegram] agent error chat_id=${chatId}: ${e.message}`);
|
|
133
162
|
const errText = e.message
|
|
package/src/scripts/onboarding.js
CHANGED
|
@@ -389,6 +389,182 @@ async function run() {
|
|
|
389
389
|
saveSettings(settings);
|
|
390
390
|
console.log(chalk.green(`\nModel ${chalk.bold(selectedModel)} saved to settings.`));
|
|
391
391
|
|
|
392
|
+
// --- VISION MODEL STEP (OPTIONAL) ---
|
|
393
|
+
const { configureVision } = await inquirer.prompt([
|
|
394
|
+
{
|
|
395
|
+
type: 'confirm',
|
|
396
|
+
name: 'configureVision',
|
|
397
|
+
message: 'Do you want to configure a separate vision model for image analysis (e.g. for Telegram photos)?',
|
|
398
|
+
default: !!settings.visionModel,
|
|
399
|
+
}
|
|
400
|
+
]);
|
|
401
|
+
|
|
402
|
+
if (configureVision) {
|
|
403
|
+
const { visionProvider } = await inquirer.prompt([
|
|
404
|
+
{
|
|
405
|
+
type: 'list',
|
|
406
|
+
name: 'visionProvider',
|
|
407
|
+
message: 'Which provider for the vision model?',
|
|
408
|
+
choices: [
|
|
409
|
+
{ name: 'OpenRouter (access many models via one key)', value: 'openrouter' },
|
|
410
|
+
{ name: 'Z.AI Direct (GLM models)', value: 'z-ai' },
|
|
411
|
+
],
|
|
412
|
+
default: settings.visionProvider || provider,
|
|
413
|
+
}
|
|
414
|
+
]);
|
|
415
|
+
|
|
416
|
+
// Ensure the key for the vision provider is available
|
|
417
|
+
const visionKeyVar = visionProvider === 'z-ai' ? 'ZAI_API_KEY' : 'OPENROUTER_API_KEY';
|
|
418
|
+
let visionApiKey = loadEnvVar(visionKeyVar);
|
|
419
|
+
if (visionApiKey) {
|
|
420
|
+
const { keepVisionKey } = await inquirer.prompt([
|
|
421
|
+
{
|
|
422
|
+
type: 'confirm',
|
|
423
|
+
name: 'keepVisionKey',
|
|
424
|
+
message: `A ${visionKeyVar} is already configured. Use it for the vision model?`,
|
|
425
|
+
default: true,
|
|
426
|
+
}
|
|
427
|
+
]);
|
|
428
|
+
if (!keepVisionKey) visionApiKey = null;
|
|
429
|
+
}
|
|
430
|
+
if (!visionApiKey) {
|
|
431
|
+
console.log(chalk.yellow(`No ${visionKeyVar} found — needed for the vision model.`));
|
|
432
|
+
const { newVisionKey } = await inquirer.prompt([
|
|
433
|
+
{
|
|
434
|
+
type: 'password',
|
|
435
|
+
name: 'newVisionKey',
|
|
436
|
+
message: `Enter your ${visionProvider === 'z-ai' ? 'Z.AI' : 'OpenRouter'} API key:`,
|
|
437
|
+
validate: (input) => input.length >= 10 || 'API key must be at least 10 characters long.',
|
|
438
|
+
}
|
|
439
|
+
]);
|
|
440
|
+
visionApiKey = newVisionKey;
|
|
441
|
+
saveEnvVar(visionKeyVar, visionApiKey);
|
|
442
|
+
console.log(chalk.green(`${visionKeyVar} saved.`));
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
// Model selection for vision provider
|
|
446
|
+
let visionModel = settings.visionProvider === visionProvider ? settings.visionModel : null;
|
|
447
|
+
|
|
448
|
+
if (visionModel) {
|
|
449
|
+
const { keepVisionModel } = await inquirer.prompt([
|
|
450
|
+
{
|
|
451
|
+
type: 'list',
|
|
452
|
+
name: 'keepVisionModel',
|
|
453
|
+
message: `Current vision model is ${chalk.yellow(visionModel)}. Keep it or change it?`,
|
|
454
|
+
choices: [
|
|
455
|
+
{ name: 'Keep current vision model', value: true },
|
|
456
|
+
{ name: 'Change vision model', value: false },
|
|
457
|
+
],
|
|
458
|
+
}
|
|
459
|
+
]);
|
|
460
|
+
if (!keepVisionModel) visionModel = null;
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
if (!visionModel) {
|
|
464
|
+
if (visionProvider === 'z-ai') {
|
|
465
|
+
const models = await fetchZaiModels(visionApiKey);
|
|
466
|
+
const choices = models.length > 0
|
|
467
|
+
? models.map(m => ({ name: m.id, value: m.id }))
|
|
468
|
+
: [];
|
|
469
|
+
choices.push({ name: 'Enter model ID manually', value: '__manual__' });
|
|
470
|
+
|
|
471
|
+
const { browsedVisionModel } = await inquirer.prompt([
|
|
472
|
+
{
|
|
473
|
+
type: 'list',
|
|
474
|
+
name: 'browsedVisionModel',
|
|
475
|
+
message: 'Select a Z.AI vision model:',
|
|
476
|
+
choices,
|
|
477
|
+
pageSize: 20,
|
|
478
|
+
}
|
|
479
|
+
]);
|
|
480
|
+
if (browsedVisionModel === '__manual__') {
|
|
481
|
+
const { manualVisionModel } = await inquirer.prompt([
|
|
482
|
+
{
|
|
483
|
+
type: 'input',
|
|
484
|
+
name: 'manualVisionModel',
|
|
485
|
+
message: 'Enter Z.AI model ID (e.g., glm-4v):',
|
|
486
|
+
validate: (input) => input.trim().length > 0 || 'Model ID cannot be empty.',
|
|
487
|
+
}
|
|
488
|
+
]);
|
|
489
|
+
visionModel = manualVisionModel.trim();
|
|
490
|
+
} else {
|
|
491
|
+
visionModel = browsedVisionModel;
|
|
492
|
+
}
|
|
493
|
+
} else {
|
|
494
|
+
// OpenRouter
|
|
495
|
+
const { visionModelSelectionMethod } = await inquirer.prompt([
|
|
496
|
+
{
|
|
497
|
+
type: 'list',
|
|
498
|
+
name: 'visionModelSelectionMethod',
|
|
499
|
+
message: 'How would you like to select the vision model?',
|
|
500
|
+
choices: [
|
|
501
|
+
{ name: 'Browse OpenRouter models', value: 'browse' },
|
|
502
|
+
{ name: 'Enter model ID manually', value: 'manual' },
|
|
503
|
+
],
|
|
504
|
+
}
|
|
505
|
+
]);
|
|
506
|
+
|
|
507
|
+
if (visionModelSelectionMethod === 'manual') {
|
|
508
|
+
const { manualVisionModel } = await inquirer.prompt([
|
|
509
|
+
{
|
|
510
|
+
type: 'input',
|
|
511
|
+
name: 'manualVisionModel',
|
|
512
|
+
message: 'Enter OpenRouter model ID (e.g., openai/gpt-4o):',
|
|
513
|
+
validate: (input) => input.trim().length > 0 || 'Model ID cannot be empty.',
|
|
514
|
+
}
|
|
515
|
+
]);
|
|
516
|
+
visionModel = manualVisionModel.trim();
|
|
517
|
+
} else {
|
|
518
|
+
const models = await fetchOpenRouterModels(visionApiKey);
|
|
519
|
+
if (models.length === 0) {
|
|
520
|
+
console.log(chalk.yellow('Falling back to manual entry due to fetch failure.'));
|
|
521
|
+
const { manualVisionModel } = await inquirer.prompt([
|
|
522
|
+
{
|
|
523
|
+
type: 'input',
|
|
524
|
+
name: 'manualVisionModel',
|
|
525
|
+
message: 'Enter OpenRouter model ID:',
|
|
526
|
+
validate: (input) => input.trim().length > 0 || 'Model ID cannot be empty.',
|
|
527
|
+
}
|
|
528
|
+
]);
|
|
529
|
+
visionModel = manualVisionModel.trim();
|
|
530
|
+
} else {
|
|
531
|
+
models.sort((a, b) => {
|
|
532
|
+
const isFreeA = a.pricing && parseFloat(a.pricing.prompt) === 0 && parseFloat(a.pricing.completion) === 0;
|
|
533
|
+
const isFreeB = b.pricing && parseFloat(b.pricing.prompt) === 0 && parseFloat(b.pricing.completion) === 0;
|
|
534
|
+
if (isFreeA && !isFreeB) return -1;
|
|
535
|
+
if (!isFreeA && isFreeB) return 1;
|
|
536
|
+
return a.id.localeCompare(b.id);
|
|
537
|
+
});
|
|
538
|
+
const choices = models.map(m => {
|
|
539
|
+
const isFree = m.pricing && parseFloat(m.pricing.prompt) === 0 && parseFloat(m.pricing.completion) === 0;
|
|
540
|
+
return { name: `${m.id} ${isFree ? chalk.green('(Free)') : ''}`, value: m.id };
|
|
541
|
+
});
|
|
542
|
+
const { browsedVisionModel } = await inquirer.prompt([
|
|
543
|
+
{
|
|
544
|
+
type: 'list',
|
|
545
|
+
name: 'browsedVisionModel',
|
|
546
|
+
message: 'Select a vision model:',
|
|
547
|
+
choices,
|
|
548
|
+
pageSize: 20,
|
|
549
|
+
}
|
|
550
|
+
]);
|
|
551
|
+
visionModel = browsedVisionModel;
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
}
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
settings.visionProvider = visionProvider;
|
|
558
|
+
settings.visionModel = visionModel;
|
|
559
|
+
saveSettings(settings);
|
|
560
|
+
console.log(chalk.green(`Vision model ${chalk.bold(visionModel)} saved.`));
|
|
561
|
+
} else {
|
|
562
|
+
// Clear vision config if user opts out
|
|
563
|
+
delete settings.visionProvider;
|
|
564
|
+
delete settings.visionModel;
|
|
565
|
+
saveSettings(settings);
|
|
566
|
+
}
|
|
567
|
+
|
|
392
568
|
// --- TELEGRAM CHANNEL STEP (OPTIONAL) ---
|
|
393
569
|
const { configureTelegram } = await inquirer.prompt([
|
|
394
570
|
{
|
package/src/server/agent.js
CHANGED
|
@@ -15,6 +15,41 @@ const LOOP_DETECTION_THRESHOLD = 3;
|
|
|
15
15
|
/**
 * Removes a single surrounding Markdown code fence (``` or ```json) from
 * `text`, if present, and trims the result.
 *
 * @param {string} text - raw model output
 * @returns {string} the fenced payload, or the trimmed input when no fence matches
 */
function stripCodeFence(text) {
  return text.replace(/^```(?:json)?\s*\n([\s\S]*?)\n?```\s*$/, '$1').trim();
}

/**
 * Sanitizes raw model output before it is handed to JSON.parse:
 *   1. Strips a surrounding code fence.
 *   2. Escapes literal control characters (e.g. real newlines in <pre> blocks)
 *      found inside JSON string values — models sometimes forget to escape
 *      \n as \\n.
 *   3. Removes trailing commas before } and ] (JS-valid but JSON-invalid).
 *
 * Trailing commas are only removed OUTSIDE of string values, so text such as
 * "a,}" or "[1, ]" inside a string is left untouched.
 *
 * @param {string} text - raw model output
 * @returns {string} text that is closer to valid JSON (not guaranteed to parse)
 */
function sanitizeJson(text) {
  const source = stripCodeFence(text);
  const controlEscapes = { '\n': '\\n', '\r': '\\r', '\t': '\\t' };
  const whitespace = /\s/;

  // Output is collected per character so a trailing comma can be blanked out
  // in O(1) once the closing bracket that follows it is seen.
  const out = [];
  let inString = false;
  let escaped = false;
  // Index in `out` of the most recent comma outside a string that has only
  // been followed by whitespace so far; -1 when there is no such comma.
  let pendingComma = -1;

  for (let i = 0; i < source.length; i++) {
    const ch = source[i];

    if (inString) {
      if (escaped) {
        // Character following a backslash is copied verbatim.
        out.push(ch);
        escaped = false;
      } else if (ch === '\\') {
        out.push(ch);
        escaped = true;
      } else if (ch === '"') {
        out.push(ch);
        inString = false;
      } else if (ch.charCodeAt(0) < 0x20) {
        // Raw control characters are illegal inside JSON strings.
        out.push(controlEscapes[ch] ?? `\\u${ch.charCodeAt(0).toString(16).padStart(4, '0')}`);
      } else {
        out.push(ch);
      }
      continue;
    }

    // Outside of any string value.
    if (ch === '"') {
      inString = true;
      pendingComma = -1;
    } else if (ch === ',') {
      pendingComma = out.length;
    } else if (ch === '}' || ch === ']') {
      if (pendingComma !== -1) {
        // Drop the trailing comma but keep the whitespace that followed it.
        out[pendingComma] = '';
        pendingComma = -1;
      }
    } else if (!whitespace.test(ch)) {
      pendingComma = -1;
    }
    out.push(ch);
  }

  return out.join('');
}
|
|
18
53
|
const CONSECUTIVE_FAILURE_THRESHOLD = 3;
|
|
19
54
|
const MAX_TOOL_RESULT = 4000;
|
|
20
55
|
|
|
@@ -402,20 +437,20 @@ export async function runAgentLoop(client, config, session, prepareMessages, usa
|
|
|
402
437
|
if (nudgeContent.trim()) {
|
|
403
438
|
content = nudgeContent;
|
|
404
439
|
}
|
|
405
|
-
parsed = JSON.parse(
|
|
440
|
+
parsed = JSON.parse(sanitizeJson(nudgeContent));
|
|
406
441
|
} catch {
|
|
407
442
|
// Fall through to !parsed handler; content may now carry the nudge text
|
|
408
443
|
}
|
|
409
444
|
} else {
|
|
410
445
|
try {
|
|
411
|
-
parsed = JSON.parse(
|
|
446
|
+
parsed = JSON.parse(sanitizeJson(content));
|
|
412
447
|
} catch {
|
|
413
448
|
// Step 1: retry with fallback model
|
|
414
449
|
try {
|
|
415
450
|
const fallbackResult = await callModel(client, config.fallbackModel, preparedMessages, toolDefs);
|
|
416
451
|
accumulateUsage(usageAccum, fallbackResult);
|
|
417
452
|
const fallbackContent = fallbackResult.choices[0]?.message?.content || '';
|
|
418
|
-
parsed = JSON.parse(
|
|
453
|
+
parsed = JSON.parse(sanitizeJson(fallbackContent));
|
|
419
454
|
content = fallbackContent;
|
|
420
455
|
} catch {
|
|
421
456
|
// Step 2: nudge retry via both models
|
|
@@ -424,7 +459,7 @@ export async function runAgentLoop(client, config, session, prepareMessages, usa
|
|
|
424
459
|
const nudgeResult = await callModelWithFallback(client, config, nudgeMessages, toolDefs);
|
|
425
460
|
accumulateUsage(usageAccum, nudgeResult);
|
|
426
461
|
const nudgeContent = nudgeResult.choices[0]?.message?.content || '';
|
|
427
|
-
parsed = JSON.parse(
|
|
462
|
+
parsed = JSON.parse(sanitizeJson(nudgeContent));
|
|
428
463
|
content = nudgeContent;
|
|
429
464
|
} catch {
|
|
430
465
|
// Give up
|
|
@@ -495,14 +530,14 @@ export async function runAgentLoop(client, config, session, prepareMessages, usa
|
|
|
495
530
|
|
|
496
531
|
// Try JSON parse; if it fails, nudge retry (Layer 2)
|
|
497
532
|
try {
|
|
498
|
-
parsedWrapUp = JSON.parse(
|
|
533
|
+
parsedWrapUp = JSON.parse(sanitizeJson(wrapUpContent));
|
|
499
534
|
} catch {
|
|
500
535
|
try {
|
|
501
536
|
const nudgeMessages = [...wrapUpMessages, { role: 'user', content: FORMAT_NUDGE }];
|
|
502
537
|
const nudgeResult = await callModelWithFallback(client, config, nudgeMessages, []);
|
|
503
538
|
accumulateUsage(usageAccum, nudgeResult);
|
|
504
539
|
const nudgeContent = nudgeResult.choices[0]?.message?.content || '';
|
|
505
|
-
parsedWrapUp = JSON.parse(
|
|
540
|
+
parsedWrapUp = JSON.parse(sanitizeJson(nudgeContent));
|
|
506
541
|
wrapUpContent = nudgeContent;
|
|
507
542
|
} catch {
|
|
508
543
|
// Layer 3: use raw text as best-effort response below
|
package/src/server/config.js
CHANGED
|
@@ -54,6 +54,16 @@ export function loadConfig() {
|
|
|
54
54
|
if (!apiKey) throw new Error('OPENROUTER_API_KEY not found. Run `jarvis setup` first.');
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
+
// Vision model (optional) — separate provider/model for one-shot image analysis
|
|
58
|
+
const visionProvider = settings.visionProvider || null;
|
|
59
|
+
const visionModel = settings.visionModel || null;
|
|
60
|
+
let visionApiKey = null;
|
|
61
|
+
if (visionProvider === 'z-ai') {
|
|
62
|
+
visionApiKey = process.env.ZAI_API_KEY || null;
|
|
63
|
+
} else if (visionProvider === 'openrouter') {
|
|
64
|
+
visionApiKey = process.env.OPENROUTER_API_KEY || null;
|
|
65
|
+
}
|
|
66
|
+
|
|
57
67
|
return {
|
|
58
68
|
provider,
|
|
59
69
|
apiKey,
|
|
@@ -67,6 +77,9 @@ export function loadConfig() {
|
|
|
67
77
|
token: process.env.TELEGRAM_BOT_TOKEN || null,
|
|
68
78
|
allowedUserIds: settings.channels?.telegram?.allowedUserIds || [],
|
|
69
79
|
},
|
|
80
|
+
visionProvider,
|
|
81
|
+
visionModel,
|
|
82
|
+
visionApiKey,
|
|
70
83
|
};
|
|
71
84
|
}
|
|
72
85
|
|
|
package/src/server/vision.js
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { createClient } from './provider.js';
|
|
2
|
+
|
|
3
|
+
const DEFAULT_PROMPT = 'Please describe this image in detail. Include all visible text, objects, colors, layout, and any other relevant details.';
|
|
4
|
+
|
|
5
|
+
/**
 * Runs a single, history-free image analysis request against the configured
 * vision model and returns the text it produced.
 *
 * The prompt is built from the caption when one is given (after trimming);
 * otherwise the module's DEFAULT_PROMPT is used.
 *
 * @param {object} attachment - { url: 'data:image/jpeg;base64,...' }
 * @param {string} caption - optional user-provided caption / question
 * @param {object} config - full app config (must include visionProvider, visionModel, visionApiKey)
 * @returns {Promise<string>} - the vision model's description, or the
 *   placeholder '(no description returned)' when the reply has no text
 */
export async function describeImage(attachment, caption, config) {
  const client = createClient({
    provider: config.visionProvider,
    apiKey: config.visionApiKey,
  });

  // Prefer a caption-aware prompt; fall back to the generic one.
  const trimmedCaption = caption?.trim();
  let textPrompt = DEFAULT_PROMPT;
  if (trimmedCaption) {
    textPrompt = `The user sent this image with the following message: "${trimmedCaption}"\n\nPlease describe what you see in the image in detail, paying special attention to anything relevant to their message.`;
  }

  const userMessage = {
    role: 'user',
    content: [
      { type: 'image_url', image_url: { url: attachment.url } },
      { type: 'text', text: textPrompt },
    ],
  };

  const response = await client.chat.completions.create({
    model: config.visionModel,
    messages: [userMessage],
  });

  const description = response.choices[0]?.message?.content?.trim();
  return description || '(no description returned)';
}
|