@ducci/jarvis 1.0.39 → 1.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/telegram.md +42 -2
- package/package.json +1 -1
- package/src/channels/telegram/index.js +65 -0
- package/src/server/agent.js +24 -5
- package/src/server/tools.js +42 -1
package/docs/telegram.md
CHANGED
|
@@ -18,7 +18,7 @@ The channel calls the agent layer directly (no HTTP hop) — it imports and call
|
|
|
18
18
|
|
|
19
19
|
```
|
|
20
20
|
Telegram user
|
|
21
|
-
↓ (text message)
|
|
21
|
+
↓ (text or photo message)
|
|
22
22
|
Telegram Bot API ←→ grammy-runner (long polling)
|
|
23
23
|
↓
|
|
24
24
|
Channel adapter (src/channels/telegram/index.js)
|
|
@@ -246,9 +246,49 @@ await bot.api.setMyCommands([
|
|
|
246
246
|
| User sends `/new`, no session exists yet | No-op, same confirmation sent |
|
|
247
247
|
| Next text message after `/new` | New session created, mapped to `chat_id` |
|
|
248
248
|
|
|
249
|
+
## Photo Support
|
|
250
|
+
|
|
251
|
+
The bot handles incoming photos (`message:photo`) in addition to text. When a user sends a photo, the adapter selects the highest-resolution variant that is at most 800px wide to keep token usage reasonable, downloads it, and passes it to the agent as a base64 data URL together with the optional caption. The agent layer then turns the attachment and caption into a multimodal content array.
|
|
252
|
+
|
|
253
|
+
### Photo selection
|
|
254
|
+
|
|
255
|
+
Telegram always delivers multiple resolutions of every photo as an array of `PhotoSize` objects, sorted ascending by resolution. The adapter picks the last entry with `width <= 800`:
|
|
256
|
+
|
|
257
|
+
```js
|
|
258
|
+
const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
|
|
259
|
+
?? ctx.message.photo[0]; // fallback: smallest if all variants exceed 800px
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
This gives the highest-quality variant that is no wider than 800px. Sending the full-resolution original would consume significantly more tokens for no practical benefit in most tasks.
|
|
263
|
+
|
|
264
|
+
### Download and base64 encoding
|
|
265
|
+
|
|
266
|
+
The image is downloaded immediately at receive time using the Telegram file URL (`https://api.telegram.org/file/bot<token>/<file_path>`) and converted to a base64 data URL (`data:image/jpeg;base64,...`). The data URL is stored directly in the session message, so the image remains available across handoffs and future conversation turns without depending on a Telegram URL that would expire after ~1 hour. Base64 encoding does not cost more tokens than a URL — image token cost is based on pixel dimensions, not transport format.
|
|
267
|
+
|
|
268
|
+
### Agent call
|
|
269
|
+
|
|
270
|
+
Photos are passed to the agent as a multimodal content array instead of a plain string:
|
|
271
|
+
|
|
272
|
+
```js
|
|
273
|
+
const content = [
|
|
274
|
+
{ type: 'image_url', url: fileUrl },
|
|
275
|
+
];
|
|
276
|
+
if (caption) content.push({ type: 'text', text: caption });
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
The agent layer must support receiving `content` as either a string or a content array and pass it through to the model accordingly.
|
|
280
|
+
|
|
281
|
+
### Caption
|
|
282
|
+
|
|
283
|
+
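If the user attaches a caption to the photo (`ctx.message.caption`), it is passed to the agent as the message text and sent as a text block alongside the image. If there is no caption, an empty string is passed as the message text; the agent layer still appends a text block after the image block.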
|
|
284
|
+
|
|
285
|
+
### Unsupported media types
|
|
286
|
+
|
|
287
|
+
Documents, audio, video, stickers, and other non-photo media types are not handled — the bot silently ignores them (same as unauthorized messages).
|
|
288
|
+
|
|
249
289
|
## Non-Goals (v1)
|
|
250
290
|
|
|
251
|
-
- No support for
|
|
291
|
+
- No support for documents, audio, video, or other non-photo media types
|
|
252
292
|
- No inline keyboards or callback queries
|
|
253
293
|
- No group chat support (only private chats)
|
|
254
294
|
- No message editing or deletion handling
|
package/package.json
CHANGED
|
@@ -60,6 +60,71 @@ export async function startTelegramChannel(config) {
|
|
|
60
60
|
await ctx.reply('New session started.');
|
|
61
61
|
});
|
|
62
62
|
|
|
63
|
+
// Handles incoming photo messages from allowed users: downloads the photo,
// forwards it (plus the optional caption) to the agent as a base64 data URL
// attachment, and replies with the agent's response.
bot.on('message:photo', async (ctx) => {
  const userId = ctx.from?.id;
  // Senders that are not on the allow-list are ignored without a reply.
  if (!allowedUserIds.includes(userId)) return;

  const chatId = ctx.chat.id;
  const sessionId = sessions[chatId] || null;

  console.log(`[telegram] incoming photo chat_id=${chatId}`);

  await ctx.api.sendChatAction(chatId, 'typing');
  // Keep re-sending the typing action while the agent is working; failures
  // here are cosmetic and deliberately ignored.
  const typingInterval = setInterval(() => {
    ctx.api.sendChatAction(chatId, 'typing').catch(() => {});
  }, 4000);

  // A single outer try/finally guarantees the interval is cleared on every
  // exit path, including a throw from the session bookkeeping below.
  try {
    let result;
    try {
      // Largest variant that is at most 800px wide; fall back to the first
      // entry when every variant is wider than that.
      const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
        ?? ctx.message.photo[0];
      const file = await ctx.api.getFile(photo.file_id);
      const fileUrl = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
      const imgResponse = await fetch(fileUrl);
      if (!imgResponse.ok) {
        // Without this check an HTTP error body would be base64-encoded and
        // handed to the model as if it were image data. The URL is left out
        // of the message on purpose: it embeds the bot token, and this
        // message is both logged and sent back to the user.
        throw new Error(`Failed to download photo from Telegram (HTTP ${imgResponse.status})`);
      }
      const buffer = await imgResponse.arrayBuffer();
      const base64 = Buffer.from(buffer).toString('base64');
      const dataUrl = `data:image/jpeg;base64,${base64}`;
      const caption = ctx.message.caption || '';
      result = await handleChat(config, sessionId, caption, [{ url: dataUrl }]);
    } catch (e) {
      console.error(`[telegram] agent error chat_id=${chatId}: ${e.message}`);
      const errText = e.message
        ? `Sorry, something went wrong: ${e.message}`
        : 'Sorry, something went wrong. Please try again.';
      await ctx.reply(errText).catch(() => {});
      return;
    }

    // First message in this chat: remember the session the agent created.
    if (!sessions[chatId]) {
      sessions[chatId] = result.sessionId;
      save(sessions);
      console.log(`[telegram] session created sessionId=${result.sessionId.slice(0, 8)}`);
    }

    try {
      // Replies longer than this are sent as several consecutive messages.
      const MAX_TG = 4096;
      const rawResponse = typeof result.response === 'string'
        ? result.response
        : result.response != null ? JSON.stringify(result.response, null, 2) : '';
      const text = rawResponse.trim()
        || 'The agent encountered an error and could not produce a response. Please try again.';
      if (text.length <= MAX_TG) {
        await ctx.reply(text);
      } else {
        for (let i = 0; i < text.length; i += MAX_TG) {
          await ctx.reply(text.slice(i, i + MAX_TG));
        }
      }
      console.log(`[telegram] response sent chat_id=${chatId} length=${text.length}`);
    } catch (e) {
      console.error(`[telegram] delivery error chat_id=${chatId}: ${e.message}`);
      await ctx.reply('Sorry, something went wrong sending the response. Please try again.').catch(() => {});
    }
  } finally {
    clearInterval(typingInterval);
  }
});
|
|
127
|
+
|
|
63
128
|
bot.on('message:text', async (ctx) => {
|
|
64
129
|
const userId = ctx.from?.id;
|
|
65
130
|
|
package/src/server/agent.js
CHANGED
|
@@ -50,6 +50,12 @@ async function callModel(client, model, messages, tools) {
|
|
|
50
50
|
return await client.chat.completions.create(params);
|
|
51
51
|
}
|
|
52
52
|
|
|
53
|
+
/**
 * Reports whether a model failure was caused by the model rejecting image input.
 *
 * Looks at the `message` of the `primary` and `fallback` entries of the given
 * error detail and checks, case-insensitively, for the phrase "image input".
 *
 * @param {{primary?: {message?: unknown}, fallback?: {message?: unknown}} | null | undefined} apiErrors
 *   Error detail of a failed run; may be missing entirely.
 * @returns {boolean} true if either message mentions "image input".
 */
function isImageUnsupportedError(apiErrors) {
  if (!apiErrors) return false;
  return [apiErrors.primary?.message, apiErrors.fallback?.message]
    // Only inspect real strings: a structured (object/number) message would
    // otherwise throw on toLowerCase() while an error is already being handled.
    .some(m => typeof m === 'string' && m.toLowerCase().includes('image input'));
}
|
|
58
|
+
|
|
53
59
|
function extractApiError(err, model) {
|
|
54
60
|
return {
|
|
55
61
|
model,
|
|
@@ -483,7 +489,7 @@ export async function withSessionLock(sessionId, fn) {
|
|
|
483
489
|
* Main entry point: handles a single POST /api/chat request.
|
|
484
490
|
* Manages the handoff loop across multiple agent runs.
|
|
485
491
|
*/
|
|
486
|
-
export async function handleChat(config, requestSessionId, userMessage) {
|
|
492
|
+
export async function handleChat(config, requestSessionId, userMessage, attachments = []) {
|
|
487
493
|
const sessionId = requestSessionId || crypto.randomUUID();
|
|
488
494
|
|
|
489
495
|
// Serialize concurrent requests for the same session. Each request registers
|
|
@@ -497,7 +503,7 @@ export async function handleChat(config, requestSessionId, userMessage) {
|
|
|
497
503
|
await previous;
|
|
498
504
|
|
|
499
505
|
try {
|
|
500
|
-
return await _runHandleChat(config, sessionId, userMessage);
|
|
506
|
+
return await _runHandleChat(config, sessionId, userMessage, attachments);
|
|
501
507
|
} finally {
|
|
502
508
|
releaseLock();
|
|
503
509
|
// Clean up only if no one else has queued behind us
|
|
@@ -511,7 +517,7 @@ export async function handleChat(config, requestSessionId, userMessage) {
|
|
|
511
517
|
* The actual chat logic, extracted so handleChat can wrap it cleanly with the
|
|
512
518
|
* session lock.
|
|
513
519
|
*/
|
|
514
|
-
async function _runHandleChat(config, sessionId, userMessage) {
|
|
520
|
+
async function _runHandleChat(config, sessionId, userMessage, attachments = []) {
|
|
515
521
|
const client = createClient(config);
|
|
516
522
|
|
|
517
523
|
const systemPromptTemplate = loadSystemPrompt();
|
|
@@ -545,8 +551,18 @@ async function _runHandleChat(config, sessionId, userMessage) {
|
|
|
545
551
|
userMessageWithContext += note;
|
|
546
552
|
}
|
|
547
553
|
|
|
548
|
-
// Append user message and reset handoff state
|
|
549
|
-
|
|
554
|
+
// Append user message and reset handoff state.
|
|
555
|
+
// If attachments (e.g. images) are present, build a multimodal content array.
|
|
556
|
+
let userContent;
|
|
557
|
+
if (attachments && attachments.length > 0) {
|
|
558
|
+
userContent = [
|
|
559
|
+
...attachments.map(a => ({ type: 'image_url', image_url: { url: a.url } })),
|
|
560
|
+
{ type: 'text', text: userMessageWithContext },
|
|
561
|
+
];
|
|
562
|
+
} else {
|
|
563
|
+
userContent = userMessageWithContext;
|
|
564
|
+
}
|
|
565
|
+
session.messages.push({ role: 'user', content: userContent });
|
|
550
566
|
session.metadata.handoffCount = 0;
|
|
551
567
|
session.metadata.failedApproaches = [];
|
|
552
568
|
session.metadata.lastCheckpointRemaining = null;
|
|
@@ -630,6 +646,9 @@ async function _runHandleChat(config, sessionId, userMessage) {
|
|
|
630
646
|
// windows). The synthetic note is sufficient context; tool results are preserved
|
|
631
647
|
// in the JSONL log and accessible via read_session_log.
|
|
632
648
|
if (finalStatus === 'model_error' || finalStatus === 'format_error') {
|
|
649
|
+
if (finalStatus === 'model_error' && isImageUnsupportedError(run.errorDetail)) {
|
|
650
|
+
finalResponse = 'This model does not support image input. Please switch to a multimodal model (e.g. claude-3.5-sonnet, gpt-4o) in settings.';
|
|
651
|
+
}
|
|
633
652
|
session.messages.splice(runStartIndex, session.messages.length - runStartIndex);
|
|
634
653
|
const errorDetail = run.errorDetail ? ` Error detail: ${JSON.stringify(run.errorDetail)}` : '';
|
|
635
654
|
session.messages.push({
|
package/src/server/tools.js
CHANGED
|
@@ -281,7 +281,7 @@ const SEED_TOOLS = {
|
|
|
281
281
|
type: 'function',
|
|
282
282
|
function: {
|
|
283
283
|
name: 'write_file',
|
|
284
|
-
description: '
|
|
284
|
+
description: 'Create a new file or completely overwrite an existing file. Content is written exactly as provided — dollar signs, backslashes, and special characters are preserved without modification. Always prefer this over exec+echo, exec+printf, or exec+heredoc. For shell scripts, pass mode: "755". For targeted edits to an existing file (changing a specific line or section), use edit_file instead.',
|
|
285
285
|
parameters: {
|
|
286
286
|
type: 'object',
|
|
287
287
|
properties: {
|
|
@@ -313,6 +313,47 @@ const SEED_TOOLS = {
|
|
|
313
313
|
return { status: 'ok', path: targetPath, bytes, mode: args.mode || '644' };
|
|
314
314
|
`,
|
|
315
315
|
},
|
|
316
|
+
edit_file: {
|
|
317
|
+
definition: {
|
|
318
|
+
type: 'function',
|
|
319
|
+
function: {
|
|
320
|
+
name: 'edit_file',
|
|
321
|
+
description: 'Replace an exact string in a file with a new string. Use this for targeted edits — you only need to provide the specific section to change, not the whole file. old_string must match exactly (including whitespace and indentation) and must appear exactly once in the file. If it appears more than once, add more surrounding context to make it unique. For creating new files or rewriting entire files, use write_file instead.',
|
|
322
|
+
parameters: {
|
|
323
|
+
type: 'object',
|
|
324
|
+
properties: {
|
|
325
|
+
path: {
|
|
326
|
+
type: 'string',
|
|
327
|
+
description: 'Absolute or relative path to the file to edit.',
|
|
328
|
+
},
|
|
329
|
+
old_string: {
|
|
330
|
+
type: 'string',
|
|
331
|
+
description: 'The exact string to find and replace. Must match character-for-character including whitespace and indentation.',
|
|
332
|
+
},
|
|
333
|
+
new_string: {
|
|
334
|
+
type: 'string',
|
|
335
|
+
description: 'The string to replace old_string with.',
|
|
336
|
+
},
|
|
337
|
+
},
|
|
338
|
+
required: ['path', 'old_string', 'new_string'],
|
|
339
|
+
},
|
|
340
|
+
},
|
|
341
|
+
},
|
|
342
|
+
code: `
|
|
343
|
+
const targetPath = path.resolve(args.path);
|
|
344
|
+
const content = await fs.promises.readFile(targetPath, 'utf8');
|
|
345
|
+
const count = content.split(args.old_string).length - 1;
|
|
346
|
+
if (count === 0) {
|
|
347
|
+
return { status: 'error', error: 'old_string not found in file. Check for exact whitespace and indentation match.' };
|
|
348
|
+
}
|
|
349
|
+
if (count > 1) {
|
|
350
|
+
return { status: 'error', error: \`old_string found \${count} times. Add more surrounding context to make it unique.\` };
|
|
351
|
+
}
|
|
352
|
+
const updated = content.replace(args.old_string, args.new_string);
|
|
353
|
+
await fs.promises.writeFile(targetPath, updated, 'utf8');
|
|
354
|
+
return { status: 'ok', path: targetPath };
|
|
355
|
+
`,
|
|
356
|
+
},
|
|
316
357
|
get_current_time: {
|
|
317
358
|
definition: {
|
|
318
359
|
type: 'function',
|