@ducci/jarvis 1.0.39 → 1.0.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/telegram.md +42 -2
- package/package.json +1 -1
- package/src/channels/telegram/index.js +65 -0
- package/src/server/agent.js +15 -5
- package/src/server/tools.js +42 -1
package/docs/telegram.md
CHANGED
|
@@ -18,7 +18,7 @@ The channel calls the agent layer directly (no HTTP hop) — it imports and call
|
|
|
18
18
|
|
|
19
19
|
```
|
|
20
20
|
Telegram user
|
|
21
|
-
↓ (text message)
|
|
21
|
+
↓ (text or photo message)
|
|
22
22
|
Telegram Bot API ←→ grammy-runner (long polling)
|
|
23
23
|
↓
|
|
24
24
|
Channel adapter (src/channels/telegram/index.js)
|
|
@@ -246,9 +246,49 @@ await bot.api.setMyCommands([
|
|
|
246
246
|
| User sends `/new`, no session exists yet | No-op, same confirmation sent |
|
|
247
247
|
| Next text message after `/new` | New session created, mapped to `chat_id` |
|
|
248
248
|
|
|
249
|
+
## Photo Support
|
|
250
|
+
|
|
251
|
+
The bot handles incoming photos (`message:photo`) in addition to text. When a user sends a photo, the adapter selects the highest-resolution variant that is at most 800px wide to keep token usage reasonable, downloads it, and passes the image (as a base64 data URL) together with the optional caption to the agent as a multimodal content block.
|
|
252
|
+
|
|
253
|
+
### Photo selection
|
|
254
|
+
|
|
255
|
+
Telegram always delivers multiple resolutions of every photo as an array of `PhotoSize` objects, sorted ascending by resolution. The adapter picks the last entry with `width <= 800`:
|
|
256
|
+
|
|
257
|
+
```js
|
|
258
|
+
const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
|
|
259
|
+
?? ctx.message.photo[0]; // fallback: smallest if all variants exceed 800px
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
This gives the highest quality image at or below the 800px threshold. Sending the full-resolution original would consume significantly more tokens for no practical benefit in most tasks.
|
|
263
|
+
|
|
264
|
+
### Download and base64 encoding
|
|
265
|
+
|
|
266
|
+
The image is downloaded immediately at receive time using the Telegram file URL (`https://api.telegram.org/file/bot<token>/<file_path>`) and converted to a base64 data URL (`data:image/jpeg;base64,...`). The data URL is stored directly in the session message, so the image remains available across handoffs and future conversation turns without depending on a Telegram URL that would expire after ~1 hour. Base64 encoding does not cost more tokens than a URL — image token cost is based on pixel dimensions, not transport format.
|
|
267
|
+
|
|
268
|
+
### Agent call
|
|
269
|
+
|
|
270
|
+
Photos are passed to the agent as a multimodal content array instead of a plain string:
|
|
271
|
+
|
|
272
|
+
```js
|
|
273
|
+
const content = [
|
|
274
|
+
{ type: 'image_url', url: fileUrl },
|
|
275
|
+
];
|
|
276
|
+
if (caption) content.push({ type: 'text', text: caption });
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
The agent layer must support receiving `content` as either a string or a content array and pass it through to the model accordingly.
|
|
280
|
+
|
|
281
|
+
### Caption
|
|
282
|
+
|
|
283
|
+
If the user attaches a caption to the photo (`ctx.message.caption`), it is included as a text block alongside the image. If there is no caption, only the image block is sent.
|
|
284
|
+
|
|
285
|
+
### Unsupported media types
|
|
286
|
+
|
|
287
|
+
Documents, audio, video, stickers, and other non-photo media types are not handled — the bot silently ignores them (same as unauthorized messages).
|
|
288
|
+
|
|
249
289
|
## Non-Goals (v1)
|
|
250
290
|
|
|
251
|
-
- No support for
|
|
291
|
+
- No support for documents, audio, video, or other non-photo media types
|
|
252
292
|
- No inline keyboards or callback queries
|
|
253
293
|
- No group chat support (only private chats)
|
|
254
294
|
- No message editing or deletion handling
|
package/package.json
CHANGED
|
@@ -60,6 +60,71 @@ export async function startTelegramChannel(config) {
|
|
|
60
60
|
await ctx.reply('New session started.');
|
|
61
61
|
});
|
|
62
62
|
|
|
63
|
+
bot.on('message:photo', async (ctx) => {
|
|
64
|
+
const userId = ctx.from?.id;
|
|
65
|
+
if (!allowedUserIds.includes(userId)) return;
|
|
66
|
+
|
|
67
|
+
const chatId = ctx.chat.id;
|
|
68
|
+
const sessionId = sessions[chatId] || null;
|
|
69
|
+
|
|
70
|
+
console.log(`[telegram] incoming photo chat_id=${chatId}`);
|
|
71
|
+
|
|
72
|
+
await ctx.api.sendChatAction(chatId, 'typing');
|
|
73
|
+
const typingInterval = setInterval(() => {
|
|
74
|
+
ctx.api.sendChatAction(chatId, 'typing').catch(() => {});
|
|
75
|
+
}, 4000);
|
|
76
|
+
|
|
77
|
+
let result;
|
|
78
|
+
try {
|
|
79
|
+
const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
|
|
80
|
+
?? ctx.message.photo[0];
|
|
81
|
+
const file = await ctx.api.getFile(photo.file_id);
|
|
82
|
+
const fileUrl = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
|
|
83
|
+
const imgResponse = await fetch(fileUrl);
|
|
84
|
+
const buffer = await imgResponse.arrayBuffer();
|
|
85
|
+
const base64 = Buffer.from(buffer).toString('base64');
|
|
86
|
+
const dataUrl = `data:image/jpeg;base64,${base64}`;
|
|
87
|
+
const caption = ctx.message.caption || '';
|
|
88
|
+
result = await handleChat(config, sessionId, caption, [{ url: dataUrl }]);
|
|
89
|
+
} catch (e) {
|
|
90
|
+
console.error(`[telegram] agent error chat_id=${chatId}: ${e.message}`);
|
|
91
|
+
const errText = e.message
|
|
92
|
+
? `Sorry, something went wrong: ${e.message}`
|
|
93
|
+
: 'Sorry, something went wrong. Please try again.';
|
|
94
|
+
await ctx.reply(errText).catch(() => {});
|
|
95
|
+
clearInterval(typingInterval);
|
|
96
|
+
return;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
if (!sessions[chatId]) {
|
|
100
|
+
sessions[chatId] = result.sessionId;
|
|
101
|
+
save(sessions);
|
|
102
|
+
console.log(`[telegram] session created sessionId=${result.sessionId.slice(0, 8)}`);
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
try {
|
|
106
|
+
const MAX_TG = 4096;
|
|
107
|
+
const rawResponse = typeof result.response === 'string'
|
|
108
|
+
? result.response
|
|
109
|
+
: result.response != null ? JSON.stringify(result.response, null, 2) : '';
|
|
110
|
+
const text = rawResponse.trim()
|
|
111
|
+
|| 'The agent encountered an error and could not produce a response. Please try again.';
|
|
112
|
+
if (text.length <= MAX_TG) {
|
|
113
|
+
await ctx.reply(text);
|
|
114
|
+
} else {
|
|
115
|
+
for (let i = 0; i < text.length; i += MAX_TG) {
|
|
116
|
+
await ctx.reply(text.slice(i, i + MAX_TG));
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
console.log(`[telegram] response sent chat_id=${chatId} length=${text.length}`);
|
|
120
|
+
} catch (e) {
|
|
121
|
+
console.error(`[telegram] delivery error chat_id=${chatId}: ${e.message}`);
|
|
122
|
+
await ctx.reply('Sorry, something went wrong sending the response. Please try again.').catch(() => {});
|
|
123
|
+
} finally {
|
|
124
|
+
clearInterval(typingInterval);
|
|
125
|
+
}
|
|
126
|
+
});
|
|
127
|
+
|
|
63
128
|
bot.on('message:text', async (ctx) => {
|
|
64
129
|
const userId = ctx.from?.id;
|
|
65
130
|
|
package/src/server/agent.js
CHANGED
|
@@ -483,7 +483,7 @@ export async function withSessionLock(sessionId, fn) {
|
|
|
483
483
|
* Main entry point: handles a single POST /api/chat request.
|
|
484
484
|
* Manages the handoff loop across multiple agent runs.
|
|
485
485
|
*/
|
|
486
|
-
export async function handleChat(config, requestSessionId, userMessage) {
|
|
486
|
+
export async function handleChat(config, requestSessionId, userMessage, attachments = []) {
|
|
487
487
|
const sessionId = requestSessionId || crypto.randomUUID();
|
|
488
488
|
|
|
489
489
|
// Serialize concurrent requests for the same session. Each request registers
|
|
@@ -497,7 +497,7 @@ export async function handleChat(config, requestSessionId, userMessage) {
|
|
|
497
497
|
await previous;
|
|
498
498
|
|
|
499
499
|
try {
|
|
500
|
-
return await _runHandleChat(config, sessionId, userMessage);
|
|
500
|
+
return await _runHandleChat(config, sessionId, userMessage, attachments);
|
|
501
501
|
} finally {
|
|
502
502
|
releaseLock();
|
|
503
503
|
// Clean up only if no one else has queued behind us
|
|
@@ -511,7 +511,7 @@ export async function handleChat(config, requestSessionId, userMessage) {
|
|
|
511
511
|
* The actual chat logic, extracted so handleChat can wrap it cleanly with the
|
|
512
512
|
* session lock.
|
|
513
513
|
*/
|
|
514
|
-
async function _runHandleChat(config, sessionId, userMessage) {
|
|
514
|
+
async function _runHandleChat(config, sessionId, userMessage, attachments = []) {
|
|
515
515
|
const client = createClient(config);
|
|
516
516
|
|
|
517
517
|
const systemPromptTemplate = loadSystemPrompt();
|
|
@@ -545,8 +545,18 @@ async function _runHandleChat(config, sessionId, userMessage) {
|
|
|
545
545
|
userMessageWithContext += note;
|
|
546
546
|
}
|
|
547
547
|
|
|
548
|
-
// Append user message and reset handoff state
|
|
549
|
-
|
|
548
|
+
// Append user message and reset handoff state.
|
|
549
|
+
// If attachments (e.g. images) are present, build a multimodal content array.
|
|
550
|
+
let userContent;
|
|
551
|
+
if (attachments && attachments.length > 0) {
|
|
552
|
+
userContent = [
|
|
553
|
+
...attachments.map(a => ({ type: 'image_url', image_url: { url: a.url } })),
|
|
554
|
+
{ type: 'text', text: userMessageWithContext },
|
|
555
|
+
];
|
|
556
|
+
} else {
|
|
557
|
+
userContent = userMessageWithContext;
|
|
558
|
+
}
|
|
559
|
+
session.messages.push({ role: 'user', content: userContent });
|
|
550
560
|
session.metadata.handoffCount = 0;
|
|
551
561
|
session.metadata.failedApproaches = [];
|
|
552
562
|
session.metadata.lastCheckpointRemaining = null;
|
package/src/server/tools.js
CHANGED
|
@@ -281,7 +281,7 @@ const SEED_TOOLS = {
|
|
|
281
281
|
type: 'function',
|
|
282
282
|
function: {
|
|
283
283
|
name: 'write_file',
|
|
284
|
-
description: '
|
|
284
|
+
description: 'Create a new file or completely overwrite an existing file. Content is written exactly as provided — dollar signs, backslashes, and special characters are preserved without modification. Always prefer this over exec+echo, exec+printf, or exec+heredoc. For shell scripts, pass mode: "755". For targeted edits to an existing file (changing a specific line or section), use edit_file instead.',
|
|
285
285
|
parameters: {
|
|
286
286
|
type: 'object',
|
|
287
287
|
properties: {
|
|
@@ -313,6 +313,47 @@ const SEED_TOOLS = {
|
|
|
313
313
|
return { status: 'ok', path: targetPath, bytes, mode: args.mode || '644' };
|
|
314
314
|
`,
|
|
315
315
|
},
|
|
316
|
+
edit_file: {
|
|
317
|
+
definition: {
|
|
318
|
+
type: 'function',
|
|
319
|
+
function: {
|
|
320
|
+
name: 'edit_file',
|
|
321
|
+
description: 'Replace an exact string in a file with a new string. Use this for targeted edits — you only need to provide the specific section to change, not the whole file. old_string must match exactly (including whitespace and indentation) and must appear exactly once in the file. If it appears more than once, add more surrounding context to make it unique. For creating new files or rewriting entire files, use write_file instead.',
|
|
322
|
+
parameters: {
|
|
323
|
+
type: 'object',
|
|
324
|
+
properties: {
|
|
325
|
+
path: {
|
|
326
|
+
type: 'string',
|
|
327
|
+
description: 'Absolute or relative path to the file to edit.',
|
|
328
|
+
},
|
|
329
|
+
old_string: {
|
|
330
|
+
type: 'string',
|
|
331
|
+
description: 'The exact string to find and replace. Must match character-for-character including whitespace and indentation.',
|
|
332
|
+
},
|
|
333
|
+
new_string: {
|
|
334
|
+
type: 'string',
|
|
335
|
+
description: 'The string to replace old_string with.',
|
|
336
|
+
},
|
|
337
|
+
},
|
|
338
|
+
required: ['path', 'old_string', 'new_string'],
|
|
339
|
+
},
|
|
340
|
+
},
|
|
341
|
+
},
|
|
342
|
+
code: `
|
|
343
|
+
const targetPath = path.resolve(args.path);
|
|
344
|
+
const content = await fs.promises.readFile(targetPath, 'utf8');
|
|
345
|
+
const count = content.split(args.old_string).length - 1;
|
|
346
|
+
if (count === 0) {
|
|
347
|
+
return { status: 'error', error: 'old_string not found in file. Check for exact whitespace and indentation match.' };
|
|
348
|
+
}
|
|
349
|
+
if (count > 1) {
|
|
350
|
+
return { status: 'error', error: \`old_string found \${count} times. Add more surrounding context to make it unique.\` };
|
|
351
|
+
}
|
|
352
|
+
const updated = content.replace(args.old_string, args.new_string);
|
|
353
|
+
await fs.promises.writeFile(targetPath, updated, 'utf8');
|
|
354
|
+
return { status: 'ok', path: targetPath };
|
|
355
|
+
`,
|
|
356
|
+
},
|
|
316
357
|
get_current_time: {
|
|
317
358
|
definition: {
|
|
318
359
|
type: 'function',
|