@ducci/jarvis 1.0.39 → 1.0.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/docs/telegram.md CHANGED
@@ -18,7 +18,7 @@ The channel calls the agent layer directly (no HTTP hop) — it imports and call
18
18
 
19
19
  ```
20
20
  Telegram user
21
- ↓ (text message)
21
+ ↓ (text or photo message)
22
22
  Telegram Bot API ←→ grammy-runner (long polling)
23
23
 
24
24
  Channel adapter (src/channels/telegram/index.js)
@@ -246,9 +246,49 @@ await bot.api.setMyCommands([
246
246
  | User sends `/new`, no session exists yet | No-op, same confirmation sent |
247
247
  | Next text message after `/new` | New session created, mapped to `chat_id` |
248
248
 
249
+ ## Photo Support
250
+
251
+ The bot handles incoming photos (`message:photo`) in addition to text. When a user sends a photo, the adapter selects the highest-resolution variant that is at most 800px wide to keep token usage reasonable, downloads it, and passes the image (as a base64 data URL) together with the optional caption to the agent as multimodal content.
252
+
253
+ ### Photo selection
254
+
255
+ Telegram always delivers multiple resolutions of every photo as an array of `PhotoSize` objects, sorted ascending by resolution. The adapter picks the last entry with `width <= 800`:
256
+
257
+ ```js
258
+ const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
259
+ ?? ctx.message.photo[0]; // fallback: smallest if all variants exceed 800px
260
+ ```
261
+
262
+ This gives the highest-quality image at or below the 800px threshold. Sending the full-resolution original would consume significantly more tokens for no practical benefit in most tasks.
263
+
264
+ ### Download and base64 encoding
265
+
266
+ The image is downloaded immediately at receive time using the Telegram file URL (`https://api.telegram.org/file/bot<token>/<file_path>`) and converted to a base64 data URL (`data:image/jpeg;base64,...`). The data URL is stored directly in the session message, so the image remains available across handoffs and future conversation turns without depending on a Telegram URL that would expire after ~1 hour. Base64 encoding does not cost more tokens than a URL — image token cost is based on pixel dimensions, not transport format.
267
+
268
+ ### Agent call
269
+
270
+ Photos are passed to the agent as a multimodal content array instead of a plain string:
271
+
272
+ ```js
273
+ const content = [
274
+ { type: 'image_url', image_url: { url: dataUrl } },
275
+ ];
276
+ if (caption) content.push({ type: 'text', text: caption });
277
+ ```
278
+
279
+ The agent layer must support receiving `content` as either a string or a content array and pass it through to the model accordingly.
280
+
281
+ ### Caption
282
+
283
+ If the user attaches a caption to the photo (`ctx.message.caption`), it is included as a text block alongside the image. If there is no caption, only the image block is sent.
284
+
285
+ ### Unsupported media types
286
+
287
+ Documents, audio, video, stickers, and other non-photo media types are not handled — the bot silently ignores them (same as unauthorized messages).
288
+
249
289
  ## Non-Goals (v1)
250
290
 
251
- - No support for photos, files, or other media types (text only)
291
+ - No support for documents, audio, video, or other non-photo media types
252
292
  - No inline keyboards or callback queries
253
293
  - No group chat support (only private chats)
254
294
  - No message editing or deletion handling
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ducci/jarvis",
3
- "version": "1.0.39",
3
+ "version": "1.0.41",
4
4
  "description": "A fully automated agent system that lives on a server.",
5
5
  "main": "./src/index.js",
6
6
  "type": "module",
@@ -60,6 +60,71 @@ export async function startTelegramChannel(config) {
60
60
  await ctx.reply('New session started.');
61
61
  });
62
62
 
63
+ bot.on('message:photo', async (ctx) => {
64
+ const userId = ctx.from?.id;
65
+ if (!allowedUserIds.includes(userId)) return;
66
+
67
+ const chatId = ctx.chat.id;
68
+ const sessionId = sessions[chatId] || null;
69
+
70
+ console.log(`[telegram] incoming photo chat_id=${chatId}`);
71
+
72
+ await ctx.api.sendChatAction(chatId, 'typing');
73
+ const typingInterval = setInterval(() => {
74
+ ctx.api.sendChatAction(chatId, 'typing').catch(() => {});
75
+ }, 4000);
76
+
77
+ let result;
78
+ try {
79
+ const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
80
+ ?? ctx.message.photo[0];
81
+ const file = await ctx.api.getFile(photo.file_id);
82
+ const fileUrl = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
83
+ const imgResponse = await fetch(fileUrl);
84
+ const buffer = await imgResponse.arrayBuffer();
85
+ const base64 = Buffer.from(buffer).toString('base64');
86
+ const dataUrl = `data:image/jpeg;base64,${base64}`;
87
+ const caption = ctx.message.caption || '';
88
+ result = await handleChat(config, sessionId, caption, [{ url: dataUrl }]);
89
+ } catch (e) {
90
+ console.error(`[telegram] agent error chat_id=${chatId}: ${e.message}`);
91
+ const errText = e.message
92
+ ? `Sorry, something went wrong: ${e.message}`
93
+ : 'Sorry, something went wrong. Please try again.';
94
+ await ctx.reply(errText).catch(() => {});
95
+ clearInterval(typingInterval);
96
+ return;
97
+ }
98
+
99
+ if (!sessions[chatId]) {
100
+ sessions[chatId] = result.sessionId;
101
+ save(sessions);
102
+ console.log(`[telegram] session created sessionId=${result.sessionId.slice(0, 8)}`);
103
+ }
104
+
105
+ try {
106
+ const MAX_TG = 4096;
107
+ const rawResponse = typeof result.response === 'string'
108
+ ? result.response
109
+ : result.response != null ? JSON.stringify(result.response, null, 2) : '';
110
+ const text = rawResponse.trim()
111
+ || 'The agent encountered an error and could not produce a response. Please try again.';
112
+ if (text.length <= MAX_TG) {
113
+ await ctx.reply(text);
114
+ } else {
115
+ for (let i = 0; i < text.length; i += MAX_TG) {
116
+ await ctx.reply(text.slice(i, i + MAX_TG));
117
+ }
118
+ }
119
+ console.log(`[telegram] response sent chat_id=${chatId} length=${text.length}`);
120
+ } catch (e) {
121
+ console.error(`[telegram] delivery error chat_id=${chatId}: ${e.message}`);
122
+ await ctx.reply('Sorry, something went wrong sending the response. Please try again.').catch(() => {});
123
+ } finally {
124
+ clearInterval(typingInterval);
125
+ }
126
+ });
127
+
63
128
  bot.on('message:text', async (ctx) => {
64
129
  const userId = ctx.from?.id;
65
130
 
@@ -50,6 +50,12 @@ async function callModel(client, model, messages, tools) {
50
50
  return await client.chat.completions.create(params);
51
51
  }
52
52
 
53
+ function isImageUnsupportedError(apiErrors) {
54
+ if (!apiErrors) return false;
55
+ return [apiErrors.primary?.message, apiErrors.fallback?.message]
56
+ .some(m => m?.toLowerCase().includes('image input'));
57
+ }
58
+
53
59
  function extractApiError(err, model) {
54
60
  return {
55
61
  model,
@@ -483,7 +489,7 @@ export async function withSessionLock(sessionId, fn) {
483
489
  * Main entry point: handles a single POST /api/chat request.
484
490
  * Manages the handoff loop across multiple agent runs.
485
491
  */
486
- export async function handleChat(config, requestSessionId, userMessage) {
492
+ export async function handleChat(config, requestSessionId, userMessage, attachments = []) {
487
493
  const sessionId = requestSessionId || crypto.randomUUID();
488
494
 
489
495
  // Serialize concurrent requests for the same session. Each request registers
@@ -497,7 +503,7 @@ export async function handleChat(config, requestSessionId, userMessage) {
497
503
  await previous;
498
504
 
499
505
  try {
500
- return await _runHandleChat(config, sessionId, userMessage);
506
+ return await _runHandleChat(config, sessionId, userMessage, attachments);
501
507
  } finally {
502
508
  releaseLock();
503
509
  // Clean up only if no one else has queued behind us
@@ -511,7 +517,7 @@ export async function handleChat(config, requestSessionId, userMessage) {
511
517
  * The actual chat logic, extracted so handleChat can wrap it cleanly with the
512
518
  * session lock.
513
519
  */
514
- async function _runHandleChat(config, sessionId, userMessage) {
520
+ async function _runHandleChat(config, sessionId, userMessage, attachments = []) {
515
521
  const client = createClient(config);
516
522
 
517
523
  const systemPromptTemplate = loadSystemPrompt();
@@ -545,8 +551,18 @@ async function _runHandleChat(config, sessionId, userMessage) {
545
551
  userMessageWithContext += note;
546
552
  }
547
553
 
548
- // Append user message and reset handoff state
549
- session.messages.push({ role: 'user', content: userMessageWithContext });
554
+ // Append user message and reset handoff state.
555
+ // If attachments (e.g. images) are present, build a multimodal content array.
556
+ let userContent;
557
+ if (attachments && attachments.length > 0) {
558
+ userContent = [
559
+ ...attachments.map(a => ({ type: 'image_url', image_url: { url: a.url } })),
560
+ { type: 'text', text: userMessageWithContext },
561
+ ];
562
+ } else {
563
+ userContent = userMessageWithContext;
564
+ }
565
+ session.messages.push({ role: 'user', content: userContent });
550
566
  session.metadata.handoffCount = 0;
551
567
  session.metadata.failedApproaches = [];
552
568
  session.metadata.lastCheckpointRemaining = null;
@@ -630,6 +646,9 @@ async function _runHandleChat(config, sessionId, userMessage) {
630
646
  // windows). The synthetic note is sufficient context; tool results are preserved
631
647
  // in the JSONL log and accessible via read_session_log.
632
648
  if (finalStatus === 'model_error' || finalStatus === 'format_error') {
649
+ if (finalStatus === 'model_error' && isImageUnsupportedError(run.errorDetail)) {
650
+ finalResponse = 'This model does not support image input. Please switch to a multimodal model (e.g. claude-3.5-sonnet, gpt-4o) in settings.';
651
+ }
633
652
  session.messages.splice(runStartIndex, session.messages.length - runStartIndex);
634
653
  const errorDetail = run.errorDetail ? ` Error detail: ${JSON.stringify(run.errorDetail)}` : '';
635
654
  session.messages.push({
@@ -281,7 +281,7 @@ const SEED_TOOLS = {
281
281
  type: 'function',
282
282
  function: {
283
283
  name: 'write_file',
284
- description: 'Write content directly to a file on the filesystem, bypassing all shell escaping. Use this to create or overwrite any file — shell scripts, config files, code, etc. Content is written exactly as provided: dollar signs, backslashes, and special characters are preserved without modification. Always prefer this over exec+echo, exec+printf, or exec+heredoc for writing files. For shell scripts, pass mode: "755" to make the file executable. Example: write_file({ path: "/path/to/scan.sh", content: "#!/bin/bash\\nDOMAIN=$1\\n...", mode: "755" })',
284
+ description: 'Create a new file or completely overwrite an existing file. Content is written exactly as provided dollar signs, backslashes, and special characters are preserved without modification. Always prefer this over exec+echo, exec+printf, or exec+heredoc. For shell scripts, pass mode: "755". For targeted edits to an existing file (changing a specific line or section), use edit_file instead.',
285
285
  parameters: {
286
286
  type: 'object',
287
287
  properties: {
@@ -313,6 +313,47 @@ const SEED_TOOLS = {
313
313
  return { status: 'ok', path: targetPath, bytes, mode: args.mode || '644' };
314
314
  `,
315
315
  },
316
+ edit_file: {
317
+ definition: {
318
+ type: 'function',
319
+ function: {
320
+ name: 'edit_file',
321
+ description: 'Replace an exact string in a file with a new string. Use this for targeted edits — you only need to provide the specific section to change, not the whole file. old_string must match exactly (including whitespace and indentation) and must appear exactly once in the file. If it appears more than once, add more surrounding context to make it unique. For creating new files or rewriting entire files, use write_file instead.',
322
+ parameters: {
323
+ type: 'object',
324
+ properties: {
325
+ path: {
326
+ type: 'string',
327
+ description: 'Absolute or relative path to the file to edit.',
328
+ },
329
+ old_string: {
330
+ type: 'string',
331
+ description: 'The exact string to find and replace. Must match character-for-character including whitespace and indentation.',
332
+ },
333
+ new_string: {
334
+ type: 'string',
335
+ description: 'The string to replace old_string with.',
336
+ },
337
+ },
338
+ required: ['path', 'old_string', 'new_string'],
339
+ },
340
+ },
341
+ },
342
+ code: `
343
+ const targetPath = path.resolve(args.path);
344
+ const content = await fs.promises.readFile(targetPath, 'utf8');
345
+ const count = content.split(args.old_string).length - 1;
346
+ if (count === 0) {
347
+ return { status: 'error', error: 'old_string not found in file. Check for exact whitespace and indentation match.' };
348
+ }
349
+ if (count > 1) {
350
+ return { status: 'error', error: \`old_string found \${count} times. Add more surrounding context to make it unique.\` };
351
+ }
352
+ const updated = content.replace(args.old_string, args.new_string);
353
+ await fs.promises.writeFile(targetPath, updated, 'utf8');
354
+ return { status: 'ok', path: targetPath };
355
+ `,
356
+ },
316
357
  get_current_time: {
317
358
  definition: {
318
359
  type: 'function',