npm - @ducci/jarvis - Versions diffs - 1.0.39 → 1.0.41 - Mend

@ducci/jarvis 1.0.39 → 1.0.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/docs/telegram.md +42 -2
package/package.json +1 -1
package/src/channels/telegram/index.js +65 -0
package/src/server/agent.js +24 -5
package/src/server/tools.js +42 -1

package/docs/telegram.md CHANGED Viewed

@@ -18,7 +18,7 @@ The channel calls the agent layer directly (no HTTP hop) — it imports and call
 ```
 Telegram user
-    ↓ (text message)
+    ↓ (text or photo message)
 Telegram Bot API  ←→  grammy-runner (long polling)
     ↓
 Channel adapter (src/channels/telegram/index.js)
@@ -246,9 +246,49 @@ await bot.api.setMyCommands([
 | User sends `/new`, no session exists yet | No-op, same confirmation sent |
 | Next text message after `/new` | New session created, mapped to `chat_id` |
+## Photo Support
+The bot handles incoming photos (`message:photo`) in addition to text. When a user sends a photo, the adapter selects the best resolution under 800px wide to keep token usage reasonable, downloads it, and passes the image (as a base64 data URL) together with the optional caption to the agent, which sends both to the model as multimodal content.
+### Photo selection
+Telegram always delivers multiple resolutions of every photo as an array of `PhotoSize` objects, sorted ascending by resolution. The adapter picks the last entry with `width <= 800`:
+```js
+const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
+  ?? ctx.message.photo[0]; // fallback: smallest if all variants exceed 800px
+```
+This gives the highest quality image below the 800px threshold. Sending the full-resolution original would consume significantly more tokens for no practical benefit in most tasks.
+### Download and base64 encoding
+The image is downloaded immediately at receive time using the Telegram file URL (`https://api.telegram.org/file/bot<token>/<file_path>`) and converted to a base64 data URL (`data:image/jpeg;base64,...`). The data URL is stored directly in the session message, so the image remains available across handoffs and future conversation turns without depending on a Telegram URL that would expire after ~1 hour. Base64 encoding does not cost more tokens than a URL — image token cost is based on pixel dimensions, not transport format.
+### Agent call
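+Photos are passed to the agent as an attachment alongside the caption, and the agent layer builds a multimodal content array instead of a plain string:
+```js
+// Channel adapter
+result = await handleChat(config, sessionId, caption, [{ url: dataUrl }]);
+// Agent layer
+const userContent = [
+  ...attachments.map(a => ({ type: 'image_url', image_url: { url: a.url } })),
+  { type: 'text', text: userMessageWithContext },
+];
+```
+The agent layer must support receiving `content` as either a string or a content array and pass it through to the model accordingly.
+### Caption
+If the user attaches a caption to the photo (`ctx.message.caption`), it is used as the user message and included as a text block alongside the image. If there is no caption, an empty string is used as the user message; the text block is still sent after the image block.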
+### Unsupported media types
+Documents, audio, video, stickers, and other non-photo media types are not handled — the bot silently ignores them (same as unauthorized messages).
 ## Non-Goals (v1)
-- No support for photos, files, or other media types (text only)
+- No support for documents, audio, video, or other non-photo media types
 - No inline keyboards or callback queries
 - No group chat support (only private chats)
 - No message editing or deletion handling

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ducci/jarvis",
-  "version": "1.0.39",
+  "version": "1.0.41",
   "description": "A fully automated agent system that lives on a server.",
   "main": "./src/index.js",
   "type": "module",

package/src/channels/telegram/index.js CHANGED Viewed

@@ -60,6 +60,71 @@ export async function startTelegramChannel(config) {
     await ctx.reply('New session started.');
   });
+  // Photo messages: pick a width-capped variant, inline it as a base64 data URL,
+  // run it through the agent together with the caption, and relay the reply in
+  // chunks that fit Telegram's message size limit.
+  bot.on('message:photo', async (ctx) => {
+    const userId = ctx.from?.id;
+    if (!allowedUserIds.includes(userId)) return;
+    const chatId = ctx.chat.id;
+    const sessionId = sessions[chatId] || null;
+    console.log(`[telegram] incoming photo chat_id=${chatId}`);
+    await ctx.api.sendChatAction(chatId, 'typing');
+    // Re-send the typing action every 4s while the request is being processed.
+    const typingInterval = setInterval(() => {
+      ctx.api.sendChatAction(chatId, 'typing').catch(() => {});
+    }, 4000);
+    // A single outer finally owns the timer so it is cleared on every exit path,
+    // including a throw from the session bookkeeping between agent call and delivery.
+    try {
+      let result;
+      try {
+        // Largest variant that is at most 800px wide; fall back to the first
+        // entry when every variant exceeds that width.
+        const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
+          ?? ctx.message.photo[0];
+        const file = await ctx.api.getFile(photo.file_id);
+        if (!file.file_path) {
+          throw new Error('Telegram did not return a file path for the photo');
+        }
+        const fileUrl = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
+        const imgResponse = await fetch(fileUrl);
+        // Without this check an HTTP error body would be base64-encoded and
+        // handed to the model as if it were image data. The URL is deliberately
+        // kept out of the message because it embeds the bot token.
+        if (!imgResponse.ok) {
+          throw new Error(`Photo download failed (HTTP ${imgResponse.status})`);
+        }
+        const buffer = await imgResponse.arrayBuffer();
+        const base64 = Buffer.from(buffer).toString('base64');
+        const dataUrl = `data:image/jpeg;base64,${base64}`;
+        const caption = ctx.message.caption || '';
+        result = await handleChat(config, sessionId, caption, [{ url: dataUrl }]);
+      } catch (e) {
+        console.error(`[telegram] agent error chat_id=${chatId}: ${e.message}`);
+        const errText = e.message
+          ? `Sorry, something went wrong: ${e.message}`
+          : 'Sorry, something went wrong. Please try again.';
+        await ctx.reply(errText).catch(() => {});
+        return;
+      }
+      // First message for this chat: remember the session the agent created.
+      if (!sessions[chatId]) {
+        sessions[chatId] = result.sessionId;
+        save(sessions);
+        console.log(`[telegram] session created sessionId=${result.sessionId.slice(0, 8)}`);
+      }
+      try {
+        const MAX_TG = 4096;
+        const rawResponse = typeof result.response === 'string'
+          ? result.response
+          : result.response != null ? JSON.stringify(result.response, null, 2) : '';
+        const text = rawResponse.trim()
+          || 'The agent encountered an error and could not produce a response. Please try again.';
+        if (text.length <= MAX_TG) {
+          await ctx.reply(text);
+        } else {
+          // Split long replies into consecutive MAX_TG-sized messages.
+          for (let i = 0; i < text.length; i += MAX_TG) {
+            await ctx.reply(text.slice(i, i + MAX_TG));
+          }
+        }
+        console.log(`[telegram] response sent chat_id=${chatId} length=${text.length}`);
+      } catch (e) {
+        console.error(`[telegram] delivery error chat_id=${chatId}: ${e.message}`);
+        await ctx.reply('Sorry, something went wrong sending the response. Please try again.').catch(() => {});
+      }
+    } finally {
+      clearInterval(typingInterval);
+    }
+  });
   bot.on('message:text', async (ctx) => {
     const userId = ctx.from?.id;

package/src/server/agent.js CHANGED Viewed

@@ -50,6 +50,12 @@ async function callModel(client, model, messages, tools) {
   return await client.chat.completions.create(params);
 }
+/**
+ * Reports whether the primary or fallback API error message contains the
+ * phrase "image input" (case-insensitive).
+ *
+ * @param {object} [apiErrors] - Error detail with optional `primary` / `fallback`
+ *   entries, each optionally carrying a `message`.
+ * @returns {boolean} true when either message mentions "image input".
+ */
+function isImageUnsupportedError(apiErrors) {
+  if (!apiErrors) {
+    return false;
+  }
+  const candidateMessages = [apiErrors.primary?.message, apiErrors.fallback?.message];
+  for (const message of candidateMessages) {
+    // Missing messages short-circuit to undefined and are skipped.
+    if (message?.toLowerCase().includes('image input')) {
+      return true;
+    }
+  }
+  return false;
+}
 function extractApiError(err, model) {
   return {
     model,
@@ -483,7 +489,7 @@ export async function withSessionLock(sessionId, fn) {
  * Main entry point: handles a single POST /api/chat request.
  * Manages the handoff loop across multiple agent runs.
  */
-export async function handleChat(config, requestSessionId, userMessage) {
+export async function handleChat(config, requestSessionId, userMessage, attachments = []) {
   const sessionId = requestSessionId || crypto.randomUUID();
   // Serialize concurrent requests for the same session. Each request registers
@@ -497,7 +503,7 @@ export async function handleChat(config, requestSessionId, userMessage) {
   await previous;
   try {
-    return await _runHandleChat(config, sessionId, userMessage);
+    return await _runHandleChat(config, sessionId, userMessage, attachments);
   } finally {
     releaseLock();
     // Clean up only if no one else has queued behind us
@@ -511,7 +517,7 @@ export async function handleChat(config, requestSessionId, userMessage) {
  * The actual chat logic, extracted so handleChat can wrap it cleanly with the
  * session lock.
  */
-async function _runHandleChat(config, sessionId, userMessage) {
+async function _runHandleChat(config, sessionId, userMessage, attachments = []) {
   const client = createClient(config);
   const systemPromptTemplate = loadSystemPrompt();
@@ -545,8 +551,18 @@ async function _runHandleChat(config, sessionId, userMessage) {
     userMessageWithContext += note;
   }
-  // Append user message and reset handoff state
-  session.messages.push({ role: 'user', content: userMessageWithContext });
+  // Append user message and reset handoff state.
+  // If attachments (e.g. images) are present, build a multimodal content array.
+  let userContent;
+  if (attachments && attachments.length > 0) {
+    userContent = [
+      ...attachments.map(a => ({ type: 'image_url', image_url: { url: a.url } })),
+      { type: 'text', text: userMessageWithContext },
+    ];
+  } else {
+    userContent = userMessageWithContext;
+  }
+  session.messages.push({ role: 'user', content: userContent });
   session.metadata.handoffCount = 0;
   session.metadata.failedApproaches = [];
   session.metadata.lastCheckpointRemaining = null;
@@ -630,6 +646,9 @@ async function _runHandleChat(config, sessionId, userMessage) {
         // windows). The synthetic note is sufficient context; tool results are preserved
         // in the JSONL log and accessible via read_session_log.
         if (finalStatus === 'model_error' || finalStatus === 'format_error') {
+          if (finalStatus === 'model_error' && isImageUnsupportedError(run.errorDetail)) {
+            finalResponse = 'This model does not support image input. Please switch to a multimodal model (e.g. claude-3.5-sonnet, gpt-4o) in settings.';
+          }
           session.messages.splice(runStartIndex, session.messages.length - runStartIndex);
           const errorDetail = run.errorDetail ? ` Error detail: ${JSON.stringify(run.errorDetail)}` : '';
           session.messages.push({

package/src/server/tools.js CHANGED Viewed

@@ -281,7 +281,7 @@ const SEED_TOOLS = {
       type: 'function',
       function: {
         name: 'write_file',
-        description: 'Write content directly to a file on the filesystem, bypassing all shell escaping. Use this to create or overwrite any file — shell scripts, config files, code, etc. Content is written exactly as provided: dollar signs, backslashes, and special characters are preserved without modification. Always prefer this over exec+echo, exec+printf, or exec+heredoc for writing files. For shell scripts, pass mode: "755" to make the file executable. Example: write_file({ path: "/path/to/scan.sh", content: "#!/bin/bash\\nDOMAIN=$1\\n...", mode: "755" })',
+        description: 'Create a new file or completely overwrite an existing file. Content is written exactly as provided — dollar signs, backslashes, and special characters are preserved without modification. Always prefer this over exec+echo, exec+printf, or exec+heredoc. For shell scripts, pass mode: "755". For targeted edits to an existing file (changing a specific line or section), use edit_file instead.',
         parameters: {
           type: 'object',
           properties: {
@@ -313,6 +313,47 @@ const SEED_TOOLS = {
       return { status: 'ok', path: targetPath, bytes, mode: args.mode || '644' };
     `,
   },
+  // Seed tool: replaces one unique occurrence of old_string in a file with
+  // new_string. Returns { status: 'ok', path } on success, or
+  // { status: 'error', error } when old_string is empty, missing from the file,
+  // or present more than once.
+  edit_file: {
+    definition: {
+      type: 'function',
+      function: {
+        name: 'edit_file',
+        description: 'Replace an exact string in a file with a new string. Use this for targeted edits — you only need to provide the specific section to change, not the whole file. old_string must match exactly (including whitespace and indentation) and must appear exactly once in the file. If it appears more than once, add more surrounding context to make it unique. For creating new files or rewriting entire files, use write_file instead.',
+        parameters: {
+          type: 'object',
+          properties: {
+            path: {
+              type: 'string',
+              description: 'Absolute or relative path to the file to edit.',
+            },
+            old_string: {
+              type: 'string',
+              description: 'The exact string to find and replace. Must match character-for-character including whitespace and indentation.',
+            },
+            new_string: {
+              type: 'string',
+              description: 'The string to replace old_string with.',
+            },
+          },
+          required: ['path', 'old_string', 'new_string'],
+        },
+      },
+    },
+    code: `
+      // An empty search string matches at every position, so it can never be unique.
+      if (typeof args.old_string !== 'string' || args.old_string === '') {
+        return { status: 'error', error: 'old_string must be a non-empty string.' };
+      }
+      const targetPath = path.resolve(args.path);
+      const content = await fs.promises.readFile(targetPath, 'utf8');
+      const count = content.split(args.old_string).length - 1;
+      if (count === 0) {
+        return { status: 'error', error: 'old_string not found in file. Check for exact whitespace and indentation match.' };
+      }
+      if (count > 1) {
+        return { status: 'error', error: \`old_string found \${count} times. Add more surrounding context to make it unique.\` };
+      }
+      // Use a replacer function so new_string is inserted literally. A plain string
+      // replacement would expand dollar-sign patterns and corrupt content such as
+      // shell variables or template literals.
+      const updated = content.replace(args.old_string, () => args.new_string);
+      await fs.promises.writeFile(targetPath, updated, 'utf8');
+      return { status: 'ok', path: targetPath };
+    `,
+  },
   get_current_time: {
     definition: {
       type: 'function',