@ducci/jarvis 1.0.39 → 1.0.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/docs/telegram.md CHANGED
@@ -18,7 +18,7 @@ The channel calls the agent layer directly (no HTTP hop) — it imports and call
18
18
 
19
19
  ```
20
20
  Telegram user
21
- ↓ (text message)
21
+ ↓ (text or photo message)
22
22
  Telegram Bot API ←→ grammy-runner (long polling)
23
23
 
24
24
  Channel adapter (src/channels/telegram/index.js)
@@ -246,9 +246,49 @@ await bot.api.setMyCommands([
246
246
  | User sends `/new`, no session exists yet | No-op, same confirmation sent |
247
247
  | Next text message after `/new` | New session created, mapped to `chat_id` |
248
248
 
249
+ ## Photo Support
250
+
251
+ The bot handles incoming photos (`message:photo`) in addition to text. When a user sends a photo, the adapter selects the highest resolution that is at most 800px wide to keep token usage reasonable, downloads it, and passes it to the agent as a base64 data URL, together with the optional caption, as multimodal content.
252
+
253
+ ### Photo selection
254
+
255
+ Telegram always delivers multiple resolutions of every photo as an array of `PhotoSize` objects, sorted ascending by resolution. The adapter picks the last entry with `width <= 800`:
256
+
257
+ ```js
258
+ const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
259
+ ?? ctx.message.photo[0]; // fallback: smallest if all variants exceed 800px
260
+ ```
261
+
262
+ This gives the highest-quality image at or below the 800px threshold. Sending the full-resolution original would consume significantly more tokens for no practical benefit in most tasks.
263
+
264
+ ### Download and base64 encoding
265
+
266
+ The image is downloaded immediately at receive time using the Telegram file URL (`https://api.telegram.org/file/bot<token>/<file_path>`) and converted to a base64 data URL (`data:image/jpeg;base64,...`). The data URL is stored directly in the session message, so the image remains available across handoffs and future conversation turns without depending on a Telegram URL that would expire after ~1 hour. Base64 encoding does not cost more tokens than a URL — image token cost is based on pixel dimensions, not transport format.
267
+
268
+ ### Agent call
269
+
270
+ Photos are passed to the agent as a multimodal content array instead of a plain string:
271
+
272
+ ```js
273
+ const content = [
274
+ { type: 'image_url', image_url: { url: dataUrl } },
275
+ ];
276
+ if (caption) content.push({ type: 'text', text: caption });
277
+ ```
278
+
279
+ The agent layer must support receiving `content` as either a string or a content array and pass it through to the model accordingly.
280
+
281
+ ### Caption
282
+
283
+ If the user attaches a caption to the photo (`ctx.message.caption`), it is included as a text block alongside the image. If there is no caption, only the image block is sent.
284
+
285
+ ### Unsupported media types
286
+
287
+ Documents, audio, video, stickers, and other non-photo media types are not handled — the bot silently ignores them (same as unauthorized messages).
288
+
249
289
  ## Non-Goals (v1)
250
290
 
251
- - No support for photos, files, or other media types (text only)
291
+ - No support for documents, audio, video, or other non-photo media types
252
292
  - No inline keyboards or callback queries
253
293
  - No group chat support (only private chats)
254
294
  - No message editing or deletion handling
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ducci/jarvis",
3
- "version": "1.0.39",
3
+ "version": "1.0.40",
4
4
  "description": "A fully automated agent system that lives on a server.",
5
5
  "main": "./src/index.js",
6
6
  "type": "module",
@@ -60,6 +60,71 @@ export async function startTelegramChannel(config) {
60
60
  await ctx.reply('New session started.');
61
61
  });
62
62
 
63
// Handles incoming photo messages: downloads the selected photo variant, hands
// it to the agent as a base64 data URL attachment (the caption, if any, is the
// text message), and replies with the agent's response.
bot.on('message:photo', async (ctx) => {
  // Senders that are not on the allow-list are ignored without a reply.
  const userId = ctx.from?.id;
  if (!allowedUserIds.includes(userId)) return;

  const chatId = ctx.chat.id;
  const sessionId = sessions[chatId] || null;

  console.log(`[telegram] incoming photo chat_id=${chatId}`);

  // Show the "typing" indicator now and re-send it every 4 seconds while the
  // agent is working. Failures of the periodic refresh are deliberately ignored.
  await ctx.api.sendChatAction(chatId, 'typing');
  const typingInterval = setInterval(() => {
    ctx.api.sendChatAction(chatId, 'typing').catch(() => {});
  }, 4000);

  // A single outer try/finally guarantees the interval is cleared on every exit
  // path, including a throw from save(sessions) between the agent call and the
  // reply, which previously left the timer running forever.
  try {
    let result;
    try {
      // Largest variant that is at most 800px wide; falls back to the first
      // entry when every variant is wider.
      // NOTE(review): relies on ctx.message.photo being ordered from smallest
      // to largest — confirm against the Telegram Bot API.
      const photo = ctx.message.photo.filter((p) => p.width <= 800).at(-1)
        ?? ctx.message.photo[0];
      const file = await ctx.api.getFile(photo.file_id);
      const fileUrl = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
      const imgResponse = await fetch(fileUrl);
      if (!imgResponse.ok) {
        // Without this check an HTTP error body would be base64-encoded and
        // sent to the agent as if it were a JPEG. The message must not contain
        // fileUrl: it embeds the bot token and is relayed to the user below.
        throw new Error(`Failed to download photo from Telegram (HTTP ${imgResponse.status})`);
      }
      const buffer = await imgResponse.arrayBuffer();
      const base64 = Buffer.from(buffer).toString('base64');
      const dataUrl = `data:image/jpeg;base64,${base64}`;
      const caption = ctx.message.caption || '';
      result = await handleChat(config, sessionId, caption, [{ url: dataUrl }]);
    } catch (e) {
      console.error(`[telegram] agent error chat_id=${chatId}: ${e.message}`);
      const errText = e.message
        ? `Sorry, something went wrong: ${e.message}`
        : 'Sorry, something went wrong. Please try again.';
      await ctx.reply(errText).catch(() => {});
      return;
    }

    // First message of this chat: remember the session the agent created.
    if (!sessions[chatId]) {
      sessions[chatId] = result.sessionId;
      save(sessions);
      console.log(`[telegram] session created sessionId=${result.sessionId.slice(0, 8)}`);
    }

    try {
      // Replies longer than MAX_TG characters are sent as consecutive chunks.
      const MAX_TG = 4096;
      const rawResponse = typeof result.response === 'string'
        ? result.response
        : result.response != null ? JSON.stringify(result.response, null, 2) : '';
      const text = rawResponse.trim()
        || 'The agent encountered an error and could not produce a response. Please try again.';
      if (text.length <= MAX_TG) {
        await ctx.reply(text);
      } else {
        for (let i = 0; i < text.length; i += MAX_TG) {
          await ctx.reply(text.slice(i, i + MAX_TG));
        }
      }
      console.log(`[telegram] response sent chat_id=${chatId} length=${text.length}`);
    } catch (e) {
      console.error(`[telegram] delivery error chat_id=${chatId}: ${e.message}`);
      await ctx.reply('Sorry, something went wrong sending the response. Please try again.').catch(() => {});
    }
  } finally {
    clearInterval(typingInterval);
  }
});
127
+
63
128
  bot.on('message:text', async (ctx) => {
64
129
  const userId = ctx.from?.id;
65
130
 
@@ -483,7 +483,7 @@ export async function withSessionLock(sessionId, fn) {
483
483
  * Main entry point: handles a single POST /api/chat request.
484
484
  * Manages the handoff loop across multiple agent runs.
485
485
  */
486
- export async function handleChat(config, requestSessionId, userMessage) {
486
+ export async function handleChat(config, requestSessionId, userMessage, attachments = []) {
487
487
  const sessionId = requestSessionId || crypto.randomUUID();
488
488
 
489
489
  // Serialize concurrent requests for the same session. Each request registers
@@ -497,7 +497,7 @@ export async function handleChat(config, requestSessionId, userMessage) {
497
497
  await previous;
498
498
 
499
499
  try {
500
- return await _runHandleChat(config, sessionId, userMessage);
500
+ return await _runHandleChat(config, sessionId, userMessage, attachments);
501
501
  } finally {
502
502
  releaseLock();
503
503
  // Clean up only if no one else has queued behind us
@@ -511,7 +511,7 @@ export async function handleChat(config, requestSessionId, userMessage) {
511
511
  * The actual chat logic, extracted so handleChat can wrap it cleanly with the
512
512
  * session lock.
513
513
  */
514
- async function _runHandleChat(config, sessionId, userMessage) {
514
+ async function _runHandleChat(config, sessionId, userMessage, attachments = []) {
515
515
  const client = createClient(config);
516
516
 
517
517
  const systemPromptTemplate = loadSystemPrompt();
@@ -545,8 +545,18 @@ async function _runHandleChat(config, sessionId, userMessage) {
545
545
  userMessageWithContext += note;
546
546
  }
547
547
 
548
- // Append user message and reset handoff state
549
- session.messages.push({ role: 'user', content: userMessageWithContext });
548
+ // Append user message and reset handoff state.
549
+ // If attachments (e.g. images) are present, build a multimodal content array.
550
+ let userContent;
551
+ if (attachments && attachments.length > 0) {
552
+ userContent = [
553
+ ...attachments.map(a => ({ type: 'image_url', image_url: { url: a.url } })),
554
+ { type: 'text', text: userMessageWithContext },
555
+ ];
556
+ } else {
557
+ userContent = userMessageWithContext;
558
+ }
559
+ session.messages.push({ role: 'user', content: userContent });
550
560
  session.metadata.handoffCount = 0;
551
561
  session.metadata.failedApproaches = [];
552
562
  session.metadata.lastCheckpointRemaining = null;
@@ -281,7 +281,7 @@ const SEED_TOOLS = {
281
281
  type: 'function',
282
282
  function: {
283
283
  name: 'write_file',
284
- description: 'Write content directly to a file on the filesystem, bypassing all shell escaping. Use this to create or overwrite any file — shell scripts, config files, code, etc. Content is written exactly as provided: dollar signs, backslashes, and special characters are preserved without modification. Always prefer this over exec+echo, exec+printf, or exec+heredoc for writing files. For shell scripts, pass mode: "755" to make the file executable. Example: write_file({ path: "/path/to/scan.sh", content: "#!/bin/bash\\nDOMAIN=$1\\n...", mode: "755" })',
284
+ description: 'Create a new file or completely overwrite an existing file. Content is written exactly as provided dollar signs, backslashes, and special characters are preserved without modification. Always prefer this over exec+echo, exec+printf, or exec+heredoc. For shell scripts, pass mode: "755". For targeted edits to an existing file (changing a specific line or section), use edit_file instead.',
285
285
  parameters: {
286
286
  type: 'object',
287
287
  properties: {
@@ -313,6 +313,47 @@ const SEED_TOOLS = {
313
313
  return { status: 'ok', path: targetPath, bytes, mode: args.mode || '644' };
314
314
  `,
315
315
  },
316
+ edit_file: {
317
+ definition: {
318
+ type: 'function',
319
+ function: {
320
+ name: 'edit_file',
321
+ description: 'Replace an exact string in a file with a new string. Use this for targeted edits — you only need to provide the specific section to change, not the whole file. old_string must match exactly (including whitespace and indentation) and must appear exactly once in the file. If it appears more than once, add more surrounding context to make it unique. For creating new files or rewriting entire files, use write_file instead.',
322
+ parameters: {
323
+ type: 'object',
324
+ properties: {
325
+ path: {
326
+ type: 'string',
327
+ description: 'Absolute or relative path to the file to edit.',
328
+ },
329
+ old_string: {
330
+ type: 'string',
331
+ description: 'The exact string to find and replace. Must match character-for-character including whitespace and indentation.',
332
+ },
333
+ new_string: {
334
+ type: 'string',
335
+ description: 'The string to replace old_string with.',
336
+ },
337
+ },
338
+ required: ['path', 'old_string', 'new_string'],
339
+ },
340
+ },
341
+ },
342
+ code: `
343
+ const targetPath = path.resolve(args.path);
344
+ const content = await fs.promises.readFile(targetPath, 'utf8');
345
+ const count = content.split(args.old_string).length - 1;
346
+ if (count === 0) {
347
+ return { status: 'error', error: 'old_string not found in file. Check for exact whitespace and indentation match.' };
348
+ }
349
+ if (count > 1) {
350
+ return { status: 'error', error: \`old_string found \${count} times. Add more surrounding context to make it unique.\` };
351
+ }
352
+ const updated = content.replace(args.old_string, args.new_string);
353
+ await fs.promises.writeFile(targetPath, updated, 'utf8');
354
+ return { status: 'ok', path: targetPath };
355
+ `,
356
+ },
316
357
  get_current_time: {
317
358
  definition: {
318
359
  type: 'function',