@ducci/jarvis 1.0.83 → 1.0.85
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/channels/telegram/index.js +117 -3
- package/src/server/agent.js +5 -5
- package/src/server/config.js +7 -1
- package/src/server/fish-audio.js +120 -0
package/package.json
CHANGED
|
@@ -6,7 +6,7 @@ const execAsync = promisify(exec);
|
|
|
6
6
|
import { createRequire } from 'module';
|
|
7
7
|
const _require = createRequire(import.meta.url);
|
|
8
8
|
const { version: JARVIS_VERSION } = _require('../../../package.json');
|
|
9
|
-
import { Bot, InlineKeyboard } from 'grammy';
|
|
9
|
+
import { Bot, InlineKeyboard, InputFile } from 'grammy';
|
|
10
10
|
import { run } from '@grammyjs/runner';
|
|
11
11
|
import { handleChat, requestAbort } from '../../server/agent.js';
|
|
12
12
|
import { loadSession } from '../../server/sessions.js';
|
|
@@ -14,6 +14,7 @@ import { PATHS } from '../../server/config.js';
|
|
|
14
14
|
import { isRunningCron, getRunningCrons } from '../../server/cron-scheduler.js';
|
|
15
15
|
import { load, save } from './sessions.js';
|
|
16
16
|
import { describeImage } from '../../server/vision.js';
|
|
17
|
+
import { textToSpeech, speechToText, generateTtsSummary } from '../../server/fish-audio.js';
|
|
17
18
|
|
|
18
19
|
function getTelegramChatLogPath(chatId, sessionId) {
|
|
19
20
|
const prefix = sessionId ? String(sessionId).slice(0, 8) : 'unknown';
|
|
@@ -37,6 +38,16 @@ function stripHtml(text) {
|
|
|
37
38
|
.replace(/&lt;/g, '<').replace(/&gt;/g, '>').replace(/&amp;/g, '&');
|
|
38
39
|
}
|
|
39
40
|
|
|
41
|
+
// Strip HTML for passing plain text to the TTS summary LLM.
|
|
42
|
+
function toPlainText(htmlText) {
|
|
43
|
+
let text = htmlText;
|
|
44
|
+
// Remove <pre> code blocks entirely — not useful spoken
|
|
45
|
+
text = text.replace(/<pre>[\s\S]*?<\/pre>/gi, '');
|
|
46
|
+
// Remove <code> inline tags but keep content
|
|
47
|
+
text = text.replace(/<code>([^<]*)<\/code>/gi, '$1');
|
|
48
|
+
return stripHtml(text).replace(/[ \t]+/g, ' ').trim();
|
|
49
|
+
}
|
|
50
|
+
|
|
40
51
|
function markdownToHtml(text) {
|
|
41
52
|
// 0. Sanitize unsupported Telegram HTML tags
|
|
42
53
|
// Headings → <b>
|
|
@@ -248,6 +259,7 @@ export async function startTelegramChannel(config) {
|
|
|
248
259
|
{ command: 'stop', description: 'Stop the running agent on the active slot' },
|
|
249
260
|
{ command: 'slots', description: 'Show all slots and their status' },
|
|
250
261
|
{ command: 'crons', description: 'Show all crons, running status and next run' },
|
|
262
|
+
{ command: 'voice', description: 'Toggle voice responses on/off (fish.audio TTS)' },
|
|
251
263
|
{ command: 'version', description: 'Show Jarvis version' },
|
|
252
264
|
{ command: 'update', description: 'Update Jarvis to the latest version' },
|
|
253
265
|
{ command: 'restart', description: 'Restart Jarvis' },
|
|
@@ -329,9 +341,9 @@ export async function startTelegramChannel(config) {
|
|
|
329
341
|
}
|
|
330
342
|
|
|
331
343
|
const totalMessages = Math.max(0, session.messages.length - 1); // exclude system prompt
|
|
332
|
-
const windowed = session.messages.length <= config.
|
|
344
|
+
const windowed = session.messages.length <= config.messageWindow + 1
|
|
333
345
|
? session.messages
|
|
334
|
-
: [session.messages[0], ...session.messages.slice(-config.
|
|
346
|
+
: [session.messages[0], ...session.messages.slice(-config.messageWindow)];
|
|
335
347
|
const inContext = Math.max(0, windowed.length - 1);
|
|
336
348
|
const estimatedTokens = Math.round(JSON.stringify(windowed).length / 4);
|
|
337
349
|
const model = config.selectedModel || 'unknown';
|
|
@@ -573,6 +585,28 @@ export async function startTelegramChannel(config) {
|
|
|
573
585
|
await ctx.reply(lines.join('\n'), { parse_mode: 'HTML' });
|
|
574
586
|
});
|
|
575
587
|
|
|
588
|
+
bot.command('voice', async (ctx) => {
|
|
589
|
+
const userId = ctx.from?.id;
|
|
590
|
+
if (!allowedUserIds.includes(userId)) return;
|
|
591
|
+
|
|
592
|
+
if (!config.fishAudioApiKey) {
|
|
593
|
+
await ctx.reply('fish.audio not configured. Add FISH_AUDIO_API_KEY to ~/.jarvis/.env first.');
|
|
594
|
+
return;
|
|
595
|
+
}
|
|
596
|
+
|
|
597
|
+
// Toggle voiceEnabled in settings.json and update live config
|
|
598
|
+
let settings = {};
|
|
599
|
+
try {
|
|
600
|
+
settings = JSON.parse(fs.readFileSync(PATHS.settingsFile, 'utf8'));
|
|
601
|
+
} catch { /* ignore */ }
|
|
602
|
+
settings.voiceEnabled = !config.voiceEnabled;
|
|
603
|
+
fs.writeFileSync(PATHS.settingsFile, JSON.stringify(settings, null, 2), 'utf8');
|
|
604
|
+
config.voiceEnabled = settings.voiceEnabled;
|
|
605
|
+
|
|
606
|
+
const status = config.voiceEnabled ? 'on' : 'off';
|
|
607
|
+
await ctx.reply(`Voice responses: <b>${status}</b>`, { parse_mode: 'HTML' });
|
|
608
|
+
});
|
|
609
|
+
|
|
576
610
|
// Runs one or more batches until the pending queue is drained.
|
|
577
611
|
// Each iteration takes all currently pending messages, merges them into a
|
|
578
612
|
// single user turn, calls handleChat once, and sends one response.
|
|
@@ -662,6 +696,22 @@ export async function startTelegramChannel(config) {
|
|
|
662
696
|
await appendTelegramChatLog(chatId, result.sessionId, 'JARVIS', displayText);
|
|
663
697
|
await sendMessage(api, chatId, displayText, result.sessionId);
|
|
664
698
|
console.log(`[telegram] response sent chat_id=${chatId} slot=${slot} length=${displayText.length}`);
|
|
699
|
+
// TTS: send audio summary if voice is enabled (config.voiceEnabled checked live, updated by /voice toggle)
|
|
700
|
+
if (config.voiceEnabled && config.fishAudioApiKey) {
|
|
701
|
+
try {
|
|
702
|
+
const plain = toPlainText(displayText);
|
|
703
|
+
if (plain) {
|
|
704
|
+
const ttsText = await generateTtsSummary(plain, config);
|
|
705
|
+
if (ttsText) {
|
|
706
|
+
const audioBuffer = await textToSpeech(ttsText, config);
|
|
707
|
+
await api.sendAudio(chatId, new InputFile(audioBuffer, 'response.mp3'));
|
|
708
|
+
console.log(`[telegram] voice sent chat_id=${chatId} slot=${slot} tts_chars=${ttsText.length}`);
|
|
709
|
+
}
|
|
710
|
+
}
|
|
711
|
+
} catch (e) {
|
|
712
|
+
console.error(`[telegram] TTS error chat_id=${chatId}: ${e.message}`);
|
|
713
|
+
}
|
|
714
|
+
}
|
|
665
715
|
} else {
|
|
666
716
|
console.log(`[telegram] skipped duplicate final response chat_id=${chatId} slot=${slot}`);
|
|
667
717
|
}
|
|
@@ -729,6 +779,70 @@ export async function startTelegramChannel(config) {
|
|
|
729
779
|
}
|
|
730
780
|
});
|
|
731
781
|
|
|
782
|
+
bot.on('message:voice', async (ctx) => {
|
|
783
|
+
const userId = ctx.from?.id;
|
|
784
|
+
if (!allowedUserIds.includes(userId)) return;
|
|
785
|
+
|
|
786
|
+
const chatId = ctx.chat.id;
|
|
787
|
+
const ts = new Date().toISOString();
|
|
788
|
+
|
|
789
|
+
if (!config.fishAudioApiKey) {
|
|
790
|
+
await ctx.reply('Voice input not configured. Add FISH_AUDIO_API_KEY to ~/.jarvis/.env first.');
|
|
791
|
+
return;
|
|
792
|
+
}
|
|
793
|
+
|
|
794
|
+
console.log(`[telegram] incoming voice chat_id=${chatId}`);
|
|
795
|
+
await ctx.api.sendChatAction(chatId, 'typing');
|
|
796
|
+
|
|
797
|
+
// Download voice file (OGG/Opus from Telegram)
|
|
798
|
+
let transcription;
|
|
799
|
+
try {
|
|
800
|
+
const file = await ctx.api.getFile(ctx.message.voice.file_id);
|
|
801
|
+
const fileUrl = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
|
|
802
|
+
const audioResponse = await fetch(fileUrl);
|
|
803
|
+
const audioBuffer = Buffer.from(await audioResponse.arrayBuffer());
|
|
804
|
+
transcription = await speechToText(audioBuffer, config);
|
|
805
|
+
} catch (e) {
|
|
806
|
+
console.error(`[telegram] STT error chat_id=${chatId}: ${e.message}`);
|
|
807
|
+
await ctx.reply('Sorry, could not transcribe the voice message.').catch(() => {});
|
|
808
|
+
return;
|
|
809
|
+
}
|
|
810
|
+
|
|
811
|
+
if (!transcription) {
|
|
812
|
+
await ctx.reply('Could not transcribe voice message (empty result).').catch(() => {});
|
|
813
|
+
return;
|
|
814
|
+
}
|
|
815
|
+
|
|
816
|
+
console.log(`[telegram] voice transcribed chat_id=${chatId}: "${transcription.slice(0, 80)}"`);
|
|
817
|
+
// Echo transcription back so user can confirm what was understood
|
|
818
|
+
await ctx.reply(`<i>🎤 ${escapeHtml(transcription)}</i>`, { parse_mode: 'HTML' }).catch(() => {});
|
|
819
|
+
|
|
820
|
+
const entry = { text: transcription, attachments: [], ts };
|
|
821
|
+
const slot = getActiveSlot(chatId);
|
|
822
|
+
const key = slotKey(chatId, slot);
|
|
823
|
+
|
|
824
|
+
if (isRunning.has(key)) {
|
|
825
|
+
if (!pendingMessages.has(key)) pendingMessages.set(key, []);
|
|
826
|
+
pendingMessages.get(key).push(entry);
|
|
827
|
+
console.log(`[telegram] buffered voice chat_id=${chatId} slot=${slot} pending=${pendingMessages.get(key).length}`);
|
|
828
|
+
return;
|
|
829
|
+
}
|
|
830
|
+
|
|
831
|
+
isRunning.add(key);
|
|
832
|
+
runStartTimes.set(key, new Date());
|
|
833
|
+
const typingInterval = setInterval(() => {
|
|
834
|
+
ctx.api.sendChatAction(chatId, 'typing').catch(() => {});
|
|
835
|
+
}, 4000);
|
|
836
|
+
|
|
837
|
+
try {
|
|
838
|
+
await processQueue(ctx.api, chatId, slot, [entry]);
|
|
839
|
+
} finally {
|
|
840
|
+
clearInterval(typingInterval);
|
|
841
|
+
isRunning.delete(key);
|
|
842
|
+
runStartTimes.delete(key);
|
|
843
|
+
}
|
|
844
|
+
});
|
|
845
|
+
|
|
732
846
|
bot.on('message:text', async (ctx) => {
|
|
733
847
|
const userId = ctx.from?.id;
|
|
734
848
|
|
package/src/server/agent.js
CHANGED
|
@@ -202,8 +202,8 @@ async function runSubagent(client, config, args, parentSessionId) {
|
|
|
202
202
|
}
|
|
203
203
|
return msg;
|
|
204
204
|
});
|
|
205
|
-
if (resolved.length <= subConfig.
|
|
206
|
-
return [resolved[0], ...resolved.slice(-(subConfig.
|
|
205
|
+
if (resolved.length <= subConfig.messageWindow + 1) return resolved;
|
|
206
|
+
return [resolved[0], ...resolved.slice(-(subConfig.messageWindow))];
|
|
207
207
|
}
|
|
208
208
|
|
|
209
209
|
const run = await runAgentLoop(client, subConfig, subSession, prepareMessages, usageAccum);
|
|
@@ -766,7 +766,7 @@ async function _runHandleChat(config, sessionId, userMessage, attachments = [],
|
|
|
766
766
|
|
|
767
767
|
// Resolves {{user_info}} in system prompt at runtime (never persisted).
|
|
768
768
|
// Applies a sliding window: always includes the system prompt (messages[0])
|
|
769
|
-
// plus the most recent
|
|
769
|
+
// plus the most recent messageWindow messages, so long sessions don't overflow
|
|
770
770
|
// the model's context. Full history is always preserved on disk.
|
|
771
771
|
function prepareMessages(messages) {
|
|
772
772
|
const resolved = messages.map((msg, i) => {
|
|
@@ -775,8 +775,8 @@ async function _runHandleChat(config, sessionId, userMessage, attachments = [],
|
|
|
775
775
|
}
|
|
776
776
|
return msg;
|
|
777
777
|
});
|
|
778
|
-
if (resolved.length <= config.
|
|
779
|
-
return [resolved[0], ...resolved.slice(-(config.
|
|
778
|
+
if (resolved.length <= config.messageWindow + 1) return resolved;
|
|
779
|
+
return [resolved[0], ...resolved.slice(-(config.messageWindow))];
|
|
780
780
|
}
|
|
781
781
|
|
|
782
782
|
const allToolCalls = [];
|
package/src/server/config.js
CHANGED
|
@@ -64,6 +64,9 @@ export function loadConfig() {
|
|
|
64
64
|
visionApiKey = process.env.OPENROUTER_API_KEY || null;
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
+
// fish.audio voice (optional) — TTS for outgoing responses, STT for incoming voice messages
|
|
68
|
+
const fishAudioApiKey = process.env.FISH_AUDIO_API_KEY || null;
|
|
69
|
+
|
|
67
70
|
return {
|
|
68
71
|
provider,
|
|
69
72
|
apiKey,
|
|
@@ -71,7 +74,7 @@ export function loadConfig() {
|
|
|
71
74
|
fallbackModel: settings.fallbackModel || (provider === 'anthropic' ? 'claude-haiku-4-5-20251001' : 'openrouter/free'),
|
|
72
75
|
maxIterations: settings.maxIterations || 20,
|
|
73
76
|
maxHandoffs: settings.maxHandoffs || 3,
|
|
74
|
-
|
|
77
|
+
messageWindow: settings.messageWindow || 300,
|
|
75
78
|
modelContextWindow: settings.modelContextWindow || null,
|
|
76
79
|
port: settings.port || 18008,
|
|
77
80
|
telegram: {
|
|
@@ -81,6 +84,9 @@ export function loadConfig() {
|
|
|
81
84
|
visionProvider,
|
|
82
85
|
visionModel,
|
|
83
86
|
visionApiKey,
|
|
87
|
+
voiceEnabled: settings.voiceEnabled || false,
|
|
88
|
+
fishAudioApiKey,
|
|
89
|
+
fishAudioVoiceId: settings.fishAudioVoiceId || null,
|
|
84
90
|
};
|
|
85
91
|
}
|
|
86
92
|
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* fish.audio API integration — TTS and STT.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { createClient } from './provider.js';
|
|
6
|
+
|
|
7
|
+
// System prompt for TTS summary generation.
|
|
8
|
+
// fish.audio s1 emotion tags: (emotion) at the START of a sentence only — applies to the whole sentence.
|
|
9
|
+
// Multiple tags can be combined: (excited)(soft tone) Hello!
|
|
10
|
+
// Tags must never appear mid-sentence.
|
|
11
|
+
const TTS_SYSTEM_PROMPT = `You summarize AI assistant responses into a short spoken version for text-to-speech audio.
|
|
12
|
+
|
|
13
|
+
Rules:
|
|
14
|
+
- Output 1–3 short sentences maximum. Be concise — this is spoken audio, not text.
|
|
15
|
+
- Begin EACH sentence with exactly one emotion tag from this list, placed before the first word:
|
|
16
|
+
(happy) (sad) (angry) (excited) (calm) (nervous) (confident) (surprised) (satisfied)
|
|
17
|
+
(delighted) (scared) (worried) (frustrated) (empathetic) (curious) (sarcastic)
|
|
18
|
+
(optimistic) (determined) (proud) (relaxed) (in a hurry tone) (whispering) (soft tone)
|
|
19
|
+
- The emotion tag goes at the very start of the sentence, before any word. Never mid-sentence.
|
|
20
|
+
- Choose emotions that fit the content: use (confident) or (calm) for informational answers,
|
|
21
|
+
(excited) or (satisfied) for completed tasks, (curious) for questions, etc.
|
|
22
|
+
- You may combine two tags on one sentence: (excited)(soft tone) Great news!
|
|
23
|
+
- No markdown, no code blocks, no bullet points — plain speech only.
|
|
24
|
+
- Keep technical jargon minimal; explain concepts simply as you would speak them.
|
|
25
|
+
- Match the language of the original response.
|
|
26
|
+
|
|
27
|
+
Output only the spoken summary text, nothing else.`;
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Generate a short TTS-optimized spoken summary of a response via LLM.
|
|
31
|
+
* Includes fish.audio s1 emotion tags at sentence starts.
|
|
32
|
+
*
|
|
33
|
+
* @param {string} plainText - Plain text of the agent response (HTML already stripped)
|
|
34
|
+
* @param {object} config - Full app config (provider, apiKey, selectedModel)
|
|
35
|
+
* @returns {Promise<string>} - TTS-ready text with emotion tags
|
|
36
|
+
*/
|
|
37
|
+
export async function generateTtsSummary(plainText, config) {
|
|
38
|
+
const client = createClient(config);
|
|
39
|
+
|
|
40
|
+
const response = await client.chat.completions.create({
|
|
41
|
+
model: config.selectedModel,
|
|
42
|
+
max_tokens: 200,
|
|
43
|
+
messages: [
|
|
44
|
+
{ role: 'system', content: TTS_SYSTEM_PROMPT },
|
|
45
|
+
{ role: 'user', content: `Summarize this for spoken audio:\n\n${plainText.slice(0, 3000)}` },
|
|
46
|
+
],
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
return (response.choices[0]?.message?.content || '').trim();
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Convert text to speech via fish.audio TTS API.
|
|
54
|
+
* Returns a Buffer containing MP3 audio data.
|
|
55
|
+
*
|
|
56
|
+
* @param {string} text - TTS-ready text (may include fish.audio emotion tags)
|
|
57
|
+
* @param {object} config - Must include fishAudioApiKey and optionally fishAudioVoiceId
|
|
58
|
+
* @returns {Promise<Buffer>}
|
|
59
|
+
*/
|
|
60
|
+
export async function textToSpeech(text, config) {
|
|
61
|
+
const { fishAudioApiKey, fishAudioVoiceId } = config;
|
|
62
|
+
|
|
63
|
+
const body = {
|
|
64
|
+
text,
|
|
65
|
+
format: 'mp3',
|
|
66
|
+
latency: 'normal',
|
|
67
|
+
mp3_bitrate: 64,
|
|
68
|
+
};
|
|
69
|
+
if (fishAudioVoiceId) body.reference_id = fishAudioVoiceId;
|
|
70
|
+
|
|
71
|
+
const response = await fetch('https://api.fish.audio/v1/tts', {
|
|
72
|
+
method: 'POST',
|
|
73
|
+
headers: {
|
|
74
|
+
'Authorization': `Bearer ${fishAudioApiKey}`,
|
|
75
|
+
'Content-Type': 'application/json',
|
|
76
|
+
'model': 's1',
|
|
77
|
+
},
|
|
78
|
+
body: JSON.stringify(body),
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
if (!response.ok) {
|
|
82
|
+
const errText = await response.text().catch(() => '');
|
|
83
|
+
throw new Error(`fish.audio TTS ${response.status}: ${errText.slice(0, 200)}`);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
return Buffer.from(await response.arrayBuffer());
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Transcribe audio to text via fish.audio ASR API.
|
|
91
|
+
* audioBuffer should be OGG/Opus data (standard Telegram voice format).
|
|
92
|
+
*
|
|
93
|
+
* @param {Buffer} audioBuffer
|
|
94
|
+
* @param {object} config - Must include fishAudioApiKey
|
|
95
|
+
* @returns {Promise<string>} - Transcribed text
|
|
96
|
+
*/
|
|
97
|
+
export async function speechToText(audioBuffer, config) {
|
|
98
|
+
const { fishAudioApiKey } = config;
|
|
99
|
+
|
|
100
|
+
const formData = new FormData();
|
|
101
|
+
const blob = new Blob([audioBuffer], { type: 'audio/ogg' });
|
|
102
|
+
formData.append('audio', blob, 'voice.ogg');
|
|
103
|
+
formData.append('ignore_timestamps', 'true');
|
|
104
|
+
|
|
105
|
+
const response = await fetch('https://api.fish.audio/v1/asr', {
|
|
106
|
+
method: 'POST',
|
|
107
|
+
headers: {
|
|
108
|
+
'Authorization': `Bearer ${fishAudioApiKey}`,
|
|
109
|
+
},
|
|
110
|
+
body: formData,
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
if (!response.ok) {
|
|
114
|
+
const errText = await response.text().catch(() => '');
|
|
115
|
+
throw new Error(`fish.audio ASR ${response.status}: ${errText.slice(0, 200)}`);
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
const data = await response.json();
|
|
119
|
+
return (data.text || '').trim();
|
|
120
|
+
}
|