npm - @ducci/jarvis - Versions diffs - 1.0.91 → 1.0.93 - Mend

@ducci/jarvis 1.0.91 → 1.0.93

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json +1 -1
package/src/channels/telegram/index.js +6 -2
package/src/server/fish-audio.js +9 -19

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ducci/jarvis",
-  "version": "1.0.91",
+  "version": "1.0.93",
   "description": "A fully automated agent system that lives on a server.",
   "main": "./src/index.js",
   "type": "module",

package/src/channels/telegram/index.js CHANGED Viewed

@@ -709,8 +709,12 @@ export async function startTelegramChannel(config) {
               const plain = toPlainText(ttsSource);
               if (plain) {
                 const ttsText = await generateTtsSummary(plain, config);
-                if (ttsText) {
-                  const audioBuffer = await textToSpeech(ttsText, config);
+                // Fallback to plain text (truncated) if LLM returns empty summary
+                const finalTtsText = ttsText || plain.slice(0, 500);
+                await api.sendMessage(chatId, `[TTS debug] summary="${ttsText}" finalLen=${finalTtsText.length}`).catch(() => {});
+                if (finalTtsText) {
+                  const audioBuffer = await textToSpeech(finalTtsText, config);
+                  await api.sendMessage(chatId, `[TTS debug] audio bytes: ${audioBuffer.length}`).catch(() => {});
                   await api.sendAudio(chatId, new InputFile(audioBuffer, 'response.mp3'));
                   console.log(`[telegram] voice sent chat_id=${chatId} slot=${slot} tts_chars=${ttsText.length}`);
                 }

package/src/server/fish-audio.js CHANGED Viewed

@@ -14,23 +14,7 @@ const execAsync = promisify(exec);
 // fish.audio s1 emotion tags: (emotion) at the START of a sentence only — applies to the whole sentence.
 // Multiple tags can be combined: (excited)(soft tone) Hello!
 // Tags must never appear mid-sentence.
-const TTS_SYSTEM_PROMPT = `You summarize AI assistant responses into a short spoken version for text-to-speech audio.
-Rules:
-- Output 1–3 short sentences maximum. Be concise — this is spoken audio, not text.
-- Begin EACH sentence with exactly one emotion tag from this list, placed before the first word:
-  (happy) (sad) (angry) (excited) (calm) (nervous) (confident) (surprised) (satisfied)
-  (delighted) (scared) (worried) (frustrated) (empathetic) (curious) (sarcastic)
-  (optimistic) (determined) (proud) (relaxed) (in a hurry tone) (whispering) (soft tone)
-- The emotion tag goes at the very start of the sentence, before any word. Never mid-sentence.
-- Choose emotions that fit the content: use (confident) or (calm) for informational answers,
-  (excited) or (satisfied) for completed tasks, (curious) for questions, etc.
-- You may combine two tags on one sentence: (excited)(soft tone) Great news!
-- Plain text only — no emojis, no markdown, no code blocks, no bullet points, no special characters.
-- Keep technical jargon minimal; explain concepts simply as you would speak them.
-- Match the language of the original response.
-Output only the spoken summary text, nothing else.`;
+const TTS_SYSTEM_PROMPT = `Summarize the given text in 1-2 spoken sentences for audio playback. Start each sentence with an emotion tag like (calm), (excited), (confident), (happy), or (curious) — placed before the first word of the sentence. Plain text only, no emojis, no markdown. Match the language of the text. Output only the summary, nothing else.`;
 /**
  * Generate a short TTS-optimized spoken summary of a response via LLM.
@@ -45,14 +29,20 @@ export async function generateTtsSummary(plainText, config) {
   const response = await client.chat.completions.create({
     model: config.selectedModel,
-    max_tokens: 200,
     messages: [
       { role: 'system', content: TTS_SYSTEM_PROMPT },
       { role: 'user', content: `Summarize this for spoken audio:\n\n${plainText.slice(0, 3000)}` },
     ],
   });
-  return (response.choices[0]?.message?.content || '').trim();
+  const choice = response.choices[0];
+  const msg = choice?.message;
+  const content = (msg?.content || msg?.reasoning_content || '').trim();
+  if (!content) {
+    // Surface the raw response structure so we can diagnose where the text actually is
+    throw new Error(`empty LLM response. finish_reason=${choice?.finish_reason} keys=${Object.keys(msg || {}).join(',')} raw=${JSON.stringify(msg).slice(0, 300)}`);
+  }
+  return content;
 }
 /**