@ducci/jarvis 1.0.92 → 1.0.94

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ducci/jarvis",
3
- "version": "1.0.92",
3
+ "version": "1.0.94",
4
4
  "description": "A fully automated agent system that lives on a server.",
5
5
  "main": "./src/index.js",
6
6
  "type": "module",
@@ -697,7 +697,6 @@ export async function startTelegramChannel(config) {
697
697
  await sendMessage(api, chatId, displayText, result.sessionId);
698
698
  console.log(`[telegram] response sent chat_id=${chatId} slot=${slot} length=${displayText.length}`);
699
699
  // TTS: send audio summary if voice is enabled (config.voiceEnabled checked live, updated by /voice toggle)
700
- await api.sendMessage(chatId, `[TTS debug] voiceEnabled=${config.voiceEnabled} hasKey=${!!config.fishAudioApiKey}`).catch(() => {});
701
700
  if (config.voiceEnabled && config.fishAudioApiKey) {
702
701
  try {
703
702
  // If the response is a raw JSON blob (format_error recovery), extract the actual text
@@ -709,10 +708,8 @@ export async function startTelegramChannel(config) {
709
708
  const plain = toPlainText(ttsSource);
710
709
  if (plain) {
711
710
  const ttsText = await generateTtsSummary(plain, config);
712
- await api.sendMessage(chatId, `[TTS debug] plain: "${plain.slice(0, 300)}"\nsummary: "${ttsText}"`).catch(() => {});
713
711
  if (ttsText) {
714
712
  const audioBuffer = await textToSpeech(ttsText, config);
715
- await api.sendMessage(chatId, `[TTS debug] audio bytes: ${audioBuffer.length}`).catch(() => {});
716
713
  await api.sendAudio(chatId, new InputFile(audioBuffer, 'response.mp3'));
717
714
  console.log(`[telegram] voice sent chat_id=${chatId} slot=${slot} tts_chars=${ttsText.length}`);
718
715
  }
@@ -14,23 +14,7 @@ const execAsync = promisify(exec);
14
14
  // fish.audio s1 emotion tags: (emotion) at the START of a sentence only — applies to the whole sentence.
15
15
  // Multiple tags can be combined: (excited)(soft tone) Hello!
16
16
  // Tags must never appear mid-sentence.
17
- const TTS_SYSTEM_PROMPT = `You summarize AI assistant responses into a short spoken version for text-to-speech audio.
18
-
19
- Rules:
20
- - Output 1–3 short sentences maximum. Be concise — this is spoken audio, not text.
21
- - Begin EACH sentence with exactly one emotion tag from this list, placed before the first word:
22
- (happy) (sad) (angry) (excited) (calm) (nervous) (confident) (surprised) (satisfied)
23
- (delighted) (scared) (worried) (frustrated) (empathetic) (curious) (sarcastic)
24
- (optimistic) (determined) (proud) (relaxed) (in a hurry tone) (whispering) (soft tone)
25
- - The emotion tag goes at the very start of the sentence, before any word. Never mid-sentence.
26
- - Choose emotions that fit the content: use (confident) or (calm) for informational answers,
27
- (excited) or (satisfied) for completed tasks, (curious) for questions, etc.
28
- - You may combine two tags on one sentence: (excited)(soft tone) Great news!
29
- - Plain text only — no emojis, no markdown, no code blocks, no bullet points, no special characters.
30
- - Keep technical jargon minimal; explain concepts simply as you would speak them.
31
- - Match the language of the original response.
32
-
33
- Output only the spoken summary text, nothing else.`;
17
+ const TTS_SYSTEM_PROMPT = `Summarize the given text in 1-2 spoken sentences for audio playback. Start each sentence with an emotion tag like (calm), (excited), (confident), (happy), or (curious) — placed before the first word of the sentence. Plain text only, no emojis, no markdown. Match the language of the text. Output only the summary, nothing else.`;
34
18
 
35
19
  /**
36
20
  * Generate a short TTS-optimized spoken summary of a response via LLM.
@@ -45,14 +29,20 @@ export async function generateTtsSummary(plainText, config) {
45
29
 
46
30
  const response = await client.chat.completions.create({
47
31
  model: config.selectedModel,
48
- max_tokens: 200,
49
32
  messages: [
50
33
  { role: 'system', content: TTS_SYSTEM_PROMPT },
51
34
  { role: 'user', content: `Summarize this for spoken audio:\n\n${plainText.slice(0, 3000)}` },
52
35
  ],
53
36
  });
54
37
 
55
- return (response.choices[0]?.message?.content || '').trim();
38
+ const choice = response.choices[0];
39
+ const msg = choice?.message;
40
+ const content = (msg?.content || msg?.reasoning_content || '').trim();
41
+ if (!content) {
42
+ // Surface the raw response structure so we can diagnose where the text actually is
43
+ throw new Error(`empty LLM response. finish_reason=${choice?.finish_reason} keys=${Object.keys(msg || {}).join(',')} raw=${JSON.stringify(msg).slice(0, 300)}`);
44
+ }
45
+ return content;
56
46
  }
57
47
 
58
48
  /**