@ducci/jarvis 1.0.91 → 1.0.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -709,8 +709,12 @@ export async function startTelegramChannel(config) {
|
|
|
709
709
|
const plain = toPlainText(ttsSource);
|
|
710
710
|
if (plain) {
|
|
711
711
|
const ttsText = await generateTtsSummary(plain, config);
|
|
712
|
-
|
|
713
|
-
|
|
712
|
+
// Fall back to truncated plain text if the LLM returns an empty summary
|
|
713
|
+
const finalTtsText = ttsText || plain.slice(0, 500);
|
|
714
|
+
await api.sendMessage(chatId, `[TTS debug] summary="${ttsText}" finalLen=${finalTtsText.length}`).catch(() => {});
|
|
715
|
+
if (finalTtsText) {
|
|
716
|
+
const audioBuffer = await textToSpeech(finalTtsText, config);
|
|
717
|
+
await api.sendMessage(chatId, `[TTS debug] audio bytes: ${audioBuffer.length}`).catch(() => {});
|
|
714
718
|
await api.sendAudio(chatId, new InputFile(audioBuffer, 'response.mp3'));
|
|
715
719
|
console.log(`[telegram] voice sent chat_id=${chatId} slot=${slot} tts_chars=${ttsText.length}`);
|
|
716
720
|
}
|
package/src/server/fish-audio.js
CHANGED
|
@@ -14,23 +14,7 @@ const execAsync = promisify(exec);
|
|
|
14
14
|
// fish.audio s1 emotion tags: (emotion) at the START of a sentence only — applies to the whole sentence.
|
|
15
15
|
// Multiple tags can be combined: (excited)(soft tone) Hello!
|
|
16
16
|
// Tags must never appear mid-sentence.
|
|
17
|
-
const TTS_SYSTEM_PROMPT = `
|
|
18
|
-
|
|
19
|
-
Rules:
|
|
20
|
-
- Output 1–3 short sentences maximum. Be concise — this is spoken audio, not text.
|
|
21
|
-
- Begin EACH sentence with exactly one emotion tag from this list, placed before the first word:
|
|
22
|
-
(happy) (sad) (angry) (excited) (calm) (nervous) (confident) (surprised) (satisfied)
|
|
23
|
-
(delighted) (scared) (worried) (frustrated) (empathetic) (curious) (sarcastic)
|
|
24
|
-
(optimistic) (determined) (proud) (relaxed) (in a hurry tone) (whispering) (soft tone)
|
|
25
|
-
- The emotion tag goes at the very start of the sentence, before any word. Never mid-sentence.
|
|
26
|
-
- Choose emotions that fit the content: use (confident) or (calm) for informational answers,
|
|
27
|
-
(excited) or (satisfied) for completed tasks, (curious) for questions, etc.
|
|
28
|
-
- You may combine two tags on one sentence: (excited)(soft tone) Great news!
|
|
29
|
-
- Plain text only — no emojis, no markdown, no code blocks, no bullet points, no special characters.
|
|
30
|
-
- Keep technical jargon minimal; explain concepts simply as you would speak them.
|
|
31
|
-
- Match the language of the original response.
|
|
32
|
-
|
|
33
|
-
Output only the spoken summary text, nothing else.`;
|
|
17
|
+
const TTS_SYSTEM_PROMPT = `Summarize the given text in 1-2 spoken sentences for audio playback. Start each sentence with an emotion tag like (calm), (excited), (confident), (happy), or (curious) — placed before the first word of the sentence. Plain text only, no emojis, no markdown. Match the language of the text. Output only the summary, nothing else.`;
|
|
34
18
|
|
|
35
19
|
/**
|
|
36
20
|
* Generate a short TTS-optimized spoken summary of a response via LLM.
|
|
@@ -45,14 +29,20 @@ export async function generateTtsSummary(plainText, config) {
|
|
|
45
29
|
|
|
46
30
|
const response = await client.chat.completions.create({
|
|
47
31
|
model: config.selectedModel,
|
|
48
|
-
max_tokens: 200,
|
|
49
32
|
messages: [
|
|
50
33
|
{ role: 'system', content: TTS_SYSTEM_PROMPT },
|
|
51
34
|
{ role: 'user', content: `Summarize this for spoken audio:\n\n${plainText.slice(0, 3000)}` },
|
|
52
35
|
],
|
|
53
36
|
});
|
|
54
37
|
|
|
55
|
-
|
|
38
|
+
const choice = response.choices[0];
|
|
39
|
+
const msg = choice?.message;
|
|
40
|
+
const content = (msg?.content || msg?.reasoning_content || '').trim();
|
|
41
|
+
if (!content) {
|
|
42
|
+
// Surface the raw response structure so we can diagnose where the text actually is
|
|
43
|
+
throw new Error(`empty LLM response. finish_reason=${choice?.finish_reason} keys=${Object.keys(msg || {}).join(',')} raw=${JSON.stringify(msg).slice(0, 300)}`);
|
|
44
|
+
}
|
|
45
|
+
return content;
|
|
56
46
|
}
|
|
57
47
|
|
|
58
48
|
/**
|