@ducci/jarvis 1.0.92 → 1.0.94
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -697,7 +697,6 @@ export async function startTelegramChannel(config) {
|
|
|
697
697
|
await sendMessage(api, chatId, displayText, result.sessionId);
|
|
698
698
|
console.log(`[telegram] response sent chat_id=${chatId} slot=${slot} length=${displayText.length}`);
|
|
699
699
|
// TTS: send audio summary if voice is enabled (config.voiceEnabled checked live, updated by /voice toggle)
|
|
700
|
-
await api.sendMessage(chatId, `[TTS debug] voiceEnabled=${config.voiceEnabled} hasKey=${!!config.fishAudioApiKey}`).catch(() => {});
|
|
701
700
|
if (config.voiceEnabled && config.fishAudioApiKey) {
|
|
702
701
|
try {
|
|
703
702
|
// If the response is a raw JSON blob (format_error recovery), extract the actual text
|
|
@@ -709,10 +708,8 @@ export async function startTelegramChannel(config) {
|
|
|
709
708
|
const plain = toPlainText(ttsSource);
|
|
710
709
|
if (plain) {
|
|
711
710
|
const ttsText = await generateTtsSummary(plain, config);
|
|
712
|
-
await api.sendMessage(chatId, `[TTS debug] plain: "${plain.slice(0, 300)}"\nsummary: "${ttsText}"`).catch(() => {});
|
|
713
711
|
if (ttsText) {
|
|
714
712
|
const audioBuffer = await textToSpeech(ttsText, config);
|
|
715
|
-
await api.sendMessage(chatId, `[TTS debug] audio bytes: ${audioBuffer.length}`).catch(() => {});
|
|
716
713
|
await api.sendAudio(chatId, new InputFile(audioBuffer, 'response.mp3'));
|
|
717
714
|
console.log(`[telegram] voice sent chat_id=${chatId} slot=${slot} tts_chars=${ttsText.length}`);
|
|
718
715
|
}
|
package/src/server/fish-audio.js
CHANGED
|
@@ -14,23 +14,7 @@ const execAsync = promisify(exec);
|
|
|
14
14
|
// fish.audio s1 emotion tags: (emotion) at the START of a sentence only — applies to the whole sentence.
|
|
15
15
|
// Multiple tags can be combined: (excited)(soft tone) Hello!
|
|
16
16
|
// Tags must never appear mid-sentence.
|
|
17
|
-
const TTS_SYSTEM_PROMPT = `
|
|
18
|
-
|
|
19
|
-
Rules:
|
|
20
|
-
- Output 1–3 short sentences maximum. Be concise — this is spoken audio, not text.
|
|
21
|
-
- Begin EACH sentence with exactly one emotion tag from this list, placed before the first word:
|
|
22
|
-
(happy) (sad) (angry) (excited) (calm) (nervous) (confident) (surprised) (satisfied)
|
|
23
|
-
(delighted) (scared) (worried) (frustrated) (empathetic) (curious) (sarcastic)
|
|
24
|
-
(optimistic) (determined) (proud) (relaxed) (in a hurry tone) (whispering) (soft tone)
|
|
25
|
-
- The emotion tag goes at the very start of the sentence, before any word. Never mid-sentence.
|
|
26
|
-
- Choose emotions that fit the content: use (confident) or (calm) for informational answers,
|
|
27
|
-
(excited) or (satisfied) for completed tasks, (curious) for questions, etc.
|
|
28
|
-
- You may combine two tags on one sentence: (excited)(soft tone) Great news!
|
|
29
|
-
- Plain text only — no emojis, no markdown, no code blocks, no bullet points, no special characters.
|
|
30
|
-
- Keep technical jargon minimal; explain concepts simply as you would speak them.
|
|
31
|
-
- Match the language of the original response.
|
|
32
|
-
|
|
33
|
-
Output only the spoken summary text, nothing else.`;
|
|
17
|
+
const TTS_SYSTEM_PROMPT = `Summarize the given text in 1-2 spoken sentences for audio playback. Start each sentence with an emotion tag like (calm), (excited), (confident), (happy), or (curious) — placed before the first word of the sentence. Plain text only, no emojis, no markdown. Match the language of the text. Output only the summary, nothing else.`;
|
|
34
18
|
|
|
35
19
|
/**
|
|
36
20
|
* Generate a short TTS-optimized spoken summary of a response via LLM.
|
|
@@ -45,14 +29,20 @@ export async function generateTtsSummary(plainText, config) {
|
|
|
45
29
|
|
|
46
30
|
const response = await client.chat.completions.create({
|
|
47
31
|
model: config.selectedModel,
|
|
48
|
-
max_tokens: 200,
|
|
49
32
|
messages: [
|
|
50
33
|
{ role: 'system', content: TTS_SYSTEM_PROMPT },
|
|
51
34
|
{ role: 'user', content: `Summarize this for spoken audio:\n\n${plainText.slice(0, 3000)}` },
|
|
52
35
|
],
|
|
53
36
|
});
|
|
54
37
|
|
|
55
|
-
|
|
38
|
+
const choice = response.choices[0];
|
|
39
|
+
const msg = choice?.message;
|
|
40
|
+
const content = (msg?.content || msg?.reasoning_content || '').trim();
|
|
41
|
+
if (!content) {
|
|
42
|
+
// Surface the raw response structure so we can diagnose where the text actually is
|
|
43
|
+
throw new Error(`empty LLM response. finish_reason=${choice?.finish_reason} keys=${Object.keys(msg || {}).join(',')} raw=${JSON.stringify(msg).slice(0, 300)}`);
|
|
44
|
+
}
|
|
45
|
+
return content;
|
|
56
46
|
}
|
|
57
47
|
|
|
58
48
|
/**
|