@ducci/jarvis 1.0.83 → 1.0.85
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/channels/telegram/index.js +117 -3
- package/src/server/agent.js +5 -5
- package/src/server/config.js +7 -1
- package/src/server/fish-audio.js +120 -0
package/package.json
CHANGED
|
@@ -6,7 +6,7 @@ const execAsync = promisify(exec);
|
|
|
6
6
|
import { createRequire } from 'module';
|
|
7
7
|
const _require = createRequire(import.meta.url);
|
|
8
8
|
const { version: JARVIS_VERSION } = _require('../../../package.json');
|
|
9
|
-
import { Bot, InlineKeyboard } from 'grammy';
|
|
9
|
+
import { Bot, InlineKeyboard, InputFile } from 'grammy';
|
|
10
10
|
import { run } from '@grammyjs/runner';
|
|
11
11
|
import { handleChat, requestAbort } from '../../server/agent.js';
|
|
12
12
|
import { loadSession } from '../../server/sessions.js';
|
|
@@ -14,6 +14,7 @@ import { PATHS } from '../../server/config.js';
|
|
|
14
14
|
import { isRunningCron, getRunningCrons } from '../../server/cron-scheduler.js';
|
|
15
15
|
import { load, save } from './sessions.js';
|
|
16
16
|
import { describeImage } from '../../server/vision.js';
|
|
17
|
+
import { textToSpeech, speechToText, generateTtsSummary } from '../../server/fish-audio.js';
|
|
17
18
|
|
|
18
19
|
function getTelegramChatLogPath(chatId, sessionId) {
|
|
19
20
|
const prefix = sessionId ? String(sessionId).slice(0, 8) : 'unknown';
|
|
@@ -37,6 +38,16 @@ function stripHtml(text) {
|
|
|
37
38
|
.replace(/&lt;/g, '<').replace(/&gt;/g, '>').replace(/&amp;/g, '&');
|
|
38
39
|
}
|
|
39
40
|
|
|
41
|
+
// Strip HTML for passing plain text to the TTS summary LLM.
|
|
42
|
+
function toPlainText(htmlText) {
|
|
43
|
+
let text = htmlText;
|
|
44
|
+
// Remove <pre> code blocks entirely — not useful spoken
|
|
45
|
+
text = text.replace(/<pre>[\s\S]*?<\/pre>/gi, '');
|
|
46
|
+
// Remove <code> inline tags but keep content
|
|
47
|
+
text = text.replace(/<code>([^<]*)<\/code>/gi, '$1');
|
|
48
|
+
return stripHtml(text).replace(/[ \t]+/g, ' ').trim();
|
|
49
|
+
}
|
|
50
|
+
|
|
40
51
|
function markdownToHtml(text) {
|
|
41
52
|
// 0. Sanitize unsupported Telegram HTML tags
|
|
42
53
|
// Headings → <b>
|
|
@@ -248,6 +259,7 @@ export async function startTelegramChannel(config) {
|
|
|
248
259
|
{ command: 'stop', description: 'Stop the running agent on the active slot' },
|
|
249
260
|
{ command: 'slots', description: 'Show all slots and their status' },
|
|
250
261
|
{ command: 'crons', description: 'Show all crons, running status and next run' },
|
|
262
|
+
{ command: 'voice', description: 'Toggle voice responses on/off (fish.audio TTS)' },
|
|
251
263
|
{ command: 'version', description: 'Show Jarvis version' },
|
|
252
264
|
{ command: 'update', description: 'Update Jarvis to the latest version' },
|
|
253
265
|
{ command: 'restart', description: 'Restart Jarvis' },
|
|
@@ -329,9 +341,9 @@ export async function startTelegramChannel(config) {
|
|
|
329
341
|
}
|
|
330
342
|
|
|
331
343
|
const totalMessages = Math.max(0, session.messages.length - 1); // exclude system prompt
|
|
332
|
-
const windowed = session.messages.length <= config.
|
|
344
|
+
const windowed = session.messages.length <= config.messageWindow + 1
|
|
333
345
|
? session.messages
|
|
334
|
-
: [session.messages[0], ...session.messages.slice(-config.
|
|
346
|
+
: [session.messages[0], ...session.messages.slice(-config.messageWindow)];
|
|
335
347
|
const inContext = Math.max(0, windowed.length - 1);
|
|
336
348
|
const estimatedTokens = Math.round(JSON.stringify(windowed).length / 4);
|
|
337
349
|
const model = config.selectedModel || 'unknown';
|
|
@@ -573,6 +585,28 @@ export async function startTelegramChannel(config) {
|
|
|
573
585
|
await ctx.reply(lines.join('\n'), { parse_mode: 'HTML' });
|
|
574
586
|
});
|
|
575
587
|
|
|
588
|
+
bot.command('voice', async (ctx) => {
|
|
589
|
+
const userId = ctx.from?.id;
|
|
590
|
+
if (!allowedUserIds.includes(userId)) return;
|
|
591
|
+
|
|
592
|
+
if (!config.fishAudioApiKey) {
|
|
593
|
+
await ctx.reply('fish.audio not configured. Add FISH_AUDIO_API_KEY to ~/.jarvis/.env first.');
|
|
594
|
+
return;
|
|
595
|
+
}
|
|
596
|
+
|
|
597
|
+
// Toggle voiceEnabled in settings.json and update live config
|
|
598
|
+
let settings = {};
|
|
599
|
+
try {
|
|
600
|
+
settings = JSON.parse(fs.readFileSync(PATHS.settingsFile, 'utf8'));
|
|
601
|
+
} catch { /* ignore */ }
|
|
602
|
+
settings.voiceEnabled = !config.voiceEnabled;
|
|
603
|
+
fs.writeFileSync(PATHS.settingsFile, JSON.stringify(settings, null, 2), 'utf8');
|
|
604
|
+
config.voiceEnabled = settings.voiceEnabled;
|
|
605
|
+
|
|
606
|
+
const status = config.voiceEnabled ? 'on' : 'off';
|
|
607
|
+
await ctx.reply(`Voice responses: <b>${status}</b>`, { parse_mode: 'HTML' });
|
|
608
|
+
});
|
|
609
|
+
|
|
576
610
|
// Runs one or more batches until the pending queue is drained.
|
|
577
611
|
// Each iteration takes all currently pending messages, merges them into a
|
|
578
612
|
// single user turn, calls handleChat once, and sends one response.
|
|
@@ -662,6 +696,22 @@ export async function startTelegramChannel(config) {
|
|
|
662
696
|
await appendTelegramChatLog(chatId, result.sessionId, 'JARVIS', displayText);
|
|
663
697
|
await sendMessage(api, chatId, displayText, result.sessionId);
|
|
664
698
|
console.log(`[telegram] response sent chat_id=${chatId} slot=${slot} length=${displayText.length}`);
|
|
699
|
+
// TTS: send audio summary if voice is enabled (config.voiceEnabled checked live, updated by /voice toggle)
|
|
700
|
+
if (config.voiceEnabled && config.fishAudioApiKey) {
|
|
701
|
+
try {
|
|
702
|
+
const plain = toPlainText(displayText);
|
|
703
|
+
if (plain) {
|
|
704
|
+
const ttsText = await generateTtsSummary(plain, config);
|
|
705
|
+
if (ttsText) {
|
|
706
|
+
const audioBuffer = await textToSpeech(ttsText, config);
|
|
707
|
+
await api.sendAudio(chatId, new InputFile(audioBuffer, 'response.mp3'));
|
|
708
|
+
console.log(`[telegram] voice sent chat_id=${chatId} slot=${slot} tts_chars=${ttsText.length}`);
|
|
709
|
+
}
|
|
710
|
+
}
|
|
711
|
+
} catch (e) {
|
|
712
|
+
console.error(`[telegram] TTS error chat_id=${chatId}: ${e.message}`);
|
|
713
|
+
}
|
|
714
|
+
}
|
|
665
715
|
} else {
|
|
666
716
|
console.log(`[telegram] skipped duplicate final response chat_id=${chatId} slot=${slot}`);
|
|
667
717
|
}
|
|
@@ -729,6 +779,70 @@ export async function startTelegramChannel(config) {
|
|
|
729
779
|
}
|
|
730
780
|
});
|
|
731
781
|
|
|
782
|
+
bot.on('message:voice', async (ctx) => {
|
|
783
|
+
const userId = ctx.from?.id;
|
|
784
|
+
if (!allowedUserIds.includes(userId)) return;
|
|
785
|
+
|
|
786
|
+
const chatId = ctx.chat.id;
|
|
787
|
+
const ts = new Date().toISOString();
|
|
788
|
+
|
|
789
|
+
if (!config.fishAudioApiKey) {
|
|
790
|
+
await ctx.reply('Voice input not configured. Add FISH_AUDIO_API_KEY to ~/.jarvis/.env first.');
|
|
791
|
+
return;
|
|
792
|
+
}
|
|
793
|
+
|
|
794
|
+
console.log(`[telegram] incoming voice chat_id=${chatId}`);
|
|
795
|
+
await ctx.api.sendChatAction(chatId, 'typing');
|
|
796
|
+
|
|
797
|
+
// Download voice file (OGG/Opus from Telegram)
|
|
798
|
+
let transcription;
|
|
799
|
+
try {
|
|
800
|
+
const file = await ctx.api.getFile(ctx.message.voice.file_id);
|
|
801
|
+
const fileUrl = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
|
|
802
|
+
const audioResponse = await fetch(fileUrl);
|
|
803
|
+
const audioBuffer = Buffer.from(await audioResponse.arrayBuffer());
|
|
804
|
+
transcription = await speechToText(audioBuffer, config);
|
|
805
|
+
} catch (e) {
|
|
806
|
+
console.error(`[telegram] STT error chat_id=${chatId}: ${e.message}`);
|
|
807
|
+
await ctx.reply('Sorry, could not transcribe the voice message.').catch(() => {});
|
|
808
|
+
return;
|
|
809
|
+
}
|
|
810
|
+
|
|
811
|
+
if (!transcription) {
|
|
812
|
+
await ctx.reply('Could not transcribe voice message (empty result).').catch(() => {});
|
|
813
|
+
return;
|
|
814
|
+
}
|
|
815
|
+
|
|
816
|
+
console.log(`[telegram] voice transcribed chat_id=${chatId}: "${transcription.slice(0, 80)}"`);
|
|
817
|
+
// Echo transcription back so user can confirm what was understood
|
|
818
|
+
await ctx.reply(`<i>🎤 ${escapeHtml(transcription)}</i>`, { parse_mode: 'HTML' }).catch(() => {});
|
|
819
|
+
|
|
820
|
+
const entry = { text: transcription, attachments: [], ts };
|
|
821
|
+
const slot = getActiveSlot(chatId);
|
|
822
|
+
const key = slotKey(chatId, slot);
|
|
823
|
+
|
|
824
|
+
if (isRunning.has(key)) {
|
|
825
|
+
if (!pendingMessages.has(key)) pendingMessages.set(key, []);
|
|
826
|
+
pendingMessages.get(key).push(entry);
|
|
827
|
+
console.log(`[telegram] buffered voice chat_id=${chatId} slot=${slot} pending=${pendingMessages.get(key).length}`);
|
|
828
|
+
return;
|
|
829
|
+
}
|
|
830
|
+
|
|
831
|
+
isRunning.add(key);
|
|
832
|
+
runStartTimes.set(key, new Date());
|
|
833
|
+
const typingInterval = setInterval(() => {
|
|
834
|
+
ctx.api.sendChatAction(chatId, 'typing').catch(() => {});
|
|
835
|
+
}, 4000);
|
|
836
|
+
|
|
837
|
+
try {
|
|
838
|
+
await processQueue(ctx.api, chatId, slot, [entry]);
|
|
839
|
+
} finally {
|
|
840
|
+
clearInterval(typingInterval);
|
|
841
|
+
isRunning.delete(key);
|
|
842
|
+
runStartTimes.delete(key);
|
|
843
|
+
}
|
|
844
|
+
});
|
|
845
|
+
|
|
732
846
|
bot.on('message:text', async (ctx) => {
|
|
733
847
|
const userId = ctx.from?.id;
|
|
734
848
|
|
package/src/server/agent.js
CHANGED
|
@@ -202,8 +202,8 @@ async function runSubagent(client, config, args, parentSessionId) {
|
|
|
202
202
|
}
|
|
203
203
|
return msg;
|
|
204
204
|
});
|
|
205
|
-
if (resolved.length <= subConfig.
|
|
206
|
-
return [resolved[0], ...resolved.slice(-(subConfig.
|
|
205
|
+
if (resolved.length <= subConfig.messageWindow + 1) return resolved;
|
|
206
|
+
return [resolved[0], ...resolved.slice(-(subConfig.messageWindow))];
|
|
207
207
|
}
|
|
208
208
|
|
|
209
209
|
const run = await runAgentLoop(client, subConfig, subSession, prepareMessages, usageAccum);
|
|
@@ -766,7 +766,7 @@ async function _runHandleChat(config, sessionId, userMessage, attachments = [],
|
|
|
766
766
|
|
|
767
767
|
// Resolves {{user_info}} in system prompt at runtime (never persisted).
|
|
768
768
|
// Applies a sliding window: always includes the system prompt (messages[0])
|
|
769
|
-
// plus the most recent
|
|
769
|
+
// plus the most recent messageWindow messages, so long sessions don't overflow
|
|
770
770
|
// the model's context. Full history is always preserved on disk.
|
|
771
771
|
function prepareMessages(messages) {
|
|
772
772
|
const resolved = messages.map((msg, i) => {
|
|
@@ -775,8 +775,8 @@ async function _runHandleChat(config, sessionId, userMessage, attachments = [],
|
|
|
775
775
|
}
|
|
776
776
|
return msg;
|
|
777
777
|
});
|
|
778
|
-
if (resolved.length <= config.
|
|
779
|
-
return [resolved[0], ...resolved.slice(-(config.
|
|
778
|
+
if (resolved.length <= config.messageWindow + 1) return resolved;
|
|
779
|
+
return [resolved[0], ...resolved.slice(-(config.messageWindow))];
|
|
780
780
|
}
|
|
781
781
|
|
|
782
782
|
const allToolCalls = [];
|
package/src/server/config.js
CHANGED
|
@@ -64,6 +64,9 @@ export function loadConfig() {
|
|
|
64
64
|
visionApiKey = process.env.OPENROUTER_API_KEY || null;
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
+
// fish.audio voice (optional) — TTS for outgoing responses, STT for incoming voice messages
|
|
68
|
+
const fishAudioApiKey = process.env.FISH_AUDIO_API_KEY || null;
|
|
69
|
+
|
|
67
70
|
return {
|
|
68
71
|
provider,
|
|
69
72
|
apiKey,
|
|
@@ -71,7 +74,7 @@ export function loadConfig() {
|
|
|
71
74
|
fallbackModel: settings.fallbackModel || (provider === 'anthropic' ? 'claude-haiku-4-5-20251001' : 'openrouter/free'),
|
|
72
75
|
maxIterations: settings.maxIterations || 20,
|
|
73
76
|
maxHandoffs: settings.maxHandoffs || 3,
|
|
74
|
-
|
|
77
|
+
messageWindow: settings.messageWindow || 300,
|
|
75
78
|
modelContextWindow: settings.modelContextWindow || null,
|
|
76
79
|
port: settings.port || 18008,
|
|
77
80
|
telegram: {
|
|
@@ -81,6 +84,9 @@ export function loadConfig() {
|
|
|
81
84
|
visionProvider,
|
|
82
85
|
visionModel,
|
|
83
86
|
visionApiKey,
|
|
87
|
+
voiceEnabled: settings.voiceEnabled || false,
|
|
88
|
+
fishAudioApiKey,
|
|
89
|
+
fishAudioVoiceId: settings.fishAudioVoiceId || null,
|
|
84
90
|
};
|
|
85
91
|
}
|
|
86
92
|
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* fish.audio API integration — TTS and STT.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { createClient } from './provider.js';
|
|
6
|
+
|
|
7
|
+
// System prompt for TTS summary generation.
|
|
8
|
+
// fish.audio s1 emotion tags: (emotion) at the START of a sentence only — applies to the whole sentence.
|
|
9
|
+
// Multiple tags can be combined: (excited)(soft tone) Hello!
|
|
10
|
+
// Tags must never appear mid-sentence.
|
|
11
|
+
const TTS_SYSTEM_PROMPT = `You summarize AI assistant responses into a short spoken version for text-to-speech audio.
|
|
12
|
+
|
|
13
|
+
Rules:
|
|
14
|
+
- Output 1–3 short sentences maximum. Be concise — this is spoken audio, not text.
|
|
15
|
+
- Begin EACH sentence with exactly one emotion tag from this list, placed before the first word:
|
|
16
|
+
(happy) (sad) (angry) (excited) (calm) (nervous) (confident) (surprised) (satisfied)
|
|
17
|
+
(delighted) (scared) (worried) (frustrated) (empathetic) (curious) (sarcastic)
|
|
18
|
+
(optimistic) (determined) (proud) (relaxed) (in a hurry tone) (whispering) (soft tone)
|
|
19
|
+
- The emotion tag goes at the very start of the sentence, before any word. Never mid-sentence.
|
|
20
|
+
- Choose emotions that fit the content: use (confident) or (calm) for informational answers,
|
|
21
|
+
(excited) or (satisfied) for completed tasks, (curious) for questions, etc.
|
|
22
|
+
- You may combine two tags on one sentence: (excited)(soft tone) Great news!
|
|
23
|
+
- No markdown, no code blocks, no bullet points — plain speech only.
|
|
24
|
+
- Keep technical jargon minimal; explain concepts simply as you would speak them.
|
|
25
|
+
- Match the language of the original response.
|
|
26
|
+
|
|
27
|
+
Output only the spoken summary text, nothing else.`;
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Generate a short TTS-optimized spoken summary of a response via LLM.
|
|
31
|
+
* Includes fish.audio s1 emotion tags at sentence starts.
|
|
32
|
+
*
|
|
33
|
+
* @param {string} plainText - Plain text of the agent response (HTML already stripped)
|
|
34
|
+
* @param {object} config - Full app config (provider, apiKey, selectedModel)
|
|
35
|
+
* @returns {Promise<string>} - TTS-ready text with emotion tags
|
|
36
|
+
*/
|
|
37
|
+
export async function generateTtsSummary(plainText, config) {
|
|
38
|
+
const client = createClient(config);
|
|
39
|
+
|
|
40
|
+
const response = await client.chat.completions.create({
|
|
41
|
+
model: config.selectedModel,
|
|
42
|
+
max_tokens: 200,
|
|
43
|
+
messages: [
|
|
44
|
+
{ role: 'system', content: TTS_SYSTEM_PROMPT },
|
|
45
|
+
{ role: 'user', content: `Summarize this for spoken audio:\n\n${plainText.slice(0, 3000)}` },
|
|
46
|
+
],
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
return (response.choices[0]?.message?.content || '').trim();
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Convert text to speech via fish.audio TTS API.
|
|
54
|
+
* Returns a Buffer containing MP3 audio data.
|
|
55
|
+
*
|
|
56
|
+
* @param {string} text - TTS-ready text (may include fish.audio emotion tags)
|
|
57
|
+
* @param {object} config - Must include fishAudioApiKey and optionally fishAudioVoiceId
|
|
58
|
+
* @returns {Promise<Buffer>}
|
|
59
|
+
*/
|
|
60
|
+
export async function textToSpeech(text, config) {
|
|
61
|
+
const { fishAudioApiKey, fishAudioVoiceId } = config;
|
|
62
|
+
|
|
63
|
+
const body = {
|
|
64
|
+
text,
|
|
65
|
+
format: 'mp3',
|
|
66
|
+
latency: 'normal',
|
|
67
|
+
mp3_bitrate: 64,
|
|
68
|
+
};
|
|
69
|
+
if (fishAudioVoiceId) body.reference_id = fishAudioVoiceId;
|
|
70
|
+
|
|
71
|
+
const response = await fetch('https://api.fish.audio/v1/tts', {
|
|
72
|
+
method: 'POST',
|
|
73
|
+
headers: {
|
|
74
|
+
'Authorization': `Bearer ${fishAudioApiKey}`,
|
|
75
|
+
'Content-Type': 'application/json',
|
|
76
|
+
'model': 's1',
|
|
77
|
+
},
|
|
78
|
+
body: JSON.stringify(body),
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
if (!response.ok) {
|
|
82
|
+
const errText = await response.text().catch(() => '');
|
|
83
|
+
throw new Error(`fish.audio TTS ${response.status}: ${errText.slice(0, 200)}`);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
return Buffer.from(await response.arrayBuffer());
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Transcribe audio to text via fish.audio ASR API.
|
|
91
|
+
* audioBuffer should be OGG/Opus data (standard Telegram voice format).
|
|
92
|
+
*
|
|
93
|
+
* @param {Buffer} audioBuffer
|
|
94
|
+
* @param {object} config - Must include fishAudioApiKey
|
|
95
|
+
* @returns {Promise<string>} - Transcribed text
|
|
96
|
+
*/
|
|
97
|
+
export async function speechToText(audioBuffer, config) {
|
|
98
|
+
const { fishAudioApiKey } = config;
|
|
99
|
+
|
|
100
|
+
const formData = new FormData();
|
|
101
|
+
const blob = new Blob([audioBuffer], { type: 'audio/ogg' });
|
|
102
|
+
formData.append('audio', blob, 'voice.ogg');
|
|
103
|
+
formData.append('ignore_timestamps', 'true');
|
|
104
|
+
|
|
105
|
+
const response = await fetch('https://api.fish.audio/v1/asr', {
|
|
106
|
+
method: 'POST',
|
|
107
|
+
headers: {
|
|
108
|
+
'Authorization': `Bearer ${fishAudioApiKey}`,
|
|
109
|
+
},
|
|
110
|
+
body: formData,
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
if (!response.ok) {
|
|
114
|
+
const errText = await response.text().catch(() => '');
|
|
115
|
+
throw new Error(`fish.audio ASR ${response.status}: ${errText.slice(0, 200)}`);
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
const data = await response.json();
|
|
119
|
+
return (data.text || '').trim();
|
|
120
|
+
}
|