@ducci/jarvis 1.0.86 → 1.0.88

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ducci/jarvis",
3
- "version": "1.0.86",
3
+ "version": "1.0.88",
4
4
  "description": "A fully automated agent system that lives on a server.",
5
5
  "main": "./src/index.js",
6
6
  "type": "module",
@@ -3,6 +3,12 @@
3
3
  */
4
4
 
5
5
  import { createClient } from './provider.js';
6
+ import { exec } from 'child_process';
7
+ import { promisify } from 'util';
8
+ import { writeFile, readFile, unlink } from 'fs/promises';
9
+ import { tmpdir } from 'os';
10
+ import { join } from 'path';
11
+ const execAsync = promisify(exec);
6
12
 
7
13
  // System prompt for TTS summary generation.
8
14
  // fish.audio s1 emotion tags: (emotion) at the START of a sentence only — applies to the whole sentence.
@@ -20,7 +26,7 @@ Rules:
20
26
  - Choose emotions that fit the content: use (confident) or (calm) for informational answers,
21
27
  (excited) or (satisfied) for completed tasks, (curious) for questions, etc.
22
28
  - You may combine two tags on one sentence: (excited)(soft tone) Great news!
23
- - No markdown, no code blocks, no bullet points plain speech only.
29
+ - Plain text only — no emojis, no markdown, no code blocks, no bullet points, no special characters.
24
30
  - Keep technical jargon minimal; explain concepts simply as you would speak them.
25
31
  - Match the language of the original response.
26
32
 
@@ -83,7 +89,13 @@ export async function textToSpeech(text, config) {
83
89
  throw new Error(`fish.audio TTS ${response.status}: ${errText.slice(0, 200)}`);
84
90
  }
85
91
 
86
- return Buffer.from(await response.arrayBuffer());
92
+ // Explicitly collect all streaming chunks — arrayBuffer() can miss trailing chunks
93
+ // on chunked transfer-encoded responses from fish.audio.
94
+ const chunks = [];
95
+ for await (const chunk of response.body) {
96
+ chunks.push(chunk);
97
+ }
98
+ return Buffer.concat(chunks);
87
99
  }
88
100
 
89
101
  /**
@@ -97,10 +109,24 @@ export async function textToSpeech(text, config) {
97
109
  export async function speechToText(audioBuffer, config) {
98
110
  const { fishAudioApiKey } = config;
99
111
 
112
+ // Telegram voice messages are OGG/Opus — fish.audio ASR doesn't support Opus.
113
+ // Convert to WAV first via ffmpeg.
114
+ const id = `jarvis-stt-${Date.now()}`;
115
+ const inPath = join(tmpdir(), `${id}.ogg`);
116
+ const outPath = join(tmpdir(), `${id}.wav`);
117
+ let wavBuffer;
118
+ try {
119
+ await writeFile(inPath, audioBuffer);
120
+ await execAsync(`ffmpeg -y -i "${inPath}" -ar 16000 -ac 1 "${outPath}"`);
121
+ wavBuffer = await readFile(outPath);
122
+ } finally {
123
+ unlink(inPath).catch(() => {});
124
+ unlink(outPath).catch(() => {});
125
+ }
126
+
100
127
  const formData = new FormData();
101
- const blob = new Blob([audioBuffer], { type: 'audio/ogg' });
102
- formData.append('audio', blob, 'voice.ogg');
103
- formData.append('ignore_timestamps', 'true');
128
+ const blob = new Blob([wavBuffer], { type: 'audio/wav' });
129
+ formData.append('audio', blob, 'voice.wav');
104
130
 
105
131
  const response = await fetch('https://api.fish.audio/v1/asr', {
106
132
  method: 'POST',