@polpo-ai/tools 0.6.32 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/__tests__/email-tools.test.d.ts +2 -0
- package/dist/__tests__/email-tools.test.d.ts.map +1 -0
- package/dist/__tests__/email-tools.test.js +705 -0
- package/dist/__tests__/email-tools.test.js.map +1 -0
- package/dist/__tests__/extended-tools.test.d.ts +2 -0
- package/dist/__tests__/extended-tools.test.d.ts.map +1 -0
- package/dist/__tests__/extended-tools.test.js +743 -0
- package/dist/__tests__/extended-tools.test.js.map +1 -0
- package/dist/__tests__/external-api-tools.test.d.ts +2 -0
- package/dist/__tests__/external-api-tools.test.d.ts.map +1 -0
- package/dist/__tests__/external-api-tools.test.js +1731 -0
- package/dist/__tests__/external-api-tools.test.js.map +1 -0
- package/dist/__tests__/memory-tools.test.d.ts +2 -0
- package/dist/__tests__/memory-tools.test.d.ts.map +1 -0
- package/dist/__tests__/memory-tools.test.js +0 -0
- package/dist/__tests__/memory-tools.test.js.map +1 -0
- package/dist/audio-tools.d.ts +25 -27
- package/dist/audio-tools.d.ts.map +1 -1
- package/dist/audio-tools.js +156 -438
- package/dist/audio-tools.js.map +1 -1
- package/dist/browser-tools.d.ts.map +1 -1
- package/dist/browser-tools.js +5 -1
- package/dist/browser-tools.js.map +1 -1
- package/dist/email-tools.d.ts.map +1 -1
- package/dist/email-tools.js +11 -3
- package/dist/email-tools.js.map +1 -1
- package/dist/image-tools.d.ts +27 -25
- package/dist/image-tools.d.ts.map +1 -1
- package/dist/image-tools.js +151 -332
- package/dist/image-tools.js.map +1 -1
- package/dist/index.d.ts +1 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -2
- package/dist/index.js.map +1 -1
- package/dist/lib/edge-speech-model.d.ts +61 -0
- package/dist/lib/edge-speech-model.d.ts.map +1 -0
- package/dist/lib/edge-speech-model.js +144 -0
- package/dist/lib/edge-speech-model.js.map +1 -0
- package/dist/lib/exa-search-provider.d.ts +27 -0
- package/dist/lib/exa-search-provider.d.ts.map +1 -0
- package/dist/lib/exa-search-provider.js +109 -0
- package/dist/lib/exa-search-provider.js.map +1 -0
- package/dist/lib/provider-resolver.d.ts +54 -0
- package/dist/lib/provider-resolver.d.ts.map +1 -0
- package/dist/lib/provider-resolver.js +115 -0
- package/dist/lib/provider-resolver.js.map +1 -0
- package/dist/search-tools.d.ts +10 -13
- package/dist/search-tools.d.ts.map +1 -1
- package/dist/search-tools.js +63 -140
- package/dist/search-tools.js.map +1 -1
- package/dist/system-tools.d.ts +19 -5
- package/dist/system-tools.d.ts.map +1 -1
- package/dist/system-tools.js +16 -10
- package/dist/system-tools.js.map +1 -1
- package/package.json +12 -2
- package/dist/phone-tools.d.ts +0 -27
- package/dist/phone-tools.d.ts.map +0 -1
- package/dist/phone-tools.js +0 -577
- package/dist/phone-tools.js.map +0 -1
package/dist/audio-tools.js
CHANGED
|
@@ -1,39 +1,28 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Audio tools for speech-to-text (STT) and text-to-speech (TTS).
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
4
|
+
* Architecture: thin wrappers over the Vercel AI SDK v6.
|
|
5
|
+
* - audio_transcribe → `experimental_transcribe`
|
|
6
|
+
* - audio_speak → `experimental_generateSpeech`
|
|
7
7
|
*
|
|
8
|
-
*
|
|
8
|
+
* Model selection: each tool picks its model in this order:
|
|
9
|
+
* 1. per-call `model` input (`<provider>/<model>` string),
|
|
10
|
+
* 2. agent-config default (transcribe_model / tts_model),
|
|
11
|
+
* 3. DEFAULT_TRANSCRIBE_MODEL / DEFAULT_TTS_MODEL from @polpo-ai/core.
|
|
9
12
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
* Edge TTS: Uses Microsoft Edge's neural TTS engine via the `edge-tts` CLI.
|
|
15
|
-
* Free, no API key, ~400 voices in 60+ languages. Auto-selects voice from
|
|
16
|
-
* language + gender params. Also used as automatic fallback when cloud providers fail.
|
|
17
|
-
* Install: `pip install edge-tts`
|
|
18
|
-
*
|
|
19
|
-
* Credential resolution order (same as email/image tools):
|
|
20
|
-
* 1. Agent vault (per-agent credentials — e.g. service "openai" key "key")
|
|
21
|
-
* 2. Environment variables (global fallback)
|
|
22
|
-
* 3. Edge TTS (automatic fallback — no credentials needed)
|
|
23
|
-
*
|
|
24
|
-
* Environment variables (fallback):
|
|
25
|
-
* OPENAI_API_KEY — openai provider (STT + TTS)
|
|
26
|
-
* DEEPGRAM_API_KEY — deepgram provider (STT + TTS)
|
|
27
|
-
* ELEVENLABS_API_KEY — elevenlabs provider (TTS)
|
|
13
|
+
* audio_speak's `edge` provider is wrapped as a custom SpeechModelV3 in
|
|
14
|
+
* `lib/edge-speech-model.ts` so it slots into the same SDK call as
|
|
15
|
+
* cloud providers (no special-casing in the tool layer).
|
|
28
16
|
*/
|
|
29
17
|
import { resolve, dirname, extname } from "node:path";
|
|
30
18
|
import { Type } from "@sinclair/typebox";
|
|
19
|
+
import { parseModelString, DEFAULT_TRANSCRIBE_MODEL, DEFAULT_TTS_MODEL, } from "@polpo-ai/core";
|
|
31
20
|
import { NodeFileSystem } from "./adapters/node-filesystem.js";
|
|
32
21
|
import { NodeShell } from "./adapters/node-shell.js";
|
|
33
22
|
import { resolveAllowedPaths, assertPathAllowed } from "./path-sandbox.js";
|
|
23
|
+
import { resolveTranscribeProvider, resolveSpeakProvider, } from "./lib/provider-resolver.js";
|
|
34
24
|
// ─── Constants ───
|
|
35
25
|
const MAX_AUDIO_SIZE = 25 * 1024 * 1024; // 25 MB (OpenAI Whisper limit)
|
|
36
|
-
const DEFAULT_TIMEOUT = 120_000; // 2 min for audio processing
|
|
37
26
|
// ─── Helpers ───
|
|
38
27
|
function requireEnv(key) {
|
|
39
28
|
const val = process.env[key];
|
|
@@ -41,54 +30,38 @@ function requireEnv(key) {
|
|
|
41
30
|
throw new Error(`Missing environment variable: ${key}. Set it before using this tool.`);
|
|
42
31
|
return val;
|
|
43
32
|
}
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
const form = new FormData();
|
|
47
|
-
const blob = new Blob([new Uint8Array(fileBuffer)], { type: mimeFromExt(extname(filename)) });
|
|
48
|
-
form.append("file", blob, filename);
|
|
49
|
-
for (const [k, v] of Object.entries(fields)) {
|
|
50
|
-
form.append(k, v);
|
|
51
|
-
}
|
|
52
|
-
return { body: form };
|
|
53
|
-
}
|
|
54
|
-
function mimeFromExt(ext) {
|
|
55
|
-
const map = {
|
|
56
|
-
".mp3": "audio/mpeg",
|
|
57
|
-
".wav": "audio/wav",
|
|
58
|
-
".flac": "audio/flac",
|
|
59
|
-
".ogg": "audio/ogg",
|
|
60
|
-
".m4a": "audio/mp4",
|
|
61
|
-
".webm": "audio/webm",
|
|
62
|
-
".mp4": "audio/mp4",
|
|
63
|
-
".mpeg": "audio/mpeg",
|
|
64
|
-
".mpga": "audio/mpeg",
|
|
65
|
-
};
|
|
66
|
-
return map[ext.toLowerCase()] ?? "application/octet-stream";
|
|
33
|
+
function resolveEffectiveModel(override, configured, fallback) {
|
|
34
|
+
return parseModelString(override ?? configured ?? fallback);
|
|
67
35
|
}
|
|
36
|
+
/** Default voices per TTS provider. Used when the input doesn't pass an explicit voice. */
|
|
37
|
+
const SPEAK_DEFAULT_VOICES = {
|
|
38
|
+
openai: "alloy",
|
|
39
|
+
deepgram: undefined, // voice is encoded in the model id
|
|
40
|
+
elevenlabs: "21m00Tcm4TlvDq8ikWAM", // Rachel
|
|
41
|
+
edge: undefined, // resolved from language+gender by EdgeSpeechModel
|
|
42
|
+
};
|
|
68
43
|
// ─── Tool: audio_transcribe ───
|
|
69
44
|
const AudioTranscribeSchema = Type.Object({
|
|
70
45
|
path: Type.String({ description: "Path to the audio file to transcribe (mp3, wav, flac, ogg, m4a, webm)" }),
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
model: Type.Optional(Type.String({ description: "Model name. OpenAI: 'whisper-1' (default). Deepgram: 'nova-3' (default)." })),
|
|
46
|
+
model: Type.Optional(Type.String({
|
|
47
|
+
description: "Override the agent's transcribe_model for this call. Format: '<provider>/<model>' " +
|
|
48
|
+
"(e.g. 'openai/whisper-1', 'deepgram/nova-3'). When omitted, uses the agent's configured transcribe_model.",
|
|
49
|
+
})),
|
|
76
50
|
language: Type.Optional(Type.String({ description: "ISO 639-1 language code (e.g. 'en', 'it', 'es'). Helps accuracy." })),
|
|
77
|
-
prompt: Type.Optional(Type.String({ description: "Optional context/prompt to guide transcription (OpenAI only)" })),
|
|
51
|
+
prompt: Type.Optional(Type.String({ description: "Optional context/prompt to guide transcription (OpenAI Whisper only)" })),
|
|
78
52
|
});
|
|
79
|
-
function createTranscribeTool(cwd, sandbox, fs, vault) {
|
|
53
|
+
function createTranscribeTool(cwd, sandbox, fs, configuredModel, vault) {
|
|
80
54
|
return {
|
|
81
55
|
name: "audio_transcribe",
|
|
82
56
|
label: "Transcribe Audio",
|
|
83
57
|
description: "Transcribe an audio file to text using speech-to-text AI. " +
|
|
84
58
|
"Supports mp3, wav, flac, ogg, m4a, webm formats. Max file size: 25 MB. " +
|
|
85
|
-
"
|
|
86
|
-
"
|
|
59
|
+
"Model is configured at agent level (transcribe_model) — pass `model` here only to override per-call. " +
|
|
60
|
+
"Default: openai/whisper-1. Supported providers: openai, deepgram.",
|
|
87
61
|
parameters: AudioTranscribeSchema,
|
|
88
62
|
async execute(_id, params, signal) {
|
|
89
63
|
const filePath = resolve(cwd, params.path);
|
|
90
64
|
assertPathAllowed(filePath, sandbox, "audio_transcribe");
|
|
91
|
-
const provider = params.provider ?? "openai";
|
|
92
65
|
if (!fs.readFileBuffer) {
|
|
93
66
|
return {
|
|
94
67
|
content: [{ type: "text", text: "FileSystem implementation does not support readFileBuffer (required for binary reads)." }],
|
|
@@ -113,111 +86,61 @@ function createTranscribeTool(cwd, sandbox, fs, vault) {
|
|
|
113
86
|
};
|
|
114
87
|
}
|
|
115
88
|
try {
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
}
|
|
119
|
-
else {
|
|
120
|
-
return await transcribeDeepgram(filePath, fileBuffer, params, vault, signal);
|
|
121
|
-
}
|
|
89
|
+
const parsed = resolveEffectiveModel(params.model, configuredModel, DEFAULT_TRANSCRIBE_MODEL);
|
|
90
|
+
return await transcribeWithSdk(filePath, fileBuffer, parsed, params, vault, signal);
|
|
122
91
|
}
|
|
123
92
|
catch (err) {
|
|
124
93
|
return {
|
|
125
|
-
content: [{ type: "text", text: `Transcription error
|
|
126
|
-
details: {
|
|
94
|
+
content: [{ type: "text", text: `Transcription error: ${err.message}` }],
|
|
95
|
+
details: { error: err.message },
|
|
127
96
|
};
|
|
128
97
|
}
|
|
129
98
|
},
|
|
130
99
|
};
|
|
131
100
|
}
|
|
132
|
-
async function
|
|
133
|
-
const
|
|
134
|
-
const
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
body,
|
|
150
|
-
signal: controller.signal,
|
|
151
|
-
});
|
|
152
|
-
clearTimeout(timer);
|
|
153
|
-
if (!response.ok) {
|
|
154
|
-
const errText = await response.text();
|
|
155
|
-
throw new Error(`OpenAI API ${response.status}: ${errText}`);
|
|
101
|
+
async function transcribeWithSdk(_filePath, fileBuffer, parsed, params, vault, signal) {
|
|
102
|
+
const { experimental_transcribe } = await import("ai");
|
|
103
|
+
const apiKey = parsed.provider === "openai"
|
|
104
|
+
? vault?.getKey("openai", "key") ?? requireEnv("OPENAI_API_KEY")
|
|
105
|
+
: parsed.provider === "deepgram"
|
|
106
|
+
? vault?.getKey("deepgram", "key") ?? requireEnv("DEEPGRAM_API_KEY")
|
|
107
|
+
: (() => { throw new Error(`Unsupported transcribe provider: ${parsed.provider}`); })();
|
|
108
|
+
const provider = await resolveTranscribeProvider(parsed.provider, apiKey);
|
|
109
|
+
const providerOptions = {};
|
|
110
|
+
if (parsed.provider === "openai") {
|
|
111
|
+
const opts = {};
|
|
112
|
+
if (params.language)
|
|
113
|
+
opts.language = params.language;
|
|
114
|
+
if (params.prompt)
|
|
115
|
+
opts.prompt = params.prompt;
|
|
116
|
+
if (Object.keys(opts).length)
|
|
117
|
+
providerOptions.openai = opts;
|
|
156
118
|
}
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
].join(" | ");
|
|
163
|
-
return {
|
|
164
|
-
content: [{ type: "text", text: `${info}\n\n${data.text}` }],
|
|
165
|
-
details: {
|
|
166
|
-
provider: "openai",
|
|
167
|
-
model,
|
|
168
|
-
language: data.language,
|
|
169
|
-
duration: data.duration,
|
|
170
|
-
textLength: data.text.length,
|
|
171
|
-
},
|
|
172
|
-
};
|
|
173
|
-
}
|
|
174
|
-
async function transcribeDeepgram(filePath, fileBuffer, params, vault, signal) {
|
|
175
|
-
const apiKey = vault?.getKey("deepgram", "key") ?? requireEnv("DEEPGRAM_API_KEY");
|
|
176
|
-
const model = params.model ?? "nova-3";
|
|
177
|
-
const queryParams = new URLSearchParams({
|
|
178
|
-
model,
|
|
179
|
-
smart_format: "true",
|
|
180
|
-
punctuate: "true",
|
|
181
|
-
});
|
|
182
|
-
if (params.language)
|
|
183
|
-
queryParams.set("language", params.language);
|
|
184
|
-
const controller = new AbortController();
|
|
185
|
-
const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT);
|
|
186
|
-
if (signal)
|
|
187
|
-
signal.addEventListener("abort", () => controller.abort(), { once: true });
|
|
188
|
-
const ext = extname(filePath).toLowerCase();
|
|
189
|
-
const mime = mimeFromExt(ext);
|
|
190
|
-
const response = await fetch(`https://api.deepgram.com/v1/listen?${queryParams}`, {
|
|
191
|
-
method: "POST",
|
|
192
|
-
headers: {
|
|
193
|
-
Authorization: `Token ${apiKey}`,
|
|
194
|
-
"Content-Type": mime,
|
|
195
|
-
},
|
|
196
|
-
body: new Uint8Array(fileBuffer),
|
|
197
|
-
signal: controller.signal,
|
|
198
|
-
});
|
|
199
|
-
clearTimeout(timer);
|
|
200
|
-
if (!response.ok) {
|
|
201
|
-
const errText = await response.text();
|
|
202
|
-
throw new Error(`Deepgram API ${response.status}: ${errText}`);
|
|
119
|
+
else {
|
|
120
|
+
const opts = { smart_format: true, punctuate: true };
|
|
121
|
+
if (params.language)
|
|
122
|
+
opts.language = params.language;
|
|
123
|
+
providerOptions.deepgram = opts;
|
|
203
124
|
}
|
|
204
|
-
const
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
125
|
+
const result = await experimental_transcribe({
|
|
126
|
+
model: provider.transcription(parsed.model),
|
|
127
|
+
audio: new Uint8Array(fileBuffer),
|
|
128
|
+
providerOptions: Object.keys(providerOptions).length ? providerOptions : undefined,
|
|
129
|
+
abortSignal: signal,
|
|
130
|
+
});
|
|
208
131
|
const info = [
|
|
209
|
-
`
|
|
210
|
-
`Duration: ${
|
|
211
|
-
`Model: ${model}`,
|
|
132
|
+
`Language: ${result.language ?? "unknown"}`,
|
|
133
|
+
`Duration: ${result.durationInSeconds ? `${result.durationInSeconds.toFixed(1)}s` : "unknown"}`,
|
|
134
|
+
`Model: ${parsed.provider}/${parsed.model}`,
|
|
212
135
|
].join(" | ");
|
|
213
136
|
return {
|
|
214
|
-
content: [{ type: "text", text: `${info}\n\n${
|
|
137
|
+
content: [{ type: "text", text: `${info}\n\n${result.text}` }],
|
|
215
138
|
details: {
|
|
216
|
-
provider:
|
|
217
|
-
model,
|
|
218
|
-
|
|
219
|
-
duration,
|
|
220
|
-
textLength:
|
|
139
|
+
provider: parsed.provider,
|
|
140
|
+
model: parsed.model,
|
|
141
|
+
language: result.language,
|
|
142
|
+
duration: result.durationInSeconds,
|
|
143
|
+
textLength: result.text.length,
|
|
221
144
|
},
|
|
222
145
|
};
|
|
223
146
|
}
|
|
@@ -225,324 +148,118 @@ async function transcribeDeepgram(filePath, fileBuffer, params, vault, signal) {
|
|
|
225
148
|
const AudioSpeakSchema = Type.Object({
|
|
226
149
|
text: Type.String({ description: "Text to convert to speech" }),
|
|
227
150
|
path: Type.String({ description: "Output file path (e.g. 'output.mp3'). Format inferred from extension." }),
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
model: Type.Optional(Type.String({ description: "Model name. OpenAI: 'tts-1' (default), 'tts-1-hd', 'gpt-4o-mini-tts'. Deepgram: 'aura-2-en' (default). ElevenLabs: 'eleven_multilingual_v2' (default)." })),
|
|
235
|
-
voice: Type.Optional(Type.String({ description: "Voice name/ID. OpenAI: alloy, echo, fable, onyx, nova, shimmer (default: alloy). ElevenLabs: voice ID. Edge: full voice name like 'it-IT-DiegoNeural' (auto-selected from language+gender if omitted)." })),
|
|
151
|
+
model: Type.Optional(Type.String({
|
|
152
|
+
description: "Override the agent's tts_model for this call. Format: '<provider>/<model>' " +
|
|
153
|
+
"(e.g. 'openai/tts-1', 'openai/tts-1-hd', 'openai/gpt-4o-mini-tts', 'deepgram/aura-2-asteria-en', " +
|
|
154
|
+
"'elevenlabs/eleven_multilingual_v2', 'edge/edge-tts'). When omitted, uses the agent's configured tts_model.",
|
|
155
|
+
})),
|
|
156
|
+
voice: Type.Optional(Type.String({ description: "Voice name/ID. OpenAI: alloy/echo/fable/onyx/nova/shimmer (default: alloy). ElevenLabs: voice ID (default: Rachel). Edge: full voice name like 'it-IT-DiegoNeural' (auto-selected from language+gender if omitted)." })),
|
|
236
157
|
language: Type.Optional(Type.String({ description: "ISO 639-1 language code (e.g. 'it', 'en', 'es'). Used by edge provider to select the right voice. Also useful for other providers with multilingual models." })),
|
|
237
158
|
gender: Type.Optional(Type.Union([
|
|
238
159
|
Type.Literal("male"),
|
|
239
160
|
Type.Literal("female"),
|
|
240
|
-
], { description: "Voice gender preference. Used by edge provider to pick the right voice when no explicit voice is given.
|
|
161
|
+
], { description: "Voice gender preference. Used by the edge provider to pick the right voice when no explicit voice is given." })),
|
|
241
162
|
speed: Type.Optional(Type.Number({ description: "Playback speed 0.25-4.0 (OpenAI only, default: 1.0)" })),
|
|
242
163
|
instructions: Type.Optional(Type.String({ description: "Voice style instructions (OpenAI gpt-4o-mini-tts only, e.g. 'Speak in a cheerful tone')" })),
|
|
243
164
|
});
|
|
244
|
-
function
|
|
165
|
+
function audioFormat(filePath, providerName) {
|
|
166
|
+
const ext = extname(filePath).toLowerCase().replace(".", "");
|
|
167
|
+
if (providerName === "elevenlabs") {
|
|
168
|
+
const map = { mp3: "mp3_44100_128", wav: "pcm_44100", flac: "flac" };
|
|
169
|
+
return map[ext] ?? "mp3_44100_128";
|
|
170
|
+
}
|
|
171
|
+
return ext || "mp3";
|
|
172
|
+
}
|
|
173
|
+
function createSpeakTool(cwd, sandbox, fs, shell, configuredModel, vault) {
|
|
245
174
|
return {
|
|
246
175
|
name: "audio_speak",
|
|
247
176
|
label: "Text to Speech",
|
|
248
177
|
description: "Generate speech audio from text using text-to-speech AI. " +
|
|
249
|
-
"Output format
|
|
250
|
-
"
|
|
251
|
-
"
|
|
252
|
-
"Use 'language' (ISO 639-1) and 'gender' params to help select the right voice, especially for edge provider. " +
|
|
253
|
-
"Credentials resolved from: agent vault > OPENAI_API_KEY, DEEPGRAM_API_KEY, or ELEVENLABS_API_KEY env var.",
|
|
178
|
+
"Output format inferred from file extension (mp3, wav, flac, opus, aac, pcm). " +
|
|
179
|
+
"Model is configured at agent level (tts_model) — pass `model` here only to override per-call. " +
|
|
180
|
+
"Default: openai/tts-1. Supported providers: openai, deepgram, elevenlabs, edge (free, local Microsoft Edge TTS — no API key needed).",
|
|
254
181
|
parameters: AudioSpeakSchema,
|
|
255
182
|
async execute(_id, params, signal) {
|
|
256
183
|
const filePath = resolve(cwd, params.path);
|
|
257
184
|
assertPathAllowed(filePath, sandbox, "audio_speak");
|
|
258
|
-
const provider = params.provider ?? "openai";
|
|
259
|
-
// Direct edge-tts request — no fallback needed
|
|
260
|
-
if (provider === "edge") {
|
|
261
|
-
if (!(await edgeTtsAvailable(shell))) {
|
|
262
|
-
return {
|
|
263
|
-
content: [{ type: "text", text: "Edge TTS not available in this environment. Use openai/deepgram/elevenlabs instead." }],
|
|
264
|
-
details: { provider: "edge", error: "edge_tts_unavailable" },
|
|
265
|
-
};
|
|
266
|
-
}
|
|
267
|
-
try {
|
|
268
|
-
return await speakEdgeTts(filePath, params, fs, shell);
|
|
269
|
-
}
|
|
270
|
-
catch (err) {
|
|
271
|
-
return {
|
|
272
|
-
content: [{ type: "text", text: `TTS error (edge): ${err.message}` }],
|
|
273
|
-
details: { provider: "edge", error: err.message },
|
|
274
|
-
};
|
|
275
|
-
}
|
|
276
|
-
}
|
|
277
|
-
// Provider with edge-tts fallback
|
|
278
185
|
try {
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
}
|
|
282
|
-
else if (provider === "deepgram") {
|
|
283
|
-
return await speakDeepgram(filePath, params, fs, vault, signal);
|
|
284
|
-
}
|
|
285
|
-
else {
|
|
286
|
-
return await speakElevenLabs(filePath, params, fs, vault, signal);
|
|
287
|
-
}
|
|
186
|
+
const parsed = resolveEffectiveModel(params.model, configuredModel, DEFAULT_TTS_MODEL);
|
|
187
|
+
return await speakWithSdk(filePath, parsed, params, fs, shell, vault, signal);
|
|
288
188
|
}
|
|
289
189
|
catch (err) {
|
|
290
|
-
// Automatic fallback to edge-tts if available
|
|
291
|
-
if (await edgeTtsAvailable(shell)) {
|
|
292
|
-
try {
|
|
293
|
-
const result = await speakEdgeTts(filePath, params, fs, shell);
|
|
294
|
-
// Prepend fallback notice
|
|
295
|
-
const notice = `[Fallback] ${provider} failed (${err.message}), used edge-tts instead.\n`;
|
|
296
|
-
return {
|
|
297
|
-
content: [{ type: "text", text: notice + result.content[0].text }],
|
|
298
|
-
details: { ...result.details, fallbackFrom: provider, fallbackReason: err.message },
|
|
299
|
-
};
|
|
300
|
-
}
|
|
301
|
-
catch (edgeErr) {
|
|
302
|
-
return {
|
|
303
|
-
content: [{ type: "text", text: `TTS error (${provider}): ${err.message}\nEdge-tts fallback also failed: ${edgeErr.message}` }],
|
|
304
|
-
details: { provider, error: err.message, edgeError: edgeErr.message },
|
|
305
|
-
};
|
|
306
|
-
}
|
|
307
|
-
}
|
|
308
190
|
return {
|
|
309
|
-
content: [{ type: "text", text: `TTS error
|
|
310
|
-
details: {
|
|
191
|
+
content: [{ type: "text", text: `TTS error: ${err.message}` }],
|
|
192
|
+
details: { error: err.message },
|
|
311
193
|
};
|
|
312
194
|
}
|
|
313
195
|
},
|
|
314
196
|
};
|
|
315
197
|
}
|
|
316
|
-
async function
|
|
317
|
-
const
|
|
318
|
-
const
|
|
319
|
-
const voice = params.voice ??
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
const responseFormat = formatMap[ext] ?? "mp3";
|
|
325
|
-
const body = {
|
|
326
|
-
model,
|
|
327
|
-
input: params.text,
|
|
328
|
-
voice,
|
|
329
|
-
response_format: responseFormat,
|
|
330
|
-
};
|
|
331
|
-
if (params.speed !== undefined)
|
|
332
|
-
body.speed = params.speed;
|
|
333
|
-
if (params.instructions)
|
|
334
|
-
body.instructions = params.instructions;
|
|
335
|
-
const controller = new AbortController();
|
|
336
|
-
const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT);
|
|
337
|
-
if (signal)
|
|
338
|
-
signal.addEventListener("abort", () => controller.abort(), { once: true });
|
|
339
|
-
const response = await fetch("https://api.openai.com/v1/audio/speech", {
|
|
340
|
-
method: "POST",
|
|
341
|
-
headers: {
|
|
342
|
-
Authorization: `Bearer ${apiKey}`,
|
|
343
|
-
"Content-Type": "application/json",
|
|
344
|
-
},
|
|
345
|
-
body: JSON.stringify(body),
|
|
346
|
-
signal: controller.signal,
|
|
347
|
-
});
|
|
348
|
-
clearTimeout(timer);
|
|
349
|
-
if (!response.ok) {
|
|
350
|
-
const errText = await response.text();
|
|
351
|
-
throw new Error(`OpenAI TTS API ${response.status}: ${errText}`);
|
|
198
|
+
async function speakWithSdk(filePath, parsed, params, fs, shell, vault, signal) {
|
|
199
|
+
const { experimental_generateSpeech } = await import("ai");
|
|
200
|
+
const providerName = parsed.provider;
|
|
201
|
+
const voice = params.voice ?? SPEAK_DEFAULT_VOICES[providerName];
|
|
202
|
+
// Cloud providers need an apiKey. The edge provider needs shell+fs.
|
|
203
|
+
let apiKey;
|
|
204
|
+
if (providerName === "openai") {
|
|
205
|
+
apiKey = vault?.getKey("openai", "key") ?? requireEnv("OPENAI_API_KEY");
|
|
352
206
|
}
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
throw new Error("FileSystem implementation does not support writeFileBuffer (required for binary writes).");
|
|
207
|
+
else if (providerName === "deepgram") {
|
|
208
|
+
apiKey = vault?.getKey("deepgram", "key") ?? requireEnv("DEEPGRAM_API_KEY");
|
|
356
209
|
}
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
return {
|
|
360
|
-
content: [{ type: "text", text: `Speech audio saved: ${filePath} (${(buffer.byteLength / 1024).toFixed(1)} KB, ${responseFormat}, voice: ${voice}, model: ${model})` }],
|
|
361
|
-
details: {
|
|
362
|
-
provider: "openai",
|
|
363
|
-
model,
|
|
364
|
-
voice,
|
|
365
|
-
format: responseFormat,
|
|
366
|
-
path: filePath,
|
|
367
|
-
bytes: buffer.byteLength,
|
|
368
|
-
textLength: params.text.length,
|
|
369
|
-
},
|
|
370
|
-
};
|
|
371
|
-
}
|
|
372
|
-
async function speakDeepgram(filePath, params, fs, vault, signal) {
|
|
373
|
-
const apiKey = vault?.getKey("deepgram", "key") ?? requireEnv("DEEPGRAM_API_KEY");
|
|
374
|
-
const model = params.model ?? "aura-2-en";
|
|
375
|
-
const controller = new AbortController();
|
|
376
|
-
const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT);
|
|
377
|
-
if (signal)
|
|
378
|
-
signal.addEventListener("abort", () => controller.abort(), { once: true });
|
|
379
|
-
const response = await fetch(`https://api.deepgram.com/v1/speak?model=${encodeURIComponent(model)}`, {
|
|
380
|
-
method: "POST",
|
|
381
|
-
headers: {
|
|
382
|
-
Authorization: `Token ${apiKey}`,
|
|
383
|
-
"Content-Type": "application/json",
|
|
384
|
-
},
|
|
385
|
-
body: JSON.stringify({ text: params.text }),
|
|
386
|
-
signal: controller.signal,
|
|
387
|
-
});
|
|
388
|
-
clearTimeout(timer);
|
|
389
|
-
if (!response.ok) {
|
|
390
|
-
const errText = await response.text();
|
|
391
|
-
throw new Error(`Deepgram TTS API ${response.status}: ${errText}`);
|
|
210
|
+
else if (providerName === "elevenlabs") {
|
|
211
|
+
apiKey = vault?.getKey("elevenlabs", "key") ?? requireEnv("ELEVENLABS_API_KEY");
|
|
392
212
|
}
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
throw new Error("FileSystem implementation does not support writeFileBuffer (required for binary writes).");
|
|
213
|
+
else if (providerName !== "edge") {
|
|
214
|
+
throw new Error(`Unsupported tts provider: ${providerName}`);
|
|
396
215
|
}
|
|
397
|
-
await fs
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
const
|
|
413
|
-
const
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
if (signal)
|
|
424
|
-
signal.addEventListener("abort", () => controller.abort(), { once: true });
|
|
425
|
-
const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}?output_format=${outputFormat}`, {
|
|
426
|
-
method: "POST",
|
|
427
|
-
headers: {
|
|
428
|
-
"xi-api-key": apiKey,
|
|
429
|
-
"Content-Type": "application/json",
|
|
430
|
-
},
|
|
431
|
-
body: JSON.stringify({
|
|
432
|
-
text: params.text,
|
|
433
|
-
model_id: model,
|
|
434
|
-
}),
|
|
435
|
-
signal: controller.signal,
|
|
216
|
+
const provider = await resolveSpeakProvider(providerName, { apiKey, shell, fs });
|
|
217
|
+
// Provider-specific knobs flow through providerOptions.
|
|
218
|
+
const providerOptions = {};
|
|
219
|
+
if (providerName === "openai") {
|
|
220
|
+
const opts = {};
|
|
221
|
+
if (params.speed !== undefined)
|
|
222
|
+
opts.speed = params.speed;
|
|
223
|
+
if (params.instructions)
|
|
224
|
+
opts.instructions = params.instructions;
|
|
225
|
+
if (Object.keys(opts).length)
|
|
226
|
+
providerOptions.openai = opts;
|
|
227
|
+
}
|
|
228
|
+
if (providerName === "edge" && params.gender) {
|
|
229
|
+
providerOptions.edge = { gender: params.gender };
|
|
230
|
+
}
|
|
231
|
+
const outputFormat = audioFormat(filePath, providerName);
|
|
232
|
+
const result = await experimental_generateSpeech({
|
|
233
|
+
model: provider.speech(parsed.model),
|
|
234
|
+
text: params.text,
|
|
235
|
+
voice,
|
|
236
|
+
outputFormat,
|
|
237
|
+
language: params.language,
|
|
238
|
+
instructions: params.instructions,
|
|
239
|
+
speed: params.speed,
|
|
240
|
+
providerOptions: Object.keys(providerOptions).length ? providerOptions : undefined,
|
|
241
|
+
abortSignal: signal,
|
|
436
242
|
});
|
|
437
|
-
|
|
438
|
-
if (!
|
|
439
|
-
|
|
440
|
-
throw new Error(`ElevenLabs API ${response.status}: ${errText}`);
|
|
243
|
+
const bytes = result.audio.uint8Array;
|
|
244
|
+
if (!bytes || bytes.byteLength === 0) {
|
|
245
|
+
throw new Error("No audio bytes in SDK response");
|
|
441
246
|
}
|
|
442
|
-
const buffer = Buffer.from(await response.arrayBuffer());
|
|
443
247
|
if (!fs.writeFileBuffer) {
|
|
444
248
|
throw new Error("FileSystem implementation does not support writeFileBuffer (required for binary writes).");
|
|
445
249
|
}
|
|
446
250
|
await fs.mkdir(dirname(filePath));
|
|
447
|
-
await fs.writeFileBuffer(filePath,
|
|
251
|
+
await fs.writeFileBuffer(filePath, bytes);
|
|
252
|
+
const voiceLabel = voice ?? "(model-bound)";
|
|
253
|
+
const summary = `Speech audio saved: ${filePath} (${(bytes.byteLength / 1024).toFixed(1)} KB, ${outputFormat}, voice: ${voiceLabel}, model: ${parsed.provider}/${parsed.model})`;
|
|
448
254
|
return {
|
|
449
|
-
content: [{ type: "text", text:
|
|
255
|
+
content: [{ type: "text", text: summary }],
|
|
450
256
|
details: {
|
|
451
|
-
provider:
|
|
452
|
-
model,
|
|
453
|
-
|
|
257
|
+
provider: providerName,
|
|
258
|
+
model: parsed.model,
|
|
259
|
+
voice: voiceLabel,
|
|
454
260
|
format: outputFormat,
|
|
455
261
|
path: filePath,
|
|
456
|
-
bytes:
|
|
457
|
-
textLength: params.text.length,
|
|
458
|
-
},
|
|
459
|
-
};
|
|
460
|
-
}
|
|
461
|
-
// ─── Edge TTS (free, local CLI, automatic fallback) ───
|
|
462
|
-
/**
|
|
463
|
-
* Default Edge TTS voices per language+gender.
|
|
464
|
-
* Format: `${lang}-${region}-${name}Neural`
|
|
465
|
-
* Each entry: [female, male]. First match wins.
|
|
466
|
-
*/
|
|
467
|
-
const EDGE_VOICES = {
|
|
468
|
-
"it": ["it-IT-ElsaNeural", "it-IT-DiegoNeural"],
|
|
469
|
-
"en": ["en-US-EmmaMultilingualNeural", "en-US-AndrewMultilingualNeural"],
|
|
470
|
-
"es": ["es-ES-ElviraNeural", "es-ES-AlvaroNeural"],
|
|
471
|
-
"fr": ["fr-FR-DeniseNeural", "fr-FR-HenriNeural"],
|
|
472
|
-
"de": ["de-DE-KatjaNeural", "de-DE-ConradNeural"],
|
|
473
|
-
"pt": ["pt-BR-FranciscaNeural", "pt-BR-AntonioNeural"],
|
|
474
|
-
"ja": ["ja-JP-NanamiNeural", "ja-JP-KeitaNeural"],
|
|
475
|
-
"zh": ["zh-CN-XiaoxiaoNeural", "zh-CN-YunxiNeural"],
|
|
476
|
-
"ko": ["ko-KR-SunHiNeural", "ko-KR-InJoonNeural"],
|
|
477
|
-
"ar": ["ar-SA-ZariyahNeural", "ar-SA-HamedNeural"],
|
|
478
|
-
"hi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"],
|
|
479
|
-
"ru": ["ru-RU-SvetlanaNeural", "ru-RU-DmitryNeural"],
|
|
480
|
-
"nl": ["nl-NL-ColetteNeural", "nl-NL-MaartenNeural"],
|
|
481
|
-
"pl": ["pl-PL-AgnieszkaNeural", "pl-PL-MarekNeural"],
|
|
482
|
-
"tr": ["tr-TR-EmelNeural", "tr-TR-AhmetNeural"],
|
|
483
|
-
"sv": ["sv-SE-SofieNeural", "sv-SE-MattiasNeural"],
|
|
484
|
-
};
|
|
485
|
-
/**
|
|
486
|
-
* Resolve the best Edge TTS voice for a given language and gender hint.
|
|
487
|
-
* Falls back to en-US if the language is unknown.
|
|
488
|
-
*/
|
|
489
|
-
function resolveEdgeVoice(voice, language, gender) {
|
|
490
|
-
// If the agent passed an explicit voice name like "it-IT-DiegoNeural", use it directly
|
|
491
|
-
if (voice && voice.includes("-") && voice.endsWith("Neural"))
|
|
492
|
-
return voice;
|
|
493
|
-
const lang = (language ?? "en").toLowerCase().split("-")[0]; // "it-IT" → "it"
|
|
494
|
-
const pair = EDGE_VOICES[lang] ?? EDGE_VOICES["en"];
|
|
495
|
-
return gender === "male" ? pair[1] : pair[0]; // default female if no gender hint
|
|
496
|
-
}
|
|
497
|
-
/** Per-Shell cache of "is edge-tts on PATH" — checked once per shell. */
|
|
498
|
-
const _edgeTtsAvailable = new WeakMap();
|
|
499
|
-
/** Check if edge-tts CLI is available, routed through the Shell so the
|
|
500
|
-
* check runs in the same environment as the actual TTS call (sandbox
|
|
501
|
-
* in cloud, local Node in OSS). */
|
|
502
|
-
function edgeTtsAvailable(shell) {
|
|
503
|
-
const existing = _edgeTtsAvailable.get(shell);
|
|
504
|
-
if (existing)
|
|
505
|
-
return existing;
|
|
506
|
-
const fresh = shell
|
|
507
|
-
.execute("edge-tts --version", { timeout: 5000 })
|
|
508
|
-
.then((r) => r.exitCode === 0)
|
|
509
|
-
.catch(() => false);
|
|
510
|
-
_edgeTtsAvailable.set(shell, fresh);
|
|
511
|
-
return fresh;
|
|
512
|
-
}
|
|
513
|
-
/** Quote a CLI argument for inclusion in a `shell.execute` command line. */
|
|
514
|
-
function quoteArg(arg) {
|
|
515
|
-
return `'${arg.replace(/'/g, `'\\''`)}'`;
|
|
516
|
-
}
|
|
517
|
-
async function speakEdgeTts(filePath, params, fs, shell) {
|
|
518
|
-
if (!(await edgeTtsAvailable(shell))) {
|
|
519
|
-
throw new Error("edge-tts CLI is not installed. Install it with: pip install edge-tts");
|
|
520
|
-
}
|
|
521
|
-
const voice = resolveEdgeVoice(params.voice, params.language, params.gender);
|
|
522
|
-
await fs.mkdir(dirname(filePath));
|
|
523
|
-
const cmd = [
|
|
524
|
-
"edge-tts",
|
|
525
|
-
"--text", quoteArg(params.text),
|
|
526
|
-
"--voice", quoteArg(voice),
|
|
527
|
-
"--write-media", quoteArg(filePath),
|
|
528
|
-
].join(" ");
|
|
529
|
-
const result = await shell.execute(cmd, { timeout: DEFAULT_TIMEOUT });
|
|
530
|
-
if (result.exitCode !== 0) {
|
|
531
|
-
throw new Error(`edge-tts failed: ${(result.stderr || result.stdout || "").trim() || `exit ${result.exitCode}`}`);
|
|
532
|
-
}
|
|
533
|
-
let bytes = 0;
|
|
534
|
-
try {
|
|
535
|
-
const stat = await fs.stat(filePath);
|
|
536
|
-
bytes = stat?.size ?? 0;
|
|
537
|
-
}
|
|
538
|
-
catch { /* ignore */ }
|
|
539
|
-
return {
|
|
540
|
-
content: [{ type: "text", text: `Speech audio saved: ${filePath} (${(bytes / 1024).toFixed(1)} KB, voice: ${voice}, provider: edge-tts)` }],
|
|
541
|
-
details: {
|
|
542
|
-
provider: "edge",
|
|
543
|
-
voice,
|
|
544
|
-
path: filePath,
|
|
545
|
-
bytes,
|
|
262
|
+
bytes: bytes.byteLength,
|
|
546
263
|
textLength: params.text.length,
|
|
547
264
|
},
|
|
548
265
|
};
|
|
@@ -551,21 +268,22 @@ export const ALL_AUDIO_TOOL_NAMES = ["audio_transcribe", "audio_speak"];
|
|
|
551
268
|
/**
|
|
552
269
|
* Create audio tools for speech-to-text and text-to-speech.
|
|
553
270
|
*
|
|
554
|
-
*
|
|
555
|
-
*
|
|
556
|
-
* @param allowedTools - Optional filter
|
|
557
|
-
* @param vault - Resolved vault credentials for credential resolution
|
|
271
|
+
* The 6-arg positional signature is preserved for back-compat. Prefer
|
|
272
|
+
* the options-object form for new callers.
|
|
558
273
|
*/
|
|
559
274
|
export function createAudioTools(cwd, allowedPaths, allowedTools, vault, fs, shell) {
|
|
560
|
-
const
|
|
561
|
-
|
|
562
|
-
|
|
275
|
+
const opts = typeof cwd === "string"
|
|
276
|
+
? { cwd, allowedPaths, allowedTools, vault, fs, shell }
|
|
277
|
+
: cwd;
|
|
278
|
+
const sandbox = resolveAllowedPaths(opts.cwd, opts.allowedPaths);
|
|
279
|
+
const _fs = opts.fs ?? new NodeFileSystem();
|
|
280
|
+
const _shell = opts.shell ?? new NodeShell();
|
|
563
281
|
const factories = {
|
|
564
|
-
audio_transcribe: () => createTranscribeTool(cwd, sandbox, _fs, vault),
|
|
565
|
-
audio_speak: () => createSpeakTool(cwd, sandbox, _fs, _shell, vault),
|
|
282
|
+
audio_transcribe: () => createTranscribeTool(opts.cwd, sandbox, _fs, opts.transcribeModel, opts.vault),
|
|
283
|
+
audio_speak: () => createSpeakTool(opts.cwd, sandbox, _fs, _shell, opts.ttsModel, opts.vault),
|
|
566
284
|
};
|
|
567
|
-
const names = allowedTools
|
|
568
|
-
? ALL_AUDIO_TOOL_NAMES.filter(n => allowedTools.some(a => a.toLowerCase() === n))
|
|
285
|
+
const names = opts.allowedTools
|
|
286
|
+
? ALL_AUDIO_TOOL_NAMES.filter(n => opts.allowedTools.some(a => a.toLowerCase() === n))
|
|
569
287
|
: ALL_AUDIO_TOOL_NAMES;
|
|
570
288
|
return names.map(n => factories[n]());
|
|
571
289
|
}
|