@polpo-ai/tools 0.6.31 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/dist/__tests__/email-tools.test.d.ts +2 -0
  2. package/dist/__tests__/email-tools.test.d.ts.map +1 -0
  3. package/dist/__tests__/email-tools.test.js +705 -0
  4. package/dist/__tests__/email-tools.test.js.map +1 -0
  5. package/dist/__tests__/extended-tools.test.d.ts +2 -0
  6. package/dist/__tests__/extended-tools.test.d.ts.map +1 -0
  7. package/dist/__tests__/extended-tools.test.js +743 -0
  8. package/dist/__tests__/extended-tools.test.js.map +1 -0
  9. package/dist/__tests__/external-api-tools.test.d.ts +2 -0
  10. package/dist/__tests__/external-api-tools.test.d.ts.map +1 -0
  11. package/dist/__tests__/external-api-tools.test.js +1731 -0
  12. package/dist/__tests__/external-api-tools.test.js.map +1 -0
  13. package/dist/__tests__/memory-tools.test.d.ts +2 -0
  14. package/dist/__tests__/memory-tools.test.d.ts.map +1 -0
  15. package/dist/__tests__/memory-tools.test.js +0 -0
  16. package/dist/__tests__/memory-tools.test.js.map +1 -0
  17. package/dist/__tests__/system-tools.test.d.ts +2 -0
  18. package/dist/__tests__/system-tools.test.d.ts.map +1 -0
  19. package/dist/__tests__/system-tools.test.js +417 -0
  20. package/dist/__tests__/system-tools.test.js.map +1 -0
  21. package/dist/adapters/node-shell.d.ts +9 -0
  22. package/dist/adapters/node-shell.d.ts.map +1 -1
  23. package/dist/adapters/node-shell.js +40 -9
  24. package/dist/adapters/node-shell.js.map +1 -1
  25. package/dist/audio-tools.d.ts +25 -27
  26. package/dist/audio-tools.d.ts.map +1 -1
  27. package/dist/audio-tools.js +156 -438
  28. package/dist/audio-tools.js.map +1 -1
  29. package/dist/browser-tools.d.ts.map +1 -1
  30. package/dist/browser-tools.js +5 -1
  31. package/dist/browser-tools.js.map +1 -1
  32. package/dist/email-tools.d.ts.map +1 -1
  33. package/dist/email-tools.js +11 -3
  34. package/dist/email-tools.js.map +1 -1
  35. package/dist/image-tools.d.ts +27 -25
  36. package/dist/image-tools.d.ts.map +1 -1
  37. package/dist/image-tools.js +151 -332
  38. package/dist/image-tools.js.map +1 -1
  39. package/dist/index.d.ts +1 -2
  40. package/dist/index.d.ts.map +1 -1
  41. package/dist/index.js +3 -2
  42. package/dist/index.js.map +1 -1
  43. package/dist/lib/edge-speech-model.d.ts +61 -0
  44. package/dist/lib/edge-speech-model.d.ts.map +1 -0
  45. package/dist/lib/edge-speech-model.js +144 -0
  46. package/dist/lib/edge-speech-model.js.map +1 -0
  47. package/dist/lib/exa-search-provider.d.ts +27 -0
  48. package/dist/lib/exa-search-provider.d.ts.map +1 -0
  49. package/dist/lib/exa-search-provider.js +109 -0
  50. package/dist/lib/exa-search-provider.js.map +1 -0
  51. package/dist/lib/provider-resolver.d.ts +54 -0
  52. package/dist/lib/provider-resolver.d.ts.map +1 -0
  53. package/dist/lib/provider-resolver.js +115 -0
  54. package/dist/lib/provider-resolver.js.map +1 -0
  55. package/dist/search-tools.d.ts +10 -13
  56. package/dist/search-tools.d.ts.map +1 -1
  57. package/dist/search-tools.js +63 -140
  58. package/dist/search-tools.js.map +1 -1
  59. package/dist/system-tools.d.ts +19 -5
  60. package/dist/system-tools.d.ts.map +1 -1
  61. package/dist/system-tools.js +48 -31
  62. package/dist/system-tools.js.map +1 -1
  63. package/package.json +16 -4
  64. package/dist/phone-tools.d.ts +0 -27
  65. package/dist/phone-tools.d.ts.map +0 -1
  66. package/dist/phone-tools.js +0 -577
  67. package/dist/phone-tools.js.map +0 -1
@@ -1,39 +1,28 @@
1
1
  /**
2
2
  * Audio tools for speech-to-text (STT) and text-to-speech (TTS).
3
3
  *
4
- * Provides agent capabilities to:
5
- * - Transcribe audio files to text (audio_transcribe)
6
- * - Generate speech audio from text (audio_speak)
4
+ * Architecture: thin wrappers over the Vercel AI SDK v6.
5
+ * - audio_transcribe → `experimental_transcribe`
6
+ * - audio_speak → `experimental_generateSpeech`
7
7
  *
8
- * Architecture: direct fetch() to provider REST APIs — zero vendor SDK dependencies.
8
+ * Model selection: each tool picks its model in this order:
9
+ * 1. per-call `model` input (`<provider>/<model>` string),
10
+ * 2. agent-config default (transcribe_model / tts_model),
11
+ * 3. DEFAULT_TRANSCRIBE_MODEL / DEFAULT_TTS_MODEL from @polpo-ai/core.
9
12
  *
10
- * Supported providers:
11
- * STT: openai (Whisper), deepgram (Nova)
12
- * TTS: openai (gpt-4o-mini-tts / tts-1), deepgram (Aura), elevenlabs, edge (free, local)
13
- *
14
- * Edge TTS: Uses Microsoft Edge's neural TTS engine via the `edge-tts` CLI.
15
- * Free, no API key, ~400 voices in 60+ languages. Auto-selects voice from
16
- * language + gender params. Also used as automatic fallback when cloud providers fail.
17
- * Install: `pip install edge-tts`
18
- *
19
- * Credential resolution order (same as email/image tools):
20
- * 1. Agent vault (per-agent credentials — e.g. service "openai" key "key")
21
- * 2. Environment variables (global fallback)
22
- * 3. Edge TTS (automatic fallback — no credentials needed)
23
- *
24
- * Environment variables (fallback):
25
- * OPENAI_API_KEY — openai provider (STT + TTS)
26
- * DEEPGRAM_API_KEY — deepgram provider (STT + TTS)
27
- * ELEVENLABS_API_KEY — elevenlabs provider (TTS)
13
+ * audio_speak's `edge` provider is wrapped as a custom SpeechModelV3 in
14
+ * `lib/edge-speech-model.ts` so it slots into the same SDK call as
15
+ * cloud providers (no special-casing in the tool layer).
28
16
  */
29
17
  import { resolve, dirname, extname } from "node:path";
30
18
  import { Type } from "@sinclair/typebox";
19
+ import { parseModelString, DEFAULT_TRANSCRIBE_MODEL, DEFAULT_TTS_MODEL, } from "@polpo-ai/core";
31
20
  import { NodeFileSystem } from "./adapters/node-filesystem.js";
32
21
  import { NodeShell } from "./adapters/node-shell.js";
33
22
  import { resolveAllowedPaths, assertPathAllowed } from "./path-sandbox.js";
23
+ import { resolveTranscribeProvider, resolveSpeakProvider, } from "./lib/provider-resolver.js";
34
24
  // ─── Constants ───
35
25
  const MAX_AUDIO_SIZE = 25 * 1024 * 1024; // 25 MB (OpenAI Whisper limit)
36
- const DEFAULT_TIMEOUT = 120_000; // 2 min for audio processing
37
26
  // ─── Helpers ───
38
27
  function requireEnv(key) {
39
28
  const val = process.env[key];
@@ -41,54 +30,38 @@ function requireEnv(key) {
41
30
  throw new Error(`Missing environment variable: ${key}. Set it before using this tool.`);
42
31
  return val;
43
32
  }
44
- /** Build a FormData-like multipart body for fetch (Node 18+). */
45
- function audioFormData(fileBuffer, filename, fields) {
46
- const form = new FormData();
47
- const blob = new Blob([new Uint8Array(fileBuffer)], { type: mimeFromExt(extname(filename)) });
48
- form.append("file", blob, filename);
49
- for (const [k, v] of Object.entries(fields)) {
50
- form.append(k, v);
51
- }
52
- return { body: form };
53
- }
54
- function mimeFromExt(ext) {
55
- const map = {
56
- ".mp3": "audio/mpeg",
57
- ".wav": "audio/wav",
58
- ".flac": "audio/flac",
59
- ".ogg": "audio/ogg",
60
- ".m4a": "audio/mp4",
61
- ".webm": "audio/webm",
62
- ".mp4": "audio/mp4",
63
- ".mpeg": "audio/mpeg",
64
- ".mpga": "audio/mpeg",
65
- };
66
- return map[ext.toLowerCase()] ?? "application/octet-stream";
33
+ function resolveEffectiveModel(override, configured, fallback) {
34
+ return parseModelString(override ?? configured ?? fallback);
67
35
  }
36
+ /** Default voices per TTS provider. Used when the input doesn't pass an explicit voice. */
37
+ const SPEAK_DEFAULT_VOICES = {
38
+ openai: "alloy",
39
+ deepgram: undefined, // voice is encoded in the model id
40
+ elevenlabs: "21m00Tcm4TlvDq8ikWAM", // Rachel
41
+ edge: undefined, // resolved from language+gender by EdgeSpeechModel
42
+ };
68
43
  // ─── Tool: audio_transcribe ───
69
44
  const AudioTranscribeSchema = Type.Object({
70
45
  path: Type.String({ description: "Path to the audio file to transcribe (mp3, wav, flac, ogg, m4a, webm)" }),
71
- provider: Type.Optional(Type.Union([
72
- Type.Literal("openai"),
73
- Type.Literal("deepgram"),
74
- ], { description: "STT provider (default: openai)" })),
75
- model: Type.Optional(Type.String({ description: "Model name. OpenAI: 'whisper-1' (default). Deepgram: 'nova-3' (default)." })),
46
+ model: Type.Optional(Type.String({
47
+ description: "Override the agent's transcribe_model for this call. Format: '<provider>/<model>' " +
48
+ "(e.g. 'openai/whisper-1', 'deepgram/nova-3'). When omitted, uses the agent's configured transcribe_model.",
49
+ })),
76
50
  language: Type.Optional(Type.String({ description: "ISO 639-1 language code (e.g. 'en', 'it', 'es'). Helps accuracy." })),
77
- prompt: Type.Optional(Type.String({ description: "Optional context/prompt to guide transcription (OpenAI only)" })),
51
+ prompt: Type.Optional(Type.String({ description: "Optional context/prompt to guide transcription (OpenAI Whisper only)" })),
78
52
  });
79
- function createTranscribeTool(cwd, sandbox, fs, vault) {
53
+ function createTranscribeTool(cwd, sandbox, fs, configuredModel, vault) {
80
54
  return {
81
55
  name: "audio_transcribe",
82
56
  label: "Transcribe Audio",
83
57
  description: "Transcribe an audio file to text using speech-to-text AI. " +
84
58
  "Supports mp3, wav, flac, ogg, m4a, webm formats. Max file size: 25 MB. " +
85
- "Providers: openai (Whisper, default), deepgram (Nova). " +
86
- "Credentials resolved from: agent vault > OPENAI_API_KEY or DEEPGRAM_API_KEY env var.",
59
+ "Model is configured at agent level (transcribe_model) — pass `model` here only to override per-call. " +
60
+ "Default: openai/whisper-1. Supported providers: openai, deepgram.",
87
61
  parameters: AudioTranscribeSchema,
88
62
  async execute(_id, params, signal) {
89
63
  const filePath = resolve(cwd, params.path);
90
64
  assertPathAllowed(filePath, sandbox, "audio_transcribe");
91
- const provider = params.provider ?? "openai";
92
65
  if (!fs.readFileBuffer) {
93
66
  return {
94
67
  content: [{ type: "text", text: "FileSystem implementation does not support readFileBuffer (required for binary reads)." }],
@@ -113,111 +86,61 @@ function createTranscribeTool(cwd, sandbox, fs, vault) {
113
86
  };
114
87
  }
115
88
  try {
116
- if (provider === "openai") {
117
- return await transcribeOpenAI(filePath, fileBuffer, params, vault, signal);
118
- }
119
- else {
120
- return await transcribeDeepgram(filePath, fileBuffer, params, vault, signal);
121
- }
89
+ const parsed = resolveEffectiveModel(params.model, configuredModel, DEFAULT_TRANSCRIBE_MODEL);
90
+ return await transcribeWithSdk(filePath, fileBuffer, parsed, params, vault, signal);
122
91
  }
123
92
  catch (err) {
124
93
  return {
125
- content: [{ type: "text", text: `Transcription error (${provider}): ${err.message}` }],
126
- details: { provider, error: err.message },
94
+ content: [{ type: "text", text: `Transcription error: ${err.message}` }],
95
+ details: { error: err.message },
127
96
  };
128
97
  }
129
98
  },
130
99
  };
131
100
  }
132
- async function transcribeOpenAI(filePath, fileBuffer, params, vault, signal) {
133
- const apiKey = vault?.getKey("openai", "key") ?? requireEnv("OPENAI_API_KEY");
134
- const model = params.model ?? "whisper-1";
135
- const fields = { model };
136
- if (params.language)
137
- fields.language = params.language;
138
- if (params.prompt)
139
- fields.prompt = params.prompt;
140
- fields.response_format = "verbose_json";
141
- const { body } = audioFormData(fileBuffer, filePath.split("/").pop(), fields);
142
- const controller = new AbortController();
143
- const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT);
144
- if (signal)
145
- signal.addEventListener("abort", () => controller.abort(), { once: true });
146
- const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
147
- method: "POST",
148
- headers: { Authorization: `Bearer ${apiKey}` },
149
- body,
150
- signal: controller.signal,
151
- });
152
- clearTimeout(timer);
153
- if (!response.ok) {
154
- const errText = await response.text();
155
- throw new Error(`OpenAI API ${response.status}: ${errText}`);
101
+ async function transcribeWithSdk(_filePath, fileBuffer, parsed, params, vault, signal) {
102
+ const { experimental_transcribe } = await import("ai");
103
+ const apiKey = parsed.provider === "openai"
104
+ ? vault?.getKey("openai", "key") ?? requireEnv("OPENAI_API_KEY")
105
+ : parsed.provider === "deepgram"
106
+ ? vault?.getKey("deepgram", "key") ?? requireEnv("DEEPGRAM_API_KEY")
107
+ : (() => { throw new Error(`Unsupported transcribe provider: ${parsed.provider}`); })();
108
+ const provider = await resolveTranscribeProvider(parsed.provider, apiKey);
109
+ const providerOptions = {};
110
+ if (parsed.provider === "openai") {
111
+ const opts = {};
112
+ if (params.language)
113
+ opts.language = params.language;
114
+ if (params.prompt)
115
+ opts.prompt = params.prompt;
116
+ if (Object.keys(opts).length)
117
+ providerOptions.openai = opts;
156
118
  }
157
- const data = await response.json();
158
- const info = [
159
- `Language: ${data.language ?? "unknown"}`,
160
- `Duration: ${data.duration ? `${data.duration.toFixed(1)}s` : "unknown"}`,
161
- `Model: ${model}`,
162
- ].join(" | ");
163
- return {
164
- content: [{ type: "text", text: `${info}\n\n${data.text}` }],
165
- details: {
166
- provider: "openai",
167
- model,
168
- language: data.language,
169
- duration: data.duration,
170
- textLength: data.text.length,
171
- },
172
- };
173
- }
174
- async function transcribeDeepgram(filePath, fileBuffer, params, vault, signal) {
175
- const apiKey = vault?.getKey("deepgram", "key") ?? requireEnv("DEEPGRAM_API_KEY");
176
- const model = params.model ?? "nova-3";
177
- const queryParams = new URLSearchParams({
178
- model,
179
- smart_format: "true",
180
- punctuate: "true",
181
- });
182
- if (params.language)
183
- queryParams.set("language", params.language);
184
- const controller = new AbortController();
185
- const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT);
186
- if (signal)
187
- signal.addEventListener("abort", () => controller.abort(), { once: true });
188
- const ext = extname(filePath).toLowerCase();
189
- const mime = mimeFromExt(ext);
190
- const response = await fetch(`https://api.deepgram.com/v1/listen?${queryParams}`, {
191
- method: "POST",
192
- headers: {
193
- Authorization: `Token ${apiKey}`,
194
- "Content-Type": mime,
195
- },
196
- body: new Uint8Array(fileBuffer),
197
- signal: controller.signal,
198
- });
199
- clearTimeout(timer);
200
- if (!response.ok) {
201
- const errText = await response.text();
202
- throw new Error(`Deepgram API ${response.status}: ${errText}`);
119
+ else {
120
+ const opts = { smart_format: true, punctuate: true };
121
+ if (params.language)
122
+ opts.language = params.language;
123
+ providerOptions.deepgram = opts;
203
124
  }
204
- const data = await response.json();
205
- const transcript = data.results?.channels?.[0]?.alternatives?.[0]?.transcript ?? "";
206
- const confidence = data.results?.channels?.[0]?.alternatives?.[0]?.confidence;
207
- const duration = data.metadata?.duration;
125
+ const result = await experimental_transcribe({
126
+ model: provider.transcription(parsed.model),
127
+ audio: new Uint8Array(fileBuffer),
128
+ providerOptions: Object.keys(providerOptions).length ? providerOptions : undefined,
129
+ abortSignal: signal,
130
+ });
208
131
  const info = [
209
- `Confidence: ${confidence ? `${(confidence * 100).toFixed(1)}%` : "unknown"}`,
210
- `Duration: ${duration ? `${duration.toFixed(1)}s` : "unknown"}`,
211
- `Model: ${model}`,
132
+ `Language: ${result.language ?? "unknown"}`,
133
+ `Duration: ${result.durationInSeconds ? `${result.durationInSeconds.toFixed(1)}s` : "unknown"}`,
134
+ `Model: ${parsed.provider}/${parsed.model}`,
212
135
  ].join(" | ");
213
136
  return {
214
- content: [{ type: "text", text: `${info}\n\n${transcript}` }],
137
+ content: [{ type: "text", text: `${info}\n\n${result.text}` }],
215
138
  details: {
216
- provider: "deepgram",
217
- model,
218
- confidence,
219
- duration,
220
- textLength: transcript.length,
139
+ provider: parsed.provider,
140
+ model: parsed.model,
141
+ language: result.language,
142
+ duration: result.durationInSeconds,
143
+ textLength: result.text.length,
221
144
  },
222
145
  };
223
146
  }
@@ -225,324 +148,118 @@ async function transcribeDeepgram(filePath, fileBuffer, params, vault, signal) {
225
148
  const AudioSpeakSchema = Type.Object({
226
149
  text: Type.String({ description: "Text to convert to speech" }),
227
150
  path: Type.String({ description: "Output file path (e.g. 'output.mp3'). Format inferred from extension." }),
228
- provider: Type.Optional(Type.Union([
229
- Type.Literal("openai"),
230
- Type.Literal("deepgram"),
231
- Type.Literal("elevenlabs"),
232
- Type.Literal("edge"),
233
- ], { description: "TTS provider. 'openai' (default), 'deepgram', 'elevenlabs', or 'edge' (free, local Microsoft Edge TTS — no API key needed). If the chosen provider fails, edge-tts is tried as automatic fallback." })),
234
- model: Type.Optional(Type.String({ description: "Model name. OpenAI: 'tts-1' (default), 'tts-1-hd', 'gpt-4o-mini-tts'. Deepgram: 'aura-2-en' (default). ElevenLabs: 'eleven_multilingual_v2' (default)." })),
235
- voice: Type.Optional(Type.String({ description: "Voice name/ID. OpenAI: alloy, echo, fable, onyx, nova, shimmer (default: alloy). ElevenLabs: voice ID. Edge: full voice name like 'it-IT-DiegoNeural' (auto-selected from language+gender if omitted)." })),
151
+ model: Type.Optional(Type.String({
152
+ description: "Override the agent's tts_model for this call. Format: '<provider>/<model>' " +
153
+ "(e.g. 'openai/tts-1', 'openai/tts-1-hd', 'openai/gpt-4o-mini-tts', 'deepgram/aura-2-asteria-en', " +
154
+ "'elevenlabs/eleven_multilingual_v2', 'edge/edge-tts'). When omitted, uses the agent's configured tts_model.",
155
+ })),
156
+ voice: Type.Optional(Type.String({ description: "Voice name/ID. OpenAI: alloy/echo/fable/onyx/nova/shimmer (default: alloy). ElevenLabs: voice ID (default: Rachel). Edge: full voice name like 'it-IT-DiegoNeural' (auto-selected from language+gender if omitted)." })),
236
157
  language: Type.Optional(Type.String({ description: "ISO 639-1 language code (e.g. 'it', 'en', 'es'). Used by edge provider to select the right voice. Also useful for other providers with multilingual models." })),
237
158
  gender: Type.Optional(Type.Union([
238
159
  Type.Literal("male"),
239
160
  Type.Literal("female"),
240
- ], { description: "Voice gender preference. Used by edge provider to pick the right voice when no explicit voice is given. For other providers, choose the voice directly." })),
161
+ ], { description: "Voice gender preference. Used by the edge provider to pick the right voice when no explicit voice is given." })),
241
162
  speed: Type.Optional(Type.Number({ description: "Playback speed 0.25-4.0 (OpenAI only, default: 1.0)" })),
242
163
  instructions: Type.Optional(Type.String({ description: "Voice style instructions (OpenAI gpt-4o-mini-tts only, e.g. 'Speak in a cheerful tone')" })),
243
164
  });
244
- function createSpeakTool(cwd, sandbox, fs, shell, vault) {
165
+ function audioFormat(filePath, providerName) {
166
+ const ext = extname(filePath).toLowerCase().replace(".", "");
167
+ if (providerName === "elevenlabs") {
168
+ const map = { mp3: "mp3_44100_128", wav: "pcm_44100", flac: "flac" };
169
+ return map[ext] ?? "mp3_44100_128";
170
+ }
171
+ return ext || "mp3";
172
+ }
173
+ function createSpeakTool(cwd, sandbox, fs, shell, configuredModel, vault) {
245
174
  return {
246
175
  name: "audio_speak",
247
176
  label: "Text to Speech",
248
177
  description: "Generate speech audio from text using text-to-speech AI. " +
249
- "Output format is inferred from file extension (mp3, wav, flac, opus, aac, pcm). " +
250
- "Providers: openai (default), deepgram (Aura), elevenlabs, edge (free, no API key — Microsoft Edge neural voices). " +
251
- "If the chosen provider fails (quota, auth, billing), edge-tts is tried automatically as fallback. " +
252
- "Use 'language' (ISO 639-1) and 'gender' params to help select the right voice, especially for edge provider. " +
253
- "Credentials resolved from: agent vault > OPENAI_API_KEY, DEEPGRAM_API_KEY, or ELEVENLABS_API_KEY env var.",
178
+ "Output format inferred from file extension (mp3, wav, flac, opus, aac, pcm). " +
179
+ "Model is configured at agent level (tts_model) — pass `model` here only to override per-call. " +
180
+ "Default: openai/tts-1. Supported providers: openai, deepgram, elevenlabs, edge (free, local Microsoft Edge TTS — no API key needed).",
254
181
  parameters: AudioSpeakSchema,
255
182
  async execute(_id, params, signal) {
256
183
  const filePath = resolve(cwd, params.path);
257
184
  assertPathAllowed(filePath, sandbox, "audio_speak");
258
- const provider = params.provider ?? "openai";
259
- // Direct edge-tts request — no fallback needed
260
- if (provider === "edge") {
261
- if (!(await edgeTtsAvailable(shell))) {
262
- return {
263
- content: [{ type: "text", text: "Edge TTS not available in this environment. Use openai/deepgram/elevenlabs instead." }],
264
- details: { provider: "edge", error: "edge_tts_unavailable" },
265
- };
266
- }
267
- try {
268
- return await speakEdgeTts(filePath, params, fs, shell);
269
- }
270
- catch (err) {
271
- return {
272
- content: [{ type: "text", text: `TTS error (edge): ${err.message}` }],
273
- details: { provider: "edge", error: err.message },
274
- };
275
- }
276
- }
277
- // Provider with edge-tts fallback
278
185
  try {
279
- if (provider === "openai") {
280
- return await speakOpenAI(filePath, params, fs, vault, signal);
281
- }
282
- else if (provider === "deepgram") {
283
- return await speakDeepgram(filePath, params, fs, vault, signal);
284
- }
285
- else {
286
- return await speakElevenLabs(filePath, params, fs, vault, signal);
287
- }
186
+ const parsed = resolveEffectiveModel(params.model, configuredModel, DEFAULT_TTS_MODEL);
187
+ return await speakWithSdk(filePath, parsed, params, fs, shell, vault, signal);
288
188
  }
289
189
  catch (err) {
290
- // Automatic fallback to edge-tts if available
291
- if (await edgeTtsAvailable(shell)) {
292
- try {
293
- const result = await speakEdgeTts(filePath, params, fs, shell);
294
- // Prepend fallback notice
295
- const notice = `[Fallback] ${provider} failed (${err.message}), used edge-tts instead.\n`;
296
- return {
297
- content: [{ type: "text", text: notice + result.content[0].text }],
298
- details: { ...result.details, fallbackFrom: provider, fallbackReason: err.message },
299
- };
300
- }
301
- catch (edgeErr) {
302
- return {
303
- content: [{ type: "text", text: `TTS error (${provider}): ${err.message}\nEdge-tts fallback also failed: ${edgeErr.message}` }],
304
- details: { provider, error: err.message, edgeError: edgeErr.message },
305
- };
306
- }
307
- }
308
190
  return {
309
- content: [{ type: "text", text: `TTS error (${provider}): ${err.message}` }],
310
- details: { provider, error: err.message },
191
+ content: [{ type: "text", text: `TTS error: ${err.message}` }],
192
+ details: { error: err.message },
311
193
  };
312
194
  }
313
195
  },
314
196
  };
315
197
  }
316
- async function speakOpenAI(filePath, params, fs, vault, signal) {
317
- const apiKey = vault?.getKey("openai", "key") ?? requireEnv("OPENAI_API_KEY");
318
- const model = params.model ?? "tts-1";
319
- const voice = params.voice ?? "alloy";
320
- const ext = extname(filePath).toLowerCase().replace(".", "");
321
- const formatMap = {
322
- mp3: "mp3", wav: "wav", flac: "flac", opus: "opus", aac: "aac", pcm: "pcm",
323
- };
324
- const responseFormat = formatMap[ext] ?? "mp3";
325
- const body = {
326
- model,
327
- input: params.text,
328
- voice,
329
- response_format: responseFormat,
330
- };
331
- if (params.speed !== undefined)
332
- body.speed = params.speed;
333
- if (params.instructions)
334
- body.instructions = params.instructions;
335
- const controller = new AbortController();
336
- const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT);
337
- if (signal)
338
- signal.addEventListener("abort", () => controller.abort(), { once: true });
339
- const response = await fetch("https://api.openai.com/v1/audio/speech", {
340
- method: "POST",
341
- headers: {
342
- Authorization: `Bearer ${apiKey}`,
343
- "Content-Type": "application/json",
344
- },
345
- body: JSON.stringify(body),
346
- signal: controller.signal,
347
- });
348
- clearTimeout(timer);
349
- if (!response.ok) {
350
- const errText = await response.text();
351
- throw new Error(`OpenAI TTS API ${response.status}: ${errText}`);
198
+ async function speakWithSdk(filePath, parsed, params, fs, shell, vault, signal) {
199
+ const { experimental_generateSpeech } = await import("ai");
200
+ const providerName = parsed.provider;
201
+ const voice = params.voice ?? SPEAK_DEFAULT_VOICES[providerName];
202
+ // Cloud providers need an apiKey. The edge provider needs shell+fs.
203
+ let apiKey;
204
+ if (providerName === "openai") {
205
+ apiKey = vault?.getKey("openai", "key") ?? requireEnv("OPENAI_API_KEY");
352
206
  }
353
- const buffer = Buffer.from(await response.arrayBuffer());
354
- if (!fs.writeFileBuffer) {
355
- throw new Error("FileSystem implementation does not support writeFileBuffer (required for binary writes).");
207
+ else if (providerName === "deepgram") {
208
+ apiKey = vault?.getKey("deepgram", "key") ?? requireEnv("DEEPGRAM_API_KEY");
356
209
  }
357
- await fs.mkdir(dirname(filePath));
358
- await fs.writeFileBuffer(filePath, new Uint8Array(buffer));
359
- return {
360
- content: [{ type: "text", text: `Speech audio saved: ${filePath} (${(buffer.byteLength / 1024).toFixed(1)} KB, ${responseFormat}, voice: ${voice}, model: ${model})` }],
361
- details: {
362
- provider: "openai",
363
- model,
364
- voice,
365
- format: responseFormat,
366
- path: filePath,
367
- bytes: buffer.byteLength,
368
- textLength: params.text.length,
369
- },
370
- };
371
- }
372
- async function speakDeepgram(filePath, params, fs, vault, signal) {
373
- const apiKey = vault?.getKey("deepgram", "key") ?? requireEnv("DEEPGRAM_API_KEY");
374
- const model = params.model ?? "aura-2-en";
375
- const controller = new AbortController();
376
- const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT);
377
- if (signal)
378
- signal.addEventListener("abort", () => controller.abort(), { once: true });
379
- const response = await fetch(`https://api.deepgram.com/v1/speak?model=${encodeURIComponent(model)}`, {
380
- method: "POST",
381
- headers: {
382
- Authorization: `Token ${apiKey}`,
383
- "Content-Type": "application/json",
384
- },
385
- body: JSON.stringify({ text: params.text }),
386
- signal: controller.signal,
387
- });
388
- clearTimeout(timer);
389
- if (!response.ok) {
390
- const errText = await response.text();
391
- throw new Error(`Deepgram TTS API ${response.status}: ${errText}`);
210
+ else if (providerName === "elevenlabs") {
211
+ apiKey = vault?.getKey("elevenlabs", "key") ?? requireEnv("ELEVENLABS_API_KEY");
392
212
  }
393
- const buffer = Buffer.from(await response.arrayBuffer());
394
- if (!fs.writeFileBuffer) {
395
- throw new Error("FileSystem implementation does not support writeFileBuffer (required for binary writes).");
213
+ else if (providerName !== "edge") {
214
+ throw new Error(`Unsupported tts provider: ${providerName}`);
396
215
  }
397
- await fs.mkdir(dirname(filePath));
398
- await fs.writeFileBuffer(filePath, new Uint8Array(buffer));
399
- return {
400
- content: [{ type: "text", text: `Speech audio saved: ${filePath} (${(buffer.byteLength / 1024).toFixed(1)} KB, model: ${model})` }],
401
- details: {
402
- provider: "deepgram",
403
- model,
404
- format: "mp3",
405
- path: filePath,
406
- bytes: buffer.byteLength,
407
- textLength: params.text.length,
408
- },
409
- };
410
- }
411
- async function speakElevenLabs(filePath, params, fs, vault, signal) {
412
- const apiKey = vault?.getKey("elevenlabs", "key") ?? requireEnv("ELEVENLABS_API_KEY");
413
- const model = params.model ?? "eleven_multilingual_v2";
414
- // ElevenLabs default voice: "Rachel" (21m00Tcm4TlvDq8ikWAM)
415
- const voiceId = params.voice ?? "21m00Tcm4TlvDq8ikWAM";
416
- const ext = extname(filePath).toLowerCase().replace(".", "");
417
- const formatMap = {
418
- mp3: "mp3_44100_128", wav: "pcm_44100", flac: "flac",
419
- };
420
- const outputFormat = formatMap[ext] ?? "mp3_44100_128";
421
- const controller = new AbortController();
422
- const timer = setTimeout(() => controller.abort(), DEFAULT_TIMEOUT);
423
- if (signal)
424
- signal.addEventListener("abort", () => controller.abort(), { once: true });
425
- const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}?output_format=${outputFormat}`, {
426
- method: "POST",
427
- headers: {
428
- "xi-api-key": apiKey,
429
- "Content-Type": "application/json",
430
- },
431
- body: JSON.stringify({
432
- text: params.text,
433
- model_id: model,
434
- }),
435
- signal: controller.signal,
216
+ const provider = await resolveSpeakProvider(providerName, { apiKey, shell, fs });
217
+ // Provider-specific knobs flow through providerOptions.
218
+ const providerOptions = {};
219
+ if (providerName === "openai") {
220
+ const opts = {};
221
+ if (params.speed !== undefined)
222
+ opts.speed = params.speed;
223
+ if (params.instructions)
224
+ opts.instructions = params.instructions;
225
+ if (Object.keys(opts).length)
226
+ providerOptions.openai = opts;
227
+ }
228
+ if (providerName === "edge" && params.gender) {
229
+ providerOptions.edge = { gender: params.gender };
230
+ }
231
+ const outputFormat = audioFormat(filePath, providerName);
232
+ const result = await experimental_generateSpeech({
233
+ model: provider.speech(parsed.model),
234
+ text: params.text,
235
+ voice,
236
+ outputFormat,
237
+ language: params.language,
238
+ instructions: params.instructions,
239
+ speed: params.speed,
240
+ providerOptions: Object.keys(providerOptions).length ? providerOptions : undefined,
241
+ abortSignal: signal,
436
242
  });
437
- clearTimeout(timer);
438
- if (!response.ok) {
439
- const errText = await response.text();
440
- throw new Error(`ElevenLabs API ${response.status}: ${errText}`);
243
+ const bytes = result.audio.uint8Array;
244
+ if (!bytes || bytes.byteLength === 0) {
245
+ throw new Error("No audio bytes in SDK response");
441
246
  }
442
- const buffer = Buffer.from(await response.arrayBuffer());
443
247
  if (!fs.writeFileBuffer) {
444
248
  throw new Error("FileSystem implementation does not support writeFileBuffer (required for binary writes).");
445
249
  }
446
250
  await fs.mkdir(dirname(filePath));
447
- await fs.writeFileBuffer(filePath, new Uint8Array(buffer));
251
+ await fs.writeFileBuffer(filePath, bytes);
252
+ const voiceLabel = voice ?? "(model-bound)";
253
+ const summary = `Speech audio saved: ${filePath} (${(bytes.byteLength / 1024).toFixed(1)} KB, ${outputFormat}, voice: ${voiceLabel}, model: ${parsed.provider}/${parsed.model})`;
448
254
  return {
449
- content: [{ type: "text", text: `Speech audio saved: ${filePath} (${(buffer.byteLength / 1024).toFixed(1)} KB, voice: ${voiceId}, model: ${model})` }],
255
+ content: [{ type: "text", text: summary }],
450
256
  details: {
451
- provider: "elevenlabs",
452
- model,
453
- voiceId,
257
+ provider: providerName,
258
+ model: parsed.model,
259
+ voice: voiceLabel,
454
260
  format: outputFormat,
455
261
  path: filePath,
456
- bytes: buffer.byteLength,
457
- textLength: params.text.length,
458
- },
459
- };
460
- }
461
- // ─── Edge TTS (free, local CLI, automatic fallback) ───
462
- /**
463
- * Default Edge TTS voices per language+gender.
464
- * Format: `${lang}-${region}-${name}Neural`
465
- * Each entry: [female, male]. First match wins.
466
- */
467
- const EDGE_VOICES = {
468
- "it": ["it-IT-ElsaNeural", "it-IT-DiegoNeural"],
469
- "en": ["en-US-EmmaMultilingualNeural", "en-US-AndrewMultilingualNeural"],
470
- "es": ["es-ES-ElviraNeural", "es-ES-AlvaroNeural"],
471
- "fr": ["fr-FR-DeniseNeural", "fr-FR-HenriNeural"],
472
- "de": ["de-DE-KatjaNeural", "de-DE-ConradNeural"],
473
- "pt": ["pt-BR-FranciscaNeural", "pt-BR-AntonioNeural"],
474
- "ja": ["ja-JP-NanamiNeural", "ja-JP-KeitaNeural"],
475
- "zh": ["zh-CN-XiaoxiaoNeural", "zh-CN-YunxiNeural"],
476
- "ko": ["ko-KR-SunHiNeural", "ko-KR-InJoonNeural"],
477
- "ar": ["ar-SA-ZariyahNeural", "ar-SA-HamedNeural"],
478
- "hi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"],
479
- "ru": ["ru-RU-SvetlanaNeural", "ru-RU-DmitryNeural"],
480
- "nl": ["nl-NL-ColetteNeural", "nl-NL-MaartenNeural"],
481
- "pl": ["pl-PL-AgnieszkaNeural", "pl-PL-MarekNeural"],
482
- "tr": ["tr-TR-EmelNeural", "tr-TR-AhmetNeural"],
483
- "sv": ["sv-SE-SofieNeural", "sv-SE-MattiasNeural"],
484
- };
485
- /**
486
- * Resolve the best Edge TTS voice for a given language and gender hint.
487
- * Falls back to en-US if the language is unknown.
488
- */
489
- function resolveEdgeVoice(voice, language, gender) {
490
- // If the agent passed an explicit voice name like "it-IT-DiegoNeural", use it directly
491
- if (voice && voice.includes("-") && voice.endsWith("Neural"))
492
- return voice;
493
- const lang = (language ?? "en").toLowerCase().split("-")[0]; // "it-IT" → "it"
494
- const pair = EDGE_VOICES[lang] ?? EDGE_VOICES["en"];
495
- return gender === "male" ? pair[1] : pair[0]; // default female if no gender hint
496
- }
497
- /** Per-Shell cache of "is edge-tts on PATH" — checked once per shell. */
498
- const _edgeTtsAvailable = new WeakMap();
499
- /** Check if edge-tts CLI is available, routed through the Shell so the
500
- * check runs in the same environment as the actual TTS call (sandbox
501
- * in cloud, local Node in OSS). */
502
- function edgeTtsAvailable(shell) {
503
- const existing = _edgeTtsAvailable.get(shell);
504
- if (existing)
505
- return existing;
506
- const fresh = shell
507
- .execute("edge-tts --version", { timeout: 5000 })
508
- .then((r) => r.exitCode === 0)
509
- .catch(() => false);
510
- _edgeTtsAvailable.set(shell, fresh);
511
- return fresh;
512
- }
513
- /** Quote a CLI argument for inclusion in a `shell.execute` command line. */
514
- function quoteArg(arg) {
515
- return `'${arg.replace(/'/g, `'\\''`)}'`;
516
- }
517
- async function speakEdgeTts(filePath, params, fs, shell) {
518
- if (!(await edgeTtsAvailable(shell))) {
519
- throw new Error("edge-tts CLI is not installed. Install it with: pip install edge-tts");
520
- }
521
- const voice = resolveEdgeVoice(params.voice, params.language, params.gender);
522
- await fs.mkdir(dirname(filePath));
523
- const cmd = [
524
- "edge-tts",
525
- "--text", quoteArg(params.text),
526
- "--voice", quoteArg(voice),
527
- "--write-media", quoteArg(filePath),
528
- ].join(" ");
529
- const result = await shell.execute(cmd, { timeout: DEFAULT_TIMEOUT });
530
- if (result.exitCode !== 0) {
531
- throw new Error(`edge-tts failed: ${(result.stderr || result.stdout || "").trim() || `exit ${result.exitCode}`}`);
532
- }
533
- let bytes = 0;
534
- try {
535
- const stat = await fs.stat(filePath);
536
- bytes = stat?.size ?? 0;
537
- }
538
- catch { /* ignore */ }
539
- return {
540
- content: [{ type: "text", text: `Speech audio saved: ${filePath} (${(bytes / 1024).toFixed(1)} KB, voice: ${voice}, provider: edge-tts)` }],
541
- details: {
542
- provider: "edge",
543
- voice,
544
- path: filePath,
545
- bytes,
262
+ bytes: bytes.byteLength,
546
263
  textLength: params.text.length,
547
264
  },
548
265
  };
@@ -551,21 +268,22 @@ export const ALL_AUDIO_TOOL_NAMES = ["audio_transcribe", "audio_speak"];
551
268
  /**
552
269
  * Create audio tools for speech-to-text and text-to-speech.
553
270
  *
554
- * @param cwd - Working directory for resolving file paths
555
- * @param allowedPaths - Sandbox paths for file validation
556
- * @param allowedTools - Optional filter
557
- * @param vault - Resolved vault credentials for credential resolution
271
+ * The 6-arg positional signature is preserved for back-compat. Prefer
272
+ * the options-object form for new callers.
558
273
  */
559
274
  export function createAudioTools(cwd, allowedPaths, allowedTools, vault, fs, shell) {
560
- const sandbox = resolveAllowedPaths(cwd, allowedPaths);
561
- const _fs = fs ?? new NodeFileSystem();
562
- const _shell = shell ?? new NodeShell();
275
+ const opts = typeof cwd === "string"
276
+ ? { cwd, allowedPaths, allowedTools, vault, fs, shell }
277
+ : cwd;
278
+ const sandbox = resolveAllowedPaths(opts.cwd, opts.allowedPaths);
279
+ const _fs = opts.fs ?? new NodeFileSystem();
280
+ const _shell = opts.shell ?? new NodeShell();
563
281
  const factories = {
564
- audio_transcribe: () => createTranscribeTool(cwd, sandbox, _fs, vault),
565
- audio_speak: () => createSpeakTool(cwd, sandbox, _fs, _shell, vault),
282
+ audio_transcribe: () => createTranscribeTool(opts.cwd, sandbox, _fs, opts.transcribeModel, opts.vault),
283
+ audio_speak: () => createSpeakTool(opts.cwd, sandbox, _fs, _shell, opts.ttsModel, opts.vault),
566
284
  };
567
- const names = allowedTools
568
- ? ALL_AUDIO_TOOL_NAMES.filter(n => allowedTools.some(a => a.toLowerCase() === n))
285
+ const names = opts.allowedTools
286
+ ? ALL_AUDIO_TOOL_NAMES.filter(n => opts.allowedTools.some(a => a.toLowerCase() === n))
569
287
  : ALL_AUDIO_TOOL_NAMES;
570
288
  return names.map(n => factories[n]());
571
289
  }