@juspay/neurolink 9.61.2 → 9.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +23 -17
  3. package/dist/adapters/tts/googleTTSHandler.js +1 -1
  4. package/dist/browser/neurolink.min.js +373 -355
  5. package/dist/cli/commands/serve.js +9 -0
  6. package/dist/cli/commands/voiceServer.d.ts +7 -0
  7. package/dist/cli/commands/voiceServer.js +9 -1
  8. package/dist/cli/factories/commandFactory.js +136 -11
  9. package/dist/cli/loop/optionsSchema.d.ts +1 -1
  10. package/dist/cli/utils/audioFileUtils.d.ts +3 -3
  11. package/dist/cli/utils/audioFileUtils.js +5 -1
  12. package/dist/core/baseProvider.js +29 -6
  13. package/dist/factories/providerRegistry.d.ts +14 -0
  14. package/dist/factories/providerRegistry.js +141 -2
  15. package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
  16. package/dist/lib/core/baseProvider.js +29 -6
  17. package/dist/lib/factories/providerRegistry.d.ts +14 -0
  18. package/dist/lib/factories/providerRegistry.js +141 -2
  19. package/dist/lib/neurolink.d.ts +19 -0
  20. package/dist/lib/neurolink.js +248 -12
  21. package/dist/lib/observability/exporters/laminarExporter.js +1 -0
  22. package/dist/lib/observability/exporters/posthogExporter.js +1 -0
  23. package/dist/lib/observability/utils/spanSerializer.js +1 -0
  24. package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
  25. package/dist/lib/server/voice/tokenCompare.js +23 -0
  26. package/dist/lib/server/voice/voiceServerApp.js +62 -3
  27. package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
  28. package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
  29. package/dist/lib/types/generate.d.ts +47 -0
  30. package/dist/lib/types/index.d.ts +1 -1
  31. package/dist/lib/types/index.js +1 -1
  32. package/dist/lib/types/realtime.d.ts +243 -0
  33. package/dist/lib/types/realtime.js +70 -0
  34. package/dist/lib/types/server.d.ts +68 -0
  35. package/dist/lib/types/span.d.ts +2 -0
  36. package/dist/lib/types/span.js +2 -0
  37. package/dist/lib/types/stream.d.ts +36 -14
  38. package/dist/lib/types/stt.d.ts +585 -0
  39. package/dist/lib/types/stt.js +90 -0
  40. package/dist/lib/types/tts.d.ts +23 -11
  41. package/dist/lib/types/tts.js +7 -0
  42. package/dist/lib/types/voice.d.ts +272 -0
  43. package/dist/lib/types/voice.js +137 -0
  44. package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
  45. package/dist/lib/utils/audioFormatDetector.js +34 -0
  46. package/dist/lib/utils/sttProcessor.d.ts +115 -0
  47. package/dist/lib/utils/sttProcessor.js +295 -0
  48. package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
  49. package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
  50. package/dist/lib/voice/audio-utils.d.ts +135 -0
  51. package/dist/lib/voice/audio-utils.js +435 -0
  52. package/dist/lib/voice/errors.d.ts +123 -0
  53. package/dist/lib/voice/errors.js +386 -0
  54. package/dist/lib/voice/index.d.ts +26 -0
  55. package/dist/lib/voice/index.js +55 -0
  56. package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
  57. package/dist/lib/voice/providers/AzureSTT.js +345 -0
  58. package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
  59. package/dist/lib/voice/providers/AzureTTS.js +349 -0
  60. package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
  61. package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
  62. package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
  63. package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
  64. package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
  65. package/dist/lib/voice/providers/GeminiLive.js +372 -0
  66. package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
  67. package/dist/lib/voice/providers/GoogleSTT.js +454 -0
  68. package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
  69. package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
  70. package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
  71. package/dist/lib/voice/providers/OpenAISTT.js +286 -0
  72. package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
  73. package/dist/lib/voice/providers/OpenAITTS.js +271 -0
  74. package/dist/lib/voice/stream-handler.d.ts +166 -0
  75. package/dist/lib/voice/stream-handler.js +514 -0
  76. package/dist/neurolink.d.ts +19 -0
  77. package/dist/neurolink.js +248 -12
  78. package/dist/observability/exporters/laminarExporter.js +1 -0
  79. package/dist/observability/exporters/posthogExporter.js +1 -0
  80. package/dist/observability/utils/spanSerializer.js +1 -0
  81. package/dist/server/voice/tokenCompare.d.ts +14 -0
  82. package/dist/server/voice/tokenCompare.js +22 -0
  83. package/dist/server/voice/voiceServerApp.js +62 -3
  84. package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
  85. package/dist/server/voice/voiceWebSocketHandler.js +555 -435
  86. package/dist/types/generate.d.ts +47 -0
  87. package/dist/types/index.d.ts +1 -1
  88. package/dist/types/index.js +1 -1
  89. package/dist/types/realtime.d.ts +243 -0
  90. package/dist/types/realtime.js +69 -0
  91. package/dist/types/server.d.ts +68 -0
  92. package/dist/types/span.d.ts +2 -0
  93. package/dist/types/span.js +2 -0
  94. package/dist/types/stream.d.ts +36 -14
  95. package/dist/types/stt.d.ts +585 -0
  96. package/dist/types/stt.js +89 -0
  97. package/dist/types/tts.d.ts +23 -11
  98. package/dist/types/tts.js +7 -0
  99. package/dist/types/voice.d.ts +272 -0
  100. package/dist/types/voice.js +136 -0
  101. package/dist/utils/audioFormatDetector.d.ts +15 -0
  102. package/dist/utils/audioFormatDetector.js +33 -0
  103. package/dist/utils/sttProcessor.d.ts +115 -0
  104. package/dist/utils/sttProcessor.js +294 -0
  105. package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
  106. package/dist/voice/RealtimeVoiceAPI.js +438 -0
  107. package/dist/voice/audio-utils.d.ts +135 -0
  108. package/dist/voice/audio-utils.js +434 -0
  109. package/dist/voice/errors.d.ts +123 -0
  110. package/dist/voice/errors.js +385 -0
  111. package/dist/voice/index.d.ts +26 -0
  112. package/dist/voice/index.js +54 -0
  113. package/dist/voice/providers/AzureSTT.d.ts +47 -0
  114. package/dist/voice/providers/AzureSTT.js +344 -0
  115. package/dist/voice/providers/AzureTTS.d.ts +59 -0
  116. package/dist/voice/providers/AzureTTS.js +348 -0
  117. package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
  118. package/dist/voice/providers/DeepgramSTT.js +549 -0
  119. package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
  120. package/dist/voice/providers/ElevenLabsTTS.js +310 -0
  121. package/dist/voice/providers/GeminiLive.d.ts +52 -0
  122. package/dist/voice/providers/GeminiLive.js +371 -0
  123. package/dist/voice/providers/GoogleSTT.d.ts +60 -0
  124. package/dist/voice/providers/GoogleSTT.js +453 -0
  125. package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
  126. package/dist/voice/providers/OpenAIRealtime.js +411 -0
  127. package/dist/voice/providers/OpenAISTT.d.ts +41 -0
  128. package/dist/voice/providers/OpenAISTT.js +285 -0
  129. package/dist/voice/providers/OpenAITTS.d.ts +49 -0
  130. package/dist/voice/providers/OpenAITTS.js +270 -0
  131. package/dist/voice/stream-handler.d.ts +166 -0
  132. package/dist/voice/stream-handler.js +513 -0
  133. package/package.json +3 -1
@@ -145,6 +145,15 @@ export class ServeCommandFactory {
145
145
  builder: (yargs) => {
146
146
  return yargs
147
147
  .command("status", "Show server status", (yargs) => ServeCommandFactory.buildStatusOptions(yargs), (argv) => ServeCommandFactory.executeStatus(argv))
148
+ .command("voice", "Start the real-time voice assistant server (OpenAI Realtime / Gemini Live)", (yargs) => yargs.option("port", {
149
+ alias: "p",
150
+ type: "number",
151
+ default: 3000,
152
+ describe: "Port to listen on",
153
+ }), async (argv) => {
154
+ const { startVoiceServer } = await import("../../lib/server/voice/voiceServerApp.js");
155
+ await startVoiceServer(argv.port);
156
+ })
148
157
  .option("port", {
149
158
  type: "number",
150
159
  alias: "p",
@@ -1,3 +1,10 @@
1
1
  import type { CommandModule } from "yargs";
2
2
  import type { VoiceServerArgs } from "../../lib/types/index.js";
3
+ /**
4
+ * @deprecated Use `neurolink serve voice` instead. This top-level alias is
5
+ * kept for one release for backwards compatibility and will be removed in a
6
+ * future version. The voice server is now subsumed under the existing
7
+ * `serve` infra command per CLAUDE.md's "everything via generate/stream/serve
8
+ * only" contract.
9
+ */
3
10
  export declare const voiceServerCommand: CommandModule<object, VoiceServerArgs>;
@@ -1,8 +1,15 @@
1
1
  import { startVoiceServer } from "../../lib/server/voice/voiceServerApp.js";
2
2
  import { configureVoiceServerEnvironment } from "../../lib/server/voice/voiceWebSocketHandler.js";
3
+ /**
4
+ * @deprecated Use `neurolink serve voice` instead. This top-level alias is
5
+ * kept for one release for backwards compatibility and will be removed in a
6
+ * future version. The voice server is now subsumed under the existing
7
+ * `serve` infra command per CLAUDE.md's "everything via generate/stream/serve
8
+ * only" contract.
9
+ */
3
10
  export const voiceServerCommand = {
4
11
  command: "voice-server",
5
- describe: "Start the real-time voice assistant server (Soniox STT + Cartesia TTS + Cobra VAD)",
12
+ describe: "[DEPRECATED — use 'neurolink serve voice'] Start the real-time voice assistant server",
6
13
  builder: (yargs) => yargs.option("port", {
7
14
  alias: "p",
8
15
  type: "number",
@@ -10,6 +17,7 @@ export const voiceServerCommand = {
10
17
  describe: "Port to listen on",
11
18
  }),
12
19
  handler: async (argv) => {
20
+ console.warn("[deprecation] 'neurolink voice-server' is deprecated. Use 'neurolink serve voice' instead. This alias will be removed in a future release.");
13
21
  configureVoiceServerEnvironment();
14
22
  await startVoiceServer(argv.port);
15
23
  },
@@ -276,9 +276,25 @@ export class CLICommandFactory {
276
276
  type: "string",
277
277
  description: "TTS voice to use (e.g., 'en-US-Neural2-C')",
278
278
  },
279
+ ttsProvider: {
280
+ type: "string",
281
+ choices: ["google-ai", "vertex", "openai-tts", "elevenlabs", "azure-tts"],
282
+ description: "TTS provider (overrides --provider for speech synthesis)",
283
+ },
279
284
  ttsFormat: {
280
285
  type: "string",
281
- choices: ["mp3", "wav", "ogg", "opus"],
286
+ choices: [
287
+ "mp3",
288
+ "wav",
289
+ "ogg",
290
+ "opus",
291
+ "m4a",
292
+ "flac",
293
+ "webm",
294
+ "mp4",
295
+ "mpeg",
296
+ "mpga",
297
+ ],
282
298
  default: "mp3",
283
299
  description: "Audio output format",
284
300
  },
@@ -302,6 +318,25 @@ export class CLICommandFactory {
302
318
  default: false,
303
319
  description: "Auto-play generated audio",
304
320
  },
321
+ // STT (Speech-to-Text) options
322
+ stt: {
323
+ type: "boolean",
324
+ default: false,
325
+ description: "Enable speech-to-text transcription of input audio",
326
+ },
327
+ sttProvider: {
328
+ type: "string",
329
+ choices: ["whisper", "deepgram", "google-stt", "azure-stt"],
330
+ description: "STT provider to use",
331
+ },
332
+ sttLanguage: {
333
+ type: "string",
334
+ description: "Audio language code for STT (e.g., en-US)",
335
+ },
336
+ inputAudio: {
337
+ type: "string",
338
+ description: "Path to audio file for STT transcription",
339
+ },
305
340
  // Video Generation options (Veo 3.1)
306
341
  outputMode: {
307
342
  type: "string",
@@ -439,10 +474,16 @@ export class CLICommandFactory {
439
474
  };
440
475
  // Helper method to build options for commands
441
476
  static buildOptions(yargs, additionalOptions = {}) {
442
- return yargs.options({
477
+ return (yargs
478
+ .options({
443
479
  ...CLICommandFactory.commonOptions,
444
480
  ...additionalOptions,
445
- });
481
+ })
482
+ // NEW9: implies relationships so users who pass --stt-provider or
483
+ // --input-audio without --stt get an actionable error from yargs
484
+ // instead of silently skipping STT.
485
+ .implies("sttProvider", "stt")
486
+ .implies("inputAudio", "stt"));
446
487
  }
447
488
  // Helper method to process CLI images with smart auto-detection
448
489
  static processCliImages(images) {
@@ -603,11 +644,17 @@ export class CLICommandFactory {
603
644
  // TTS options
604
645
  tts: argv.tts,
605
646
  ttsVoice: argv.ttsVoice,
647
+ ttsProvider: argv.ttsProvider,
606
648
  ttsFormat: argv.ttsFormat,
607
649
  ttsSpeed: argv.ttsSpeed,
608
650
  ttsQuality: argv.ttsQuality,
609
651
  ttsOutput: argv.ttsOutput,
610
652
  ttsPlay: argv.ttsPlay,
653
+ // STT options
654
+ stt: argv.stt,
655
+ sttProvider: argv.sttProvider,
656
+ sttLanguage: argv.sttLanguage,
657
+ inputAudio: argv.inputAudio,
611
658
  // Video generation options (Veo 3.1)
612
659
  outputMode: argv.outputMode,
613
660
  videoOutput: argv.videoOutput,
@@ -1080,7 +1127,7 @@ export class CLICommandFactory {
1080
1127
  */
1081
1128
  static createGenerateCommand() {
1082
1129
  return {
1083
- command: ["generate <input>", "gen <input>"],
1130
+ command: ["generate [input]", "gen [input]"],
1084
1131
  describe: "Generate content using AI providers",
1085
1132
  builder: (yargs) => {
1086
1133
  return CLICommandFactory.buildOptions(yargs
@@ -1113,7 +1160,7 @@ export class CLICommandFactory {
1113
1160
  */
1114
1161
  static createStreamCommand() {
1115
1162
  return {
1116
- command: "stream <input>",
1163
+ command: "stream [input]",
1117
1164
  describe: "Stream generation in real-time",
1118
1165
  builder: (yargs) => {
1119
1166
  return CLICommandFactory.buildOptions(yargs
@@ -1642,6 +1689,12 @@ export class CLICommandFactory {
1642
1689
  * Handle stdin input for generate command
1643
1690
  */
1644
1691
  static async handleGenerateStdinInput(argv) {
1692
+ // M10: STT-only runs (--stt + --input-audio with no positional prompt)
1693
+ // are valid — the transcription becomes the prompt downstream. Skip the
1694
+ // stdin/empty-input check in that case so users don't get
1695
+ // "Input required..." for an STT-only command.
1696
+ const isSttOnly = !!(argv.stt &&
1697
+ argv.inputAudio);
1645
1698
  if (!argv.input && !process.stdin.isTTY) {
1646
1699
  let stdinData = "";
1647
1700
  process.stdin.setEncoding("utf8");
@@ -1650,11 +1703,17 @@ export class CLICommandFactory {
1650
1703
  }
1651
1704
  const trimmedData = stdinData.trim();
1652
1705
  if (!trimmedData) {
1706
+ if (isSttOnly) {
1707
+ return "";
1708
+ }
1653
1709
  throw new Error("No input received from stdin");
1654
1710
  }
1655
1711
  return trimmedData;
1656
1712
  }
1657
1713
  else if (!argv.input) {
1714
+ if (isSttOnly) {
1715
+ return "";
1716
+ }
1658
1717
  throw new Error('Input required. Use: neurolink generate "your prompt" or echo "prompt" | neurolink generate');
1659
1718
  }
1660
1719
  return argv.input;
@@ -1903,6 +1962,20 @@ export class CLICommandFactory {
1903
1962
  // Build multimodal input and output configuration
1904
1963
  const generateInput = CLICommandFactory.buildGenerateMultimodalInput(inputText, argv);
1905
1964
  const outputConfig = CLICommandFactory.buildGenerateOutputConfig(isVideoMode, isPPTMode, enhancedOptions);
1965
+ // Read audio file for STT if --input-audio is provided.
1966
+ // NEW10: existsSync guard mirrors the stream handler so a missing file
1967
+ // produces a friendly error here too instead of a raw ENOENT crash.
1968
+ const inputAudioPath = enhancedOptions.inputAudio;
1969
+ if (inputAudioPath && !fs.existsSync(inputAudioPath)) {
1970
+ throw new Error(`--input-audio file not found: ${inputAudioPath}`);
1971
+ }
1972
+ const inputAudioBuffer = inputAudioPath
1973
+ ? fs.readFileSync(inputAudioPath)
1974
+ : undefined;
1975
+ // m2: shared format helper (was duplicated in generate + stream
1976
+ // handlers; now lives in src/lib/utils/audioFormatDetector.ts).
1977
+ const { inferAudioFormatFromPath } = await import("../../lib/utils/audioFormatDetector.js");
1978
+ const inputAudioFormat = inferAudioFormatFromPath(inputAudioPath);
1906
1979
  const runGenerate = () => sdk.generate({
1907
1980
  input: generateInput,
1908
1981
  csvOptions: {
@@ -1956,6 +2029,7 @@ export class CLICommandFactory {
1956
2029
  enabled: true,
1957
2030
  useAiResponse: true,
1958
2031
  voice: enhancedOptions.ttsVoice,
2032
+ provider: enhancedOptions.ttsProvider,
1959
2033
  format: enhancedOptions.ttsFormat || undefined,
1960
2034
  speed: enhancedOptions.ttsSpeed,
1961
2035
  quality: enhancedOptions.ttsQuality,
@@ -1963,6 +2037,16 @@ export class CLICommandFactory {
1963
2037
  play: enhancedOptions.ttsPlay,
1964
2038
  }
1965
2039
  : undefined,
2040
+ // STT configuration
2041
+ stt: enhancedOptions.stt
2042
+ ? {
2043
+ enabled: true,
2044
+ provider: enhancedOptions.sttProvider,
2045
+ language: enhancedOptions.sttLanguage,
2046
+ ...(inputAudioBuffer && { audio: inputAudioBuffer }),
2047
+ ...(inputAudioFormat && { format: inputAudioFormat }),
2048
+ }
2049
+ : undefined,
1966
2050
  });
1967
2051
  const result = await runGenerate();
1968
2052
  // Handle successful result
@@ -2094,7 +2178,7 @@ export class CLICommandFactory {
2094
2178
  const pdfFiles = CLICommandFactory.processCliPDFFiles(argv.pdf);
2095
2179
  const videoFiles = CLICommandFactory.processCliVideoFiles(argv.video);
2096
2180
  const files = CLICommandFactory.processCliFiles(argv.file);
2097
- const runStream = () => sdk.stream({
2181
+ const runStream = async () => sdk.stream({
2098
2182
  input: {
2099
2183
  text: inputText,
2100
2184
  ...(imageBuffers && { images: imageBuffers }),
@@ -2153,14 +2237,39 @@ export class CLICommandFactory {
2153
2237
  enabled: true,
2154
2238
  useAiResponse: true,
2155
2239
  voice: enhancedOptions.ttsVoice,
2156
- format: enhancedOptions.ttsFormat ||
2157
- undefined,
2240
+ provider: enhancedOptions.ttsProvider,
2241
+ format: enhancedOptions.ttsFormat || undefined,
2158
2242
  speed: enhancedOptions.ttsSpeed,
2159
2243
  quality: enhancedOptions.ttsQuality,
2160
2244
  output: enhancedOptions.ttsOutput,
2161
2245
  play: enhancedOptions.ttsPlay,
2162
2246
  }
2163
2247
  : undefined,
2248
+ // STT configuration. m2: shared format helper (was duplicated with
2249
+ // the generate handler; now lives in audioFormatDetector.ts).
2250
+ stt: enhancedOptions.stt
2251
+ ? await (async () => {
2252
+ const streamSttAudioPath = enhancedOptions.inputAudio;
2253
+ // Fail fast on a missing --input-audio so a CLI typo doesn't
2254
+ // turn into a confusing provider/validation error later
2255
+ // (matches the generate path).
2256
+ if (streamSttAudioPath && !fs.existsSync(streamSttAudioPath)) {
2257
+ throw new Error(`--input-audio file not found: ${streamSttAudioPath}`);
2258
+ }
2259
+ const streamSttAudio = streamSttAudioPath
2260
+ ? fs.readFileSync(streamSttAudioPath)
2261
+ : undefined;
2262
+ const { inferAudioFormatFromPath: inferFmt } = await import("../../lib/utils/audioFormatDetector.js");
2263
+ const streamSttFormat = inferFmt(streamSttAudioPath);
2264
+ return {
2265
+ enabled: true,
2266
+ provider: enhancedOptions.sttProvider,
2267
+ language: enhancedOptions.sttLanguage,
2268
+ ...(streamSttAudio && { audio: streamSttAudio }),
2269
+ ...(streamSttFormat && { format: streamSttFormat }),
2270
+ };
2271
+ })()
2272
+ : undefined,
2164
2273
  });
2165
2274
  const stream = await runStream();
2166
2275
  const streamResult = await CLICommandFactory.processStreamWithTimeout(stream, options);
@@ -2277,9 +2386,13 @@ export class CLICommandFactory {
2277
2386
  const isText = (o) => !!o &&
2278
2387
  typeof o === "object" &&
2279
2388
  typeof o.content === "string";
2280
- const isAudio = (o) => !!o &&
2281
- typeof o === "object" &&
2282
- o.type === "audio";
2389
+ const isAudio = (o) => {
2390
+ if (!o || typeof o !== "object") {
2391
+ return false;
2392
+ }
2393
+ const t = o.type;
2394
+ return t === "audio" || t === "tts_audio";
2395
+ };
2283
2396
  const isImage = (o) => {
2284
2397
  if (!o || typeof o !== "object") {
2285
2398
  return false;
@@ -2433,6 +2546,10 @@ export class CLICommandFactory {
2433
2546
  * Handle stdin input for stream command
2434
2547
  */
2435
2548
  static async handleStdinInput(argv) {
2549
+ // STT-only flow: --stt --input-audio <file> with no text prompt is now
2550
+ // valid (the stream pipeline transcribes the audio and uses the result
2551
+ // as the prompt). Skip the stdin/empty-input rejection in that case.
2552
+ const isSttOnly = !!argv.stt && !!argv.inputAudio;
2436
2553
  if (!argv.input && !process.stdin.isTTY) {
2437
2554
  let stdinData = "";
2438
2555
  process.stdin.setEncoding("utf8");
@@ -2441,10 +2558,18 @@ export class CLICommandFactory {
2441
2558
  }
2442
2559
  argv.input = stdinData.trim();
2443
2560
  if (!argv.input) {
2561
+ if (isSttOnly) {
2562
+ argv.input = "";
2563
+ return;
2564
+ }
2444
2565
  throw new Error("No input received from stdin");
2445
2566
  }
2446
2567
  }
2447
2568
  else if (!argv.input) {
2569
+ if (isSttOnly) {
2570
+ argv.input = "";
2571
+ return;
2572
+ }
2448
2573
  throw new Error('Input required. Use: neurolink stream "your prompt" or echo "prompt" | neurolink stream');
2449
2574
  }
2450
2575
  }
@@ -4,4 +4,4 @@ import type { OptionSchema, TextGenerationOptions } from "../../lib/types/index.
4
4
  * This object provides metadata for validation and help text in the CLI loop.
5
5
  * It is derived from the main TextGenerationOptions interface to ensure consistency.
6
6
  */
7
- export declare const textGenerationOptionsSchema: Record<keyof Omit<TextGenerationOptions, "prompt" | "input" | "schema" | "tools" | "context" | "conversationHistory" | "conversationMessages" | "conversationMemoryConfig" | "originalPrompt" | "middleware" | "expectedOutcome" | "evaluationCriteria" | "region" | "csvOptions" | "tts" | "thinkingConfig" | "requestId" | "fileRegistry" | "abortSignal" | "toolFilter" | "excludeTools" | "toolChoice" | "prepareStep" | "credentials">, OptionSchema>;
7
+ export declare const textGenerationOptionsSchema: Record<keyof Omit<TextGenerationOptions, "prompt" | "input" | "schema" | "tools" | "context" | "conversationHistory" | "conversationMessages" | "conversationMemoryConfig" | "originalPrompt" | "middleware" | "expectedOutcome" | "evaluationCriteria" | "region" | "csvOptions" | "tts" | "stt" | "thinkingConfig" | "requestId" | "fileRegistry" | "abortSignal" | "toolFilter" | "excludeTools" | "toolChoice" | "prepareStep" | "credentials">, OptionSchema>;
@@ -6,7 +6,7 @@
6
6
  *
7
7
  * @module cli/utils/audioFileUtils
8
8
  */
9
- import type { TTSResult, AudioSaveResult, AudioFormat } from "../../lib/types/index.js";
9
+ import type { TTSResult, AudioSaveResult, TTSAudioFormat } from "../../lib/types/index.js";
10
10
  /**
11
11
  * Format file size in human-readable format
12
12
  *
@@ -33,7 +33,7 @@ export declare function ensureDirectoryExists(filePath: string): Promise<void>;
33
33
  * @param format - Audio format
34
34
  * @returns File extension (including dot)
35
35
  */
36
- export declare function getAudioExtension(format: AudioFormat): string;
36
+ export declare function getAudioExtension(format: TTSAudioFormat): string;
37
37
  /**
38
38
  * Validate and normalize output path, adding extension if needed
39
39
  *
@@ -41,7 +41,7 @@ export declare function getAudioExtension(format: AudioFormat): string;
41
41
  * @param format - Audio format for extension
42
42
  * @returns Normalized output path
43
43
  */
44
- export declare function normalizeOutputPath(outputPath: string, format?: AudioFormat): string;
44
+ export declare function normalizeOutputPath(outputPath: string, format?: TTSAudioFormat): string;
45
45
  /**
46
46
  * Save TTS audio result to a file
47
47
  *
@@ -71,6 +71,10 @@ export function getAudioExtension(format) {
71
71
  return ".ogg";
72
72
  case "opus":
73
73
  return ".opus";
74
+ case "pcm16":
75
+ // Raw PCM16 (no RIFF/WAV header) — write to .pcm so consumers don't
76
+ // mistake it for a parseable WAV file.
77
+ return ".pcm";
74
78
  default:
75
79
  return ".mp3";
76
80
  }
@@ -86,7 +90,7 @@ export function normalizeOutputPath(outputPath, format = "mp3") {
86
90
  const resolvedPath = resolveOutputPath(outputPath);
87
91
  const ext = path.extname(resolvedPath).toLowerCase();
88
92
  // If no extension or wrong extension, add the correct one
89
- const validExtensions = [".mp3", ".wav", ".ogg", ".opus"];
93
+ const validExtensions = [".mp3", ".wav", ".ogg", ".opus", ".pcm"];
90
94
  if (!ext || !validExtensions.includes(ext)) {
91
95
  return resolvedPath + getAudioExtension(format);
92
96
  }
@@ -249,6 +249,11 @@ export class BaseProvider {
249
249
  excludeTools: options.excludeTools,
250
250
  skipToolPromptInjection: options.skipToolPromptInjection,
251
251
  timeout: options.timeout,
252
+ stt: options.stt,
253
+ // Forward TTS options too — without this, the fake-streaming fallback
254
+ // path silently drops `tts` and the resulting StreamResult never
255
+ // produces a `tts_audio` chunk even when synthesis was requested.
256
+ tts: options.tts,
252
257
  };
253
258
  logger.debug(`Calling generate for fake streaming`, {
254
259
  provider: this.providerName,
@@ -299,6 +304,23 @@ export class BaseProvider {
299
304
  imageOutput: result.imageOutput,
300
305
  };
301
306
  }
307
+ // Yield synthesized audio so callers using stream() with tts.enabled
308
+ // still receive a tts_audio chunk on the fake-streaming fallback
309
+ // path (matches the discriminator used by the real streaming path).
310
+ if (result?.audio) {
311
+ yield {
312
+ type: "tts_audio",
313
+ audio: {
314
+ data: result.audio.buffer,
315
+ format: result.audio.format,
316
+ index: 0,
317
+ isFinal: true,
318
+ cumulativeSize: result.audio.size,
319
+ voice: result.audio.voice,
320
+ sampleRate: result.audio.sampleRate,
321
+ },
322
+ };
323
+ }
302
324
  })(),
303
325
  usage: result?.usage,
304
326
  provider: result?.provider,
@@ -587,7 +609,7 @@ export class BaseProvider {
587
609
  if (!options.tts) {
588
610
  return this.enhanceResult(baseResult, options, startTime);
589
611
  }
590
- baseResult.audio = await TTSProcessor.synthesize(textToSynthesize, options.provider ?? this.providerName, options.tts);
612
+ baseResult.audio = await TTSProcessor.synthesize(textToSynthesize, options.tts.provider ?? options.provider ?? this.providerName, options.tts);
591
613
  }
592
614
  catch (ttsError) {
593
615
  logger.error(`TTS synthesis failed in Mode 1 (direct input synthesis):`, ttsError);
@@ -691,20 +713,21 @@ export class BaseProvider {
691
713
  const { toolsUsed, toolExecutions } = this.extractToolInformation(generateResult);
692
714
  let enhancedResult = this.formatEnhancedResult(generateResult, tools, toolsUsed, toolExecutions, options);
693
715
  enhancedResult = await this.synthesizeAIResponseIfNeeded(enhancedResult, options);
694
- return this.enhanceResult(enhancedResult, options, startTime);
716
+ const finalResult = await this.enhanceResult(enhancedResult, options, startTime);
717
+ return finalResult;
695
718
  }
696
719
  async synthesizeAIResponseIfNeeded(enhancedResult, options) {
697
720
  if (!options.tts?.enabled || !options.tts?.useAiResponse) {
698
721
  return enhancedResult;
699
722
  }
700
723
  const aiResponse = enhancedResult.content;
701
- const provider = options.provider ?? this.providerName;
702
- if (!aiResponse || !provider) {
724
+ const ttsProvider = options.tts?.provider ?? options.provider ?? this.providerName;
725
+ if (!aiResponse || !ttsProvider) {
703
726
  logger.warn(`TTS synthesis skipped despite being enabled`, {
704
727
  provider: this.providerName,
705
728
  hasAiResponse: !!aiResponse,
706
729
  aiResponseLength: aiResponse?.length ?? 0,
707
- hasProvider: !!provider,
730
+ hasProvider: !!ttsProvider,
708
731
  ttsConfig: {
709
732
  enabled: options.tts?.enabled,
710
733
  useAiResponse: options.tts?.useAiResponse,
@@ -716,7 +739,7 @@ export class BaseProvider {
716
739
  return enhancedResult;
717
740
  }
718
741
  try {
719
- const ttsResult = await TTSProcessor.synthesize(aiResponse, provider, options.tts);
742
+ const ttsResult = await TTSProcessor.synthesize(aiResponse, ttsProvider, options.tts);
720
743
  return {
721
744
  ...enhancedResult,
722
745
  audio: ttsResult,
@@ -7,6 +7,20 @@ export declare class ProviderRegistry {
7
7
  private static registered;
8
8
  private static registrationPromise;
9
9
  private static options;
10
+ /**
11
+ * NEW4: per-handler registration outcomes for the realtime voice
12
+ * providers. `"ok"` = registered; any other string = the error message.
13
+ * Empty until the first `registerAllProviders()` call.
14
+ */
15
+ static realtimeRegistration: Record<string, "ok" | string>;
16
+ /**
17
+ * Returns a snapshot of voice provider registration outcomes so callers
18
+ * can detect at runtime which voice handlers are usable. Useful in
19
+ * health-check endpoints and CI startup probes.
20
+ */
21
+ static getRegistrationReport(): {
22
+ realtime: Record<string, "ok" | string>;
23
+ };
10
24
  /**
11
25
  * Register all providers with the factory
12
26
  */
@@ -11,6 +11,20 @@ export class ProviderRegistry {
11
11
  static options = {
12
12
  enableManualMCP: false, // Default to disabled for safety
13
13
  };
14
+ /**
15
+ * NEW4: per-handler registration outcomes for the realtime voice
16
+ * providers. `"ok"` = registered; any other string = the error message.
17
+ * Empty until the first `registerAllProviders()` call.
18
+ */
19
+ static realtimeRegistration = {};
20
+ /**
21
+ * Returns a snapshot of voice provider registration outcomes so callers
22
+ * can detect at runtime which voice handlers are usable. Useful in
23
+ * health-check endpoints and CI startup probes.
24
+ */
25
+ static getRegistrationReport() {
26
+ return { realtime: { ...this.realtimeRegistration } };
27
+ }
14
28
  /**
15
29
  * Register all providers with the factory
16
30
  */
@@ -152,8 +166,7 @@ export class ProviderRegistry {
152
166
  const { LlamaCppProvider } = await import("../providers/llamaCpp.js");
153
167
  return new LlamaCppProvider(modelName, sdk, undefined, llamaCppCreds);
154
168
  }, process.env.LLAMACPP_MODEL || undefined, ["llamacpp", "llama.cpp", "llama-cpp"]);
155
- logger.debug("All providers registered successfully");
156
- this.registered = true;
169
+ logger.debug("All AI providers registered successfully");
157
170
  // ===== TTS HANDLER REGISTRATION =====
158
171
  try {
159
172
  // Create handler instance and register explicitly
@@ -172,6 +185,128 @@ export class ProviderRegistry {
172
185
  });
173
186
  // Don't throw - TTS is optional functionality
174
187
  }
188
+ // New TTS providers
189
+ try {
190
+ const { TTSProcessor } = await import("../utils/ttsProcessor.js");
191
+ const { OpenAITTS } = await import("../voice/providers/OpenAITTS.js");
192
+ TTSProcessor.registerHandler("openai-tts", new OpenAITTS());
193
+ }
194
+ catch (err) {
195
+ logger.debug(`[ProviderRegistry] openai-tts registration skipped: ${err instanceof Error ? err.message : String(err)}`);
196
+ }
197
+ try {
198
+ const { TTSProcessor } = await import("../utils/ttsProcessor.js");
199
+ const { ElevenLabsTTS } = await import("../voice/providers/ElevenLabsTTS.js");
200
+ const elevenLabsHandler = new ElevenLabsTTS();
201
+ TTSProcessor.registerHandler("elevenlabs", elevenLabsHandler);
202
+ TTSProcessor.registerHandler("elevenlabs-tts", elevenLabsHandler);
203
+ }
204
+ catch (err) {
205
+ logger.debug(`[ProviderRegistry] elevenlabs registration skipped: ${err instanceof Error ? err.message : String(err)}`);
206
+ }
207
+ try {
208
+ const { TTSProcessor } = await import("../utils/ttsProcessor.js");
209
+ const { AzureTTS } = await import("../voice/providers/AzureTTS.js");
210
+ TTSProcessor.registerHandler("azure-tts", new AzureTTS());
211
+ }
212
+ catch (err) {
213
+ logger.debug(`[ProviderRegistry] azure-tts registration skipped: ${err instanceof Error ? err.message : String(err)}`);
214
+ }
215
+ // ===== STT HANDLER REGISTRATION =====
216
+ try {
217
+ const { STTProcessor } = await import("../utils/sttProcessor.js");
218
+ try {
219
+ const { OpenAISTT } = await import("../voice/providers/OpenAISTT.js");
220
+ const openAISTT = new OpenAISTT();
221
+ STTProcessor.registerHandler("whisper", openAISTT);
222
+ STTProcessor.registerHandler("openai-stt", openAISTT);
223
+ }
224
+ catch (err) {
225
+ logger.debug(`[ProviderRegistry] whisper/openai-stt registration skipped: ${err instanceof Error ? err.message : String(err)}`);
226
+ }
227
+ try {
228
+ const { DeepgramSTT } = await import("../voice/providers/DeepgramSTT.js");
229
+ STTProcessor.registerHandler("deepgram", new DeepgramSTT());
230
+ }
231
+ catch (err) {
232
+ logger.debug(`[ProviderRegistry] deepgram registration skipped: ${err instanceof Error ? err.message : String(err)}`);
233
+ }
234
+ try {
235
+ const { GoogleSTT } = await import("../voice/providers/GoogleSTT.js");
236
+ STTProcessor.registerHandler("google-stt", new GoogleSTT());
237
+ }
238
+ catch (err) {
239
+ logger.debug(`[ProviderRegistry] google-stt registration skipped: ${err instanceof Error ? err.message : String(err)}`);
240
+ }
241
+ try {
242
+ const { AzureSTT } = await import("../voice/providers/AzureSTT.js");
243
+ STTProcessor.registerHandler("azure-stt", new AzureSTT());
244
+ }
245
+ catch (err) {
246
+ logger.debug(`[ProviderRegistry] azure-stt registration skipped: ${err instanceof Error ? err.message : String(err)}`);
247
+ }
248
+ logger.debug("STT handlers registered successfully", {
249
+ providers: ["whisper", "deepgram", "google-stt", "azure-stt"],
250
+ });
251
+ }
252
+ catch (sttError) {
253
+ logger.warn("Failed to register STT handlers - STT functionality will be unavailable", {
254
+ error: sttError instanceof Error ? sttError.message : String(sttError),
255
+ });
256
+ }
257
+ // ===== REALTIME HANDLER REGISTRATION =====
258
+ try {
259
+ const { RealtimeProcessor } = await import("../voice/RealtimeVoiceAPI.js");
260
+ // M9 + NEW4: track per-handler registration outcomes so the final
261
+ // log accurately reflects which voice providers succeeded vs which
262
+ // were skipped — instead of unconditionally claiming "registered
263
+ // successfully" or hiding failures at debug level.
264
+ const realtimeOutcomes = {};
265
+ try {
266
+ const { OpenAIRealtime } = await import("../voice/providers/OpenAIRealtime.js");
267
+ RealtimeProcessor.registerHandler("openai-realtime", new OpenAIRealtime());
268
+ realtimeOutcomes["openai-realtime"] = "ok";
269
+ }
270
+ catch (err) {
271
+ const msg = err instanceof Error ? err.message : String(err);
272
+ realtimeOutcomes["openai-realtime"] = msg;
273
+ // M9: promote per-handler failures to error level so users can
274
+ // see which shipped voice provider failed to register at startup.
275
+ logger.error(`[ProviderRegistry] openai-realtime registration failed: ${msg}`);
276
+ }
277
+ try {
278
+ const { GeminiLive } = await import("../voice/providers/GeminiLive.js");
279
+ RealtimeProcessor.registerHandler("gemini-live", new GeminiLive());
280
+ realtimeOutcomes["gemini-live"] = "ok";
281
+ }
282
+ catch (err) {
283
+ const msg = err instanceof Error ? err.message : String(err);
284
+ realtimeOutcomes["gemini-live"] = msg;
285
+ logger.error(`[ProviderRegistry] gemini-live registration failed: ${msg}`);
286
+ }
287
+ // NEW4: report the actual per-handler outcomes instead of an
288
+ // unconditional success log. Stored on the registry so callers can
289
+ // introspect via getRegistrationReport().
290
+ ProviderRegistry.realtimeRegistration = realtimeOutcomes;
291
+ const skipped = Object.entries(realtimeOutcomes).filter(([, v]) => v !== "ok");
292
+ if (skipped.length === 0) {
293
+ logger.info("[ProviderRegistry] Realtime handlers registered: openai-realtime, gemini-live");
294
+ }
295
+ else {
296
+ logger.warn(`[ProviderRegistry] Realtime handlers partial: ${skipped.length} skipped`, { outcomes: realtimeOutcomes });
297
+ }
298
+ }
299
+ catch (realtimeError) {
300
+ logger.warn("Failed to register Realtime handlers - Realtime functionality will be unavailable", {
301
+ error: realtimeError instanceof Error
302
+ ? realtimeError.message
303
+ : String(realtimeError),
304
+ });
305
+ }
306
+ // Mark registered ONLY after all blocks (AI + voice) attempted, so a
307
+ // subsequent registerAllProviders() call does not short-circuit when an
308
+ // optional handler block silently failed.
309
+ this.registered = true;
175
310
  }
176
311
  catch (error) {
177
312
  logger.error("Failed to register providers:", error);
@@ -191,6 +326,10 @@ export class ProviderRegistry {
191
326
  ProviderFactory.clearRegistrations();
192
327
  this.registered = false;
193
328
  this.registrationPromise = null;
329
+ // Reset realtime registration too — otherwise getRegistrationReport()
330
+ // can surface stale data from a previous run if the realtime block
331
+ // failed before reaching `realtimeRegistration = realtimeOutcomes`.
332
+ ProviderRegistry.realtimeRegistration = {};
194
333
  }
195
334
  /**
196
335
  * Set registry options (should be called before initialization)