@everworker/oneringai 0.4.6 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -18,6 +18,7 @@ var spawn = require('cross-spawn');
18
18
  var process2 = require('process');
19
19
  var stream = require('stream');
20
20
  var fs17 = require('fs/promises');
21
+ var events = require('events');
21
22
  var simpleIcons = require('simple-icons');
22
23
  var child_process = require('child_process');
23
24
  var util = require('util');
@@ -12774,6 +12775,9 @@ var MODEL_REGISTRY = {
12774
12775
  video: false,
12775
12776
  batchAPI: true,
12776
12777
  promptCaching: true,
12778
+ parameters: {
12779
+ temperature: false
12780
+ },
12777
12781
  input: {
12778
12782
  tokens: 128e3,
12779
12783
  text: true,
@@ -20931,6 +20935,9 @@ var StreamEventType = /* @__PURE__ */ ((StreamEventType2) => {
20931
20935
  StreamEventType2["REASONING_DONE"] = "response.reasoning.done";
20932
20936
  StreamEventType2["RESPONSE_COMPLETE"] = "response.complete";
20933
20937
  StreamEventType2["ERROR"] = "response.error";
20938
+ StreamEventType2["AUDIO_CHUNK_READY"] = "response.audio_chunk.ready";
20939
+ StreamEventType2["AUDIO_CHUNK_ERROR"] = "response.audio_chunk.error";
20940
+ StreamEventType2["AUDIO_STREAM_COMPLETE"] = "response.audio_stream.complete";
20934
20941
  return StreamEventType2;
20935
20942
  })(StreamEventType || {});
20936
20943
  function isStreamEvent(event, type) {
@@ -20960,6 +20967,15 @@ function isResponseComplete(event) {
20960
20967
  function isErrorEvent(event) {
20961
20968
  return event.type === "response.error" /* ERROR */;
20962
20969
  }
20970
/** Type guard: reports whether `event.type` equals "response.audio_chunk.ready" (StreamEventType.AUDIO_CHUNK_READY). */
+ function isAudioChunkReady(event) {
20971
+ return event.type === "response.audio_chunk.ready" /* AUDIO_CHUNK_READY */;
20972
+ }
20973
/** Type guard: reports whether `event.type` equals "response.audio_chunk.error" (StreamEventType.AUDIO_CHUNK_ERROR). */
+ function isAudioChunkError(event) {
20974
+ return event.type === "response.audio_chunk.error" /* AUDIO_CHUNK_ERROR */;
20975
+ }
20976
/** Type guard: reports whether `event.type` equals "response.audio_stream.complete" (StreamEventType.AUDIO_STREAM_COMPLETE). */
+ function isAudioStreamComplete(event) {
20977
+ return event.type === "response.audio_stream.complete" /* AUDIO_STREAM_COMPLETE */;
20978
+ }
20963
20979
 
20964
20980
  // src/infrastructure/providers/openai/OpenAIResponsesStreamConverter.ts
20965
20981
  var OpenAIResponsesStreamConverter = class {
@@ -34267,6 +34283,56 @@ var OpenAITTSProvider = class extends BaseMediaProvider {
34267
34283
  { model: options.model, voice: options.voice }
34268
34284
  );
34269
34285
  }
34286
+ /**
34287
+ * Check if streaming is supported for the given format.
+ * A missing (falsy) format is reported as supported; otherwise the format must be
+ * one of "pcm", "wav", "mp3", "opus", "aac" or "flac".
+ * @param format - requested audio format; may be omitted
+ * @returns true when the format is absent or in the list above, false otherwise
34288
+ */
34289
+ supportsStreaming(format) {
34290
+ if (!format) return true;
34291
+ return ["pcm", "wav", "mp3", "opus", "aac", "flac"].includes(format);
34292
+ }
34293
+ /**
34294
+ * Stream TTS audio chunks as they arrive from the API
34295
+ */
34296
+ async *synthesizeStream(options) {
34297
+ const format = this.mapFormat(options.format);
34298
+ const requestParams = {
34299
+ model: options.model,
34300
+ input: options.input,
34301
+ voice: options.voice,
34302
+ response_format: format,
34303
+ speed: options.speed
34304
+ };
34305
+ if (options.vendorOptions?.instructions) {
34306
+ requestParams.instructions = options.vendorOptions.instructions;
34307
+ }
34308
+ this.logOperationStart("tts.synthesizeStream", {
34309
+ model: options.model,
34310
+ voice: options.voice,
34311
+ inputLength: options.input.length,
34312
+ format
34313
+ });
34314
+ try {
34315
+ const response = await this.client.audio.speech.create(requestParams);
34316
+ const body = response.body;
34317
+ if (!body) {
34318
+ throw new Error("No response body from OpenAI TTS API");
34319
+ }
34320
+ let totalBytes = 0;
34321
+ for await (const chunk of body) {
34322
+ const buf = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
34323
+ totalBytes += buf.length;
34324
+ yield { audio: buf, isFinal: false };
34325
+ }
34326
+ yield { audio: Buffer.alloc(0), isFinal: true };
34327
+ this.logOperationComplete("tts.synthesizeStream", {
34328
+ model: options.model,
34329
+ totalBytes
34330
+ });
34331
+ } catch (error) {
34332
+ this.handleError(error);
34333
+ throw error;
34334
+ }
34335
+ }
34270
34336
  /**
34271
34337
  * List available voices (returns static list for OpenAI)
34272
34338
  */
@@ -35009,6 +35075,35 @@ var TextToSpeech = class _TextToSpeech {
35009
35075
  const response = await this.synthesize(text, options);
35010
35076
  await fs17__namespace.writeFile(filePath, response.audio);
35011
35077
  }
35078
+ // ======================== Streaming Methods ========================
35079
+ /**
35080
+ * Check if the underlying provider supports streaming TTS.
+ * Yields false when the provider has no `supportsStreaming` function; otherwise
+ * returns whatever `provider.supportsStreaming(format)` returns.
+ * Note: this does not check that the provider also implements `synthesizeStream`.
+ * @param format - audio format forwarded to the provider's check
35081
+ */
35082
+ supportsStreaming(format) {
35083
+ const provider = this.provider;
35084
+ return typeof provider.supportsStreaming === "function" && provider.supportsStreaming(format);
35085
+ }
35086
+ /**
35087
+ * Stream TTS audio chunks as they arrive from the API.
35088
+ * Falls back to buffered synthesis yielding a single chunk if provider doesn't support streaming.
+ *
+ * Per-call `options` (voice, format, speed) take precedence over `this.config`;
+ * model and voice fall back further to getDefaultModel() / getDefaultVoice().
+ * The model always comes from config/default, never from `options`.
+ * @param text - text to synthesize (sent as `input`)
+ * @param options - optional overrides: voice, format, speed, vendorOptions
+ * @yields chunks of the shape `{ audio, isFinal }`
35089
+ */
35090
+ async *synthesizeStream(text, options) {
35091
+ const fullOptions = {
35092
+ model: this.config.model ?? this.getDefaultModel(),
35093
+ input: text,
35094
+ voice: options?.voice ?? this.config.voice ?? this.getDefaultVoice(),
35095
+ format: options?.format ?? this.config.format,
35096
+ speed: options?.speed ?? this.config.speed,
35097
+ vendorOptions: options?.vendorOptions
35098
+ };
35099
+ const provider = this.provider;
/* Stream only when the provider both implements synthesizeStream and reports the format as streamable. */
35100
+ if (typeof provider.synthesizeStream === "function" && provider.supportsStreaming?.(fullOptions.format)) {
35101
+ yield* provider.synthesizeStream(fullOptions);
35102
+ } else {
/* Fallback: one buffered synthesis, delivered as a single final chunk. */
35103
+ const response = await this.provider.synthesize(fullOptions);
35104
+ yield { audio: response.audio, isFinal: true };
35105
+ }
35106
+ }
35012
35107
  // ======================== Introspection Methods ========================
35013
35108
  /**
35014
35109
  * Get model information for current or specified model
@@ -38713,6 +38808,694 @@ var VideoGeneration = class _VideoGeneration {
38713
38808
  }
38714
38809
  };
38715
38810
 
38811
+ // src/capabilities/speech/SentenceSplitter.ts
38812
/** Lower-cased abbreviations whose trailing period must not end a sentence. */
var DEFAULT_ABBREVIATIONS = /* @__PURE__ */ new Set([
  "dr.",
  "mr.",
  "mrs.",
  "ms.",
  "prof.",
  "sr.",
  "jr.",
  "st.",
  "ave.",
  "blvd.",
  "rd.",
  "u.s.",
  "u.k.",
  "u.s.a.",
  "u.n.",
  "e.g.",
  "i.e.",
  "etc.",
  "vs.",
  "viz.",
  "approx.",
  "dept.",
  "est.",
  "inc.",
  "ltd.",
  "corp.",
  "no.",
  "vol.",
  "rev.",
  "gen.",
  "gov.",
  "jan.",
  "feb.",
  "mar.",
  "apr.",
  "jun.",
  "jul.",
  "aug.",
  "sep.",
  "oct.",
  "nov.",
  "dec.",
  "fig.",
  "eq.",
  "ref.",
  "sec.",
  "ch.",
  "min.",
  "max.",
  "avg."
]);
/** Defaults merged under the options passed to SentenceChunkingStrategy. */
var DEFAULT_OPTIONS = {
  minChunkLength: 20,
  maxChunkLength: 500,
  skipCodeBlocks: true,
  stripMarkdown: true,
  additionalAbbreviations: []
};
/**
 * Incrementally splits streamed text into speakable chunks.
 *
 * Text is fed in arbitrary deltas via `feed()`, which returns the chunks that
 * became complete (paragraphs, sentences, or forced splits of over-long text).
 * `flush()` returns whatever is left at the end of the stream.
 * With `skipCodeBlocks`, fenced code blocks are dropped; with `stripMarkdown`,
 * markdown syntax is removed from every emitted chunk.
 */
var SentenceChunkingStrategy = class {
  /** Spoken text received so far that has not been emitted as a chunk yet. */
  buffer = "";
  /** True while the stream is inside a fenced code block. */
  inCodeBlock = false;
  /** Kept for shape compatibility; code block content is discarded, not stored. */
  codeBlockBuffer = "";
  /** True while the remainder of a fence line (e.g. the language tag) is being skipped. */
  skippingFenceLine = false;
  /** One or two trailing backticks held back because the next delta may complete a fence. */
  fenceCarry = "";
  options;
  abbreviations;
  /**
   * @param options - partial overrides for DEFAULT_OPTIONS
   */
  constructor(options) {
    this.options = { ...DEFAULT_OPTIONS, ...options };
    this.abbreviations = /* @__PURE__ */ new Set([
      ...DEFAULT_ABBREVIATIONS,
      ...this.options.additionalAbbreviations.map((a) => a.toLowerCase())
    ]);
  }
  /**
   * Append a text delta and return every chunk that is now complete.
   * @param delta - newly received text
   * @returns array of cleaned chunks (possibly empty)
   */
  feed(delta) {
    // Only the NEW text is run through the code-fence filter. Re-scanning the
    // retained buffer while a fence is open would misclassify already accepted
    // spoken text as code and drop it.
    this.buffer += this.options.skipCodeBlocks ? this.stripCodeBlocks(delta) : delta;
    return this.extractChunks();
  }
  /**
   * End of stream: return the remaining text (cleaned) or null when empty.
   * An unterminated code block is discarded.
   */
  flush() {
    // Held-back backticks never became a fence, so they are ordinary text
    // unless we are inside a code block, where they belong to the code.
    const heldBack = this.inCodeBlock ? "" : this.fenceCarry;
    const text = this.cleanForSpeech((this.buffer + heldBack).trim());
    this.reset();
    return text.length > 0 ? text : null;
  }
  /** Drop all buffered text and code-fence state. */
  reset() {
    this.buffer = "";
    this.inCodeBlock = false;
    this.codeBlockBuffer = "";
    this.skippingFenceLine = false;
    this.fenceCarry = "";
  }
  // ======================== Private Methods ========================
  /**
   * Pull complete paragraphs, then complete sentences, then forced splits of
   * over-long text out of the buffer.
   */
  extractChunks() {
    const chunks = [];
    const collect = (raw) => {
      const cleaned = this.cleanForSpeech(raw.trim());
      if (cleaned.length > 0) {
        chunks.push(cleaned);
      }
    };
    // 1. Paragraph breaks always end a chunk.
    let paragraphIdx = this.buffer.indexOf("\n\n");
    while (paragraphIdx !== -1) {
      const paragraph = this.buffer.slice(0, paragraphIdx);
      this.buffer = this.buffer.slice(paragraphIdx + 2);
      collect(paragraph);
      paragraphIdx = this.buffer.indexOf("\n\n");
    }
    // 2. Sentence boundaries.
    let sentenceEnd = this.findSentenceBoundary();
    while (sentenceEnd !== -1) {
      const sentence = this.buffer.slice(0, sentenceEnd);
      this.buffer = this.buffer.slice(sentenceEnd).trimStart();
      collect(sentence);
      sentenceEnd = this.findSentenceBoundary();
    }
    // 3. Text that grew past the maximum is split; the last piece stays buffered.
    if (this.buffer.length > this.options.maxChunkLength) {
      const pieces = this.splitLongText(this.buffer);
      this.buffer = pieces.pop() ?? "";
      for (const piece of pieces) {
        collect(piece);
      }
    }
    return this.mergeSmallChunks(chunks);
  }
  /**
   * Remove fenced code blocks from a text delta, tracking fence state across
   * calls. Text inside code blocks is discarded (not spoken), as is the rest
   * of every fence line (language tag).
   * @param delta - newly received text
   * @returns the part of the delta that should be spoken
   */
  stripCodeBlocks(delta) {
    const fence = "```";
    const text = this.fenceCarry + delta;
    this.fenceCarry = "";
    let spoken = "";
    let idx = 0;
    while (idx < text.length) {
      if (this.skippingFenceLine) {
        const newline = text.indexOf("\n", idx);
        if (newline === -1) return spoken;
        this.skippingFenceLine = false;
        idx = newline + 1;
        continue;
      }
      if (this.inCodeBlock) {
        const close = text.indexOf(fence, idx);
        if (close === -1) {
          // The closing fence may be split across deltas: keep trailing backticks.
          let tail = text.length;
          while (tail > idx && text[tail - 1] === "`") tail--;
          this.fenceCarry = text.slice(tail);
          return spoken;
        }
        this.inCodeBlock = false;
        this.skippingFenceLine = true;
        idx = close + fence.length;
        continue;
      }
      const tick = text.indexOf("`", idx);
      if (tick === -1) {
        spoken += text.slice(idx);
        break;
      }
      spoken += text.slice(idx, tick);
      if (text.startsWith(fence, tick)) {
        this.inCodeBlock = true;
        this.skippingFenceLine = true;
        idx = tick + fence.length;
        continue;
      }
      // One or two backticks: inline code, unless they end the delta, in which
      // case the next delta may turn them into a fence.
      let runEnd = tick;
      while (runEnd < text.length && text[runEnd] === "`") runEnd++;
      if (runEnd === text.length) {
        this.fenceCarry = text.slice(tick);
        break;
      }
      spoken += text.slice(tick, runEnd);
      idx = runEnd;
    }
    return spoken;
  }
  /**
   * Find the position right after the next sentence boundary.
   * Returns -1 if no complete sentence boundary found.
   */
  findSentenceBoundary() {
    const terminators = [".", "?", "!"];
    for (let i = 0; i < this.buffer.length; i++) {
      const ch = this.buffer.charAt(i);
      if (!terminators.includes(ch)) continue;
      // A terminator at the very end may still be continued by the next delta.
      if (i + 1 >= this.buffer.length) return -1;
      const nextChar = this.buffer[i + 1];
      if (nextChar !== " " && nextChar !== "\n" && nextChar !== "\r" && nextChar !== "\t") {
        continue;
      }
      if (ch === ".") {
        if (this.isAbbreviation(i)) continue;
        if (this.isDecimalNumber(i)) continue;
        if (this.isEllipsis(i)) continue;
      }
      // Too short to be worth a TTS call: keep looking for a later boundary.
      const candidate = this.buffer.slice(0, i + 1).trim();
      if (candidate.length < this.options.minChunkLength) continue;
      return i + 1;
    }
    return -1;
  }
  /**
   * Check if the period at position `pos` is part of a known abbreviation.
   */
  isAbbreviation(pos) {
    let wordStart = pos - 1;
    while (wordStart >= 0 && this.buffer[wordStart] !== " " && this.buffer[wordStart] !== "\n") {
      wordStart--;
    }
    wordStart++;
    const word = this.buffer.slice(wordStart, pos + 1).toLowerCase();
    return this.abbreviations.has(word);
  }
  /**
   * Check if the period at position `pos` is a decimal point.
   * e.g., 3.14, $1.50
   */
  isDecimalNumber(pos) {
    if (pos === 0 || pos + 1 >= this.buffer.length) return false;
    const before = this.buffer.charAt(pos - 1);
    const after = this.buffer.charAt(pos + 1);
    return /\d/.test(before) && /\d/.test(after);
  }
  /**
   * Check if the period at position `pos` is part of an ellipsis (...).
   */
  isEllipsis(pos) {
    if (pos >= 2 && this.buffer[pos - 1] === "." && this.buffer[pos - 2] === ".") return true;
    if (pos + 1 < this.buffer.length && this.buffer[pos + 1] === ".") return true;
    return false;
  }
  /**
   * Split text that exceeds maxChunkLength at clause boundaries.
   * Falls back to the last space, then to a hard cut at the maximum.
   * @returns the pieces in order; the last one is at most the maximum length
   */
  splitLongText(text) {
    // A maximum below 1 could never make progress; clamp so the loop terminates.
    const max = Math.max(1, this.options.maxChunkLength);
    const min = this.options.minChunkLength;
    const clauseBreaks = [",", ";", ":", " \u2014", " \u2013", " -"];
    const chunks = [];
    let remaining = text;
    while (remaining.length > max) {
      const searchRegion = remaining.slice(0, max);
      let splitPos = -1;
      for (const brk of clauseBreaks) {
        const lastPos = searchRegion.lastIndexOf(brk);
        if (lastPos > min) {
          splitPos = lastPos + brk.length;
          break;
        }
      }
      if (splitPos === -1) {
        splitPos = searchRegion.lastIndexOf(" ");
        if (splitPos <= 0 || splitPos <= min) {
          splitPos = max;
        }
      }
      chunks.push(remaining.slice(0, splitPos));
      remaining = remaining.slice(splitPos);
    }
    chunks.push(remaining);
    return chunks;
  }
  /**
   * Merge chunks that are shorter than minChunkLength with the next chunk.
   * A short trailing chunk is appended to the previous one.
   */
  mergeSmallChunks(chunks) {
    if (chunks.length <= 1) return chunks;
    const merged = [];
    let accumulator = "";
    for (const chunk of chunks) {
      if (accumulator.length > 0) {
        accumulator += " " + chunk;
      } else {
        accumulator = chunk;
      }
      if (accumulator.length >= this.options.minChunkLength) {
        merged.push(accumulator);
        accumulator = "";
      }
    }
    if (accumulator.length > 0) {
      if (merged.length > 0) {
        merged[merged.length - 1] += " " + accumulator;
      } else {
        merged.push(accumulator);
      }
    }
    return merged;
  }
  /**
   * Strip markdown formatting from text for natural speech.
   * Returns the text unchanged when `stripMarkdown` is disabled.
   */
  cleanForSpeech(text) {
    if (!this.options.stripMarkdown) return text;
    let cleaned = text;
    cleaned = cleaned.replace(/`([^`]+)`/g, "$1");
    cleaned = cleaned.replace(/\*\*([^*]+)\*\*/g, "$1");
    cleaned = cleaned.replace(/__([^_]+)__/g, "$1");
    cleaned = cleaned.replace(/(?<!\*)\*([^*]+)\*(?!\*)/g, "$1");
    cleaned = cleaned.replace(/(?<!_)_([^_]+)_(?!_)/g, "$1");
    cleaned = cleaned.replace(/~~([^~]+)~~/g, "$1");
    // Images must be removed before links: the link pattern also matches the
    // "[alt](url)" part of an image and would leave "!alt" behind.
    cleaned = cleaned.replace(/!\[([^\]]*)\]\([^)]+\)/g, "");
    cleaned = cleaned.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1");
    cleaned = cleaned.replace(/^#{1,6}\s+/gm, "");
    cleaned = cleaned.replace(/^[-*+]\s+/gm, "");
    cleaned = cleaned.replace(/^\d+\.\s+/gm, "");
    cleaned = cleaned.replace(/^>\s+/gm, "");
    cleaned = cleaned.replace(/^[-*_]{3,}\s*$/gm, "");
    cleaned = cleaned.replace(/\n+/g, " ");
    cleaned = cleaned.replace(/\s{2,}/g, " ");
    return cleaned.trim();
  }
};
39113
+
39114
+ // src/capabilities/speech/VoiceStream.ts
39115
/**
 * Turns an agent text stream into a stream that also carries synthesized audio.
 *
 * Text deltas are fed to a chunking strategy; every completed chunk is sent to
 * TTS (bounded by `maxConcurrentTTS` parallel requests and `maxQueuedChunks`
 * outstanding jobs) and the resulting audio events are interleaved with the
 * original events yielded by `wrap()`.
 *
 * Emits: "audio:ready", "audio:error", "audio:complete", "audio:interrupted".
 */
var VoiceStream = class _VoiceStream extends events.EventEmitter {
  tts;
  chunker;
  format;
  speed;
  maxConcurrentTTS;
  maxQueuedChunks;
  vendorOptions;
  streaming;
  // Pipeline state
  chunkIndex = 0;
  totalCharacters = 0;
  totalDuration = 0;
  activeJobs = /* @__PURE__ */ new Map();
  activeTTSCount = 0;
  interrupted = false;
  lastResponseId = "";
  _isDestroyed = false;
  // Bumped by interrupt()/reset(). A TTS job started under an older generation
  // is stale: it must not publish events nor touch the counters of the new one.
  generation = 0;
  // Semaphore for TTS concurrency control
  slotWaiters = [];
  // Audio event buffer for interleaving with text events
  audioEventBuffer = [];
  // Async notification: resolves when new events are pushed to audioEventBuffer
  bufferNotify = null;
  // Queue backpressure
  queueWaiters = [];
  /**
   * Create a new VoiceStream instance
   */
  static create(config) {
    return new _VoiceStream(config);
  }
  /**
   * @param config - reads ttsConnector, ttsModel, voice, chunkingStrategy,
   *   chunkingOptions, format ("mp3" by default), speed (1), maxConcurrentTTS (2),
   *   maxQueuedChunks (5), vendorOptions and streaming (false)
   */
  constructor(config) {
    super();
    this.tts = TextToSpeech.create({
      connector: config.ttsConnector,
      model: config.ttsModel,
      voice: config.voice
    });
    this.chunker = config.chunkingStrategy ?? new SentenceChunkingStrategy(config.chunkingOptions);
    this.format = config.format ?? "mp3";
    this.speed = config.speed ?? 1;
    this.maxConcurrentTTS = config.maxConcurrentTTS ?? 2;
    this.maxQueuedChunks = config.maxQueuedChunks ?? 5;
    this.vendorOptions = config.vendorOptions;
    this.streaming = config.streaming ?? false;
  }
  // ======================== Public API ========================
  /**
   * Transform an agent text stream into an augmented stream with audio events.
   * Original text events pass through unchanged; audio events are interleaved.
   *
   * The generator yields events in this order:
   * 1. All original StreamEvents (pass-through)
   * 2. AudioChunkReady/AudioChunkError events as TTS completes
   * 3. AudioStreamComplete as the final audio event (only if at least one
   *    chunk was scheduled)
   */
  async *wrap(textStream) {
    this.reset();
    try {
      for await (const event of textStream) {
        yield event;
        if (event.response_id) {
          this.lastResponseId = event.response_id;
        }
        if (event.type === "response.output_text.delta" /* OUTPUT_TEXT_DELTA */ && !this.interrupted) {
          const completedChunks = this.chunker.feed(event.delta);
          for (const chunk of completedChunks) {
            await this.scheduleTTS(chunk);
          }
        }
        if ((event.type === "response.output_text.done" /* OUTPUT_TEXT_DONE */ || event.type === "response.complete" /* RESPONSE_COMPLETE */) && !this.interrupted) {
          const remaining = this.chunker.flush();
          if (remaining) {
            await this.scheduleTTS(remaining);
          }
        }
        yield* this.drainAudioBuffer();
      }
      // Text is finished: keep yielding audio until every job has settled.
      while (this.activeJobs.size > 0 || this.audioEventBuffer.length > 0) {
        if (this.audioEventBuffer.length === 0) {
          await Promise.race([
            this.waitForBufferNotify(),
            ...Array.from(this.activeJobs.values()).map((j) => j.promise)
          ]);
        }
        yield* this.drainAudioBuffer();
      }
      if (this.chunkIndex > 0) {
        const completeEvent = {
          type: "response.audio_stream.complete" /* AUDIO_STREAM_COMPLETE */,
          response_id: this.lastResponseId,
          total_chunks: this.chunkIndex,
          total_characters: this.totalCharacters,
          total_duration_seconds: this.totalDuration > 0 ? this.totalDuration : void 0
        };
        yield completeEvent;
        this.emit("audio:complete", {
          totalChunks: this.chunkIndex,
          totalDurationSeconds: this.totalDuration > 0 ? this.totalDuration : void 0
        });
      }
    } finally {
      this.cleanup();
    }
  }
  /**
   * Interrupt audio generation. Cancels pending TTS and flushes queue.
   * Call this when the user sends a new message mid-speech.
   * Active HTTP requests cannot be cancelled but their results will be discarded.
   */
  interrupt() {
    this.interrupted = true;
    this.generation++;
    const pendingCount = this.activeJobs.size;
    this.activeJobs.clear();
    // Safe to zero: in-flight jobs belong to the previous generation and will
    // not decrement this counter when they finish.
    this.activeTTSCount = 0;
    this.audioEventBuffer = [];
    this.releaseAllWaiters();
    this.chunker.reset();
    this.emit("audio:interrupted", { pendingChunks: pendingCount });
  }
  /**
   * Reset state for a new stream. Called automatically by wrap().
   */
  reset() {
    this.generation++;
    // Wake jobs still parked from an earlier stream so they can notice they
    // are stale and exit instead of waiting forever.
    this.releaseAllWaiters();
    this.chunkIndex = 0;
    this.totalCharacters = 0;
    this.totalDuration = 0;
    this.activeJobs.clear();
    this.activeTTSCount = 0;
    this.interrupted = false;
    this.lastResponseId = "";
    this.audioEventBuffer = [];
    this.bufferNotify = null;
    this.slotWaiters = [];
    this.queueWaiters = [];
    this.chunker.reset();
  }
  /** Interrupt, mark the instance destroyed and remove all event listeners. */
  destroy() {
    this.interrupt();
    this._isDestroyed = true;
    this.removeAllListeners();
  }
  get isDestroyed() {
    return this._isDestroyed;
  }
  // ======================== Private Methods ========================
  /**
   * Schedule a text chunk for TTS synthesis.
   * Awaits a free queue slot if backpressure is active (lossless).
   */
  async scheduleTTS(text) {
    if (this.interrupted || this._isDestroyed) return;
    const cleanText = text.trim();
    if (cleanText.length === 0) return;
    while (this.activeJobs.size >= this.maxQueuedChunks && !this.interrupted) {
      await this.waitForQueueSlot();
    }
    if (this.interrupted) return;
    const index = this.chunkIndex++;
    this.totalCharacters += cleanText.length;
    const job = {
      index,
      text: cleanText,
      promise: this.executeTTS(index, cleanText)
    };
    this.activeJobs.set(index, job);
    job.promise.finally(() => {
      // Indices restart at 0 for every stream: only remove the entry if it is
      // still this job, not a newer job that reuses the index.
      if (this.activeJobs.get(index) === job) {
        this.activeJobs.delete(index);
      }
      this.releaseQueueWaiter();
    });
  }
  /**
   * Execute TTS for a single text chunk.
   * Respects concurrency semaphore.
   * Branches on streaming mode: yields sub-chunks or a single buffered chunk.
   * Failures are reported as AudioChunkError events, not thrown.
   */
  async executeTTS(index, text) {
    const generation = this.generation;
    const isStale = () => this.interrupted || generation !== this.generation;
    while (!isStale() && this.activeTTSCount >= this.maxConcurrentTTS) {
      await this.waitForTTSSlot();
    }
    if (isStale()) return;
    this.activeTTSCount++;
    try {
      if (this.streaming && this.tts.supportsStreaming(this.format)) {
        await this.synthesizeStreamed(index, text, isStale);
      } else {
        await this.synthesizeBuffered(index, text, isStale);
      }
    } catch (error) {
      if (isStale()) return;
      const errorEvent = {
        type: "response.audio_chunk.error" /* AUDIO_CHUNK_ERROR */,
        response_id: this.lastResponseId,
        chunk_index: index,
        text,
        error: error instanceof Error ? error.message : String(error)
      };
      this.pushAudioEvent(errorEvent);
      this.emit("audio:error", {
        chunkIndex: index,
        text,
        error
      });
    } finally {
      // A stale job's slot was already written off by interrupt()/reset();
      // decrementing again would push the counter below zero.
      if (generation === this.generation) {
        this.activeTTSCount--;
        this.releaseTTSSlot();
      }
    }
  }
  /**
   * Streaming branch: forwards audio as sub-chunks of at least MIN_BUFFER_BYTES.
   * "mp3" is requested as "pcm" from the TTS layer.
   */
  async synthesizeStreamed(index, text, isStale) {
    const MIN_BUFFER_BYTES = 6e3;
    const streamFormat = this.format === "mp3" ? "pcm" : this.format;
    const pendingBuffers = [];
    let pendingSize = 0;
    let subIndex = 0;
    const flushPending = () => {
      if (pendingSize === 0) return;
      const merged = Buffer.concat(pendingBuffers, pendingSize);
      pendingBuffers.length = 0;
      pendingSize = 0;
      const currentSubIndex = subIndex++;
      const audioEvent = {
        type: "response.audio_chunk.ready" /* AUDIO_CHUNK_READY */,
        response_id: this.lastResponseId,
        chunk_index: index,
        sub_index: currentSubIndex,
        // The text is attached to the first sub-chunk only.
        text: currentSubIndex === 0 ? text : "",
        audio_base64: merged.toString("base64"),
        format: streamFormat
      };
      this.pushAudioEvent(audioEvent);
    };
    const audioStream = this.tts.synthesizeStream(text, {
      format: streamFormat,
      speed: this.speed,
      vendorOptions: this.vendorOptions
    });
    for await (const chunk of audioStream) {
      if (isStale()) return;
      if (chunk.audio.length > 0) {
        pendingBuffers.push(chunk.audio);
        pendingSize += chunk.audio.length;
        if (pendingSize >= MIN_BUFFER_BYTES) {
          flushPending();
        }
      }
      if (chunk.isFinal) {
        break;
      }
    }
    flushPending();
    this.emit("audio:ready", { chunkIndex: index, text });
  }
  /**
   * Buffered branch: one synthesize() call, one AudioChunkReady event.
   */
  async synthesizeBuffered(index, text, isStale) {
    const response = await this.tts.synthesize(text, {
      format: this.format,
      speed: this.speed,
      vendorOptions: this.vendorOptions
    });
    if (isStale()) return;
    if (response.durationSeconds) {
      this.totalDuration += response.durationSeconds;
    }
    const audioEvent = {
      type: "response.audio_chunk.ready" /* AUDIO_CHUNK_READY */,
      response_id: this.lastResponseId,
      chunk_index: index,
      text,
      audio_base64: response.audio.toString("base64"),
      format: response.format,
      duration_seconds: response.durationSeconds,
      characters_used: response.charactersUsed
    };
    this.pushAudioEvent(audioEvent);
    this.emit("audio:ready", {
      chunkIndex: index,
      text,
      durationSeconds: response.durationSeconds
    });
  }
  /**
   * Drain the audio event buffer, yielding all ready events.
   */
  *drainAudioBuffer() {
    while (this.audioEventBuffer.length > 0) {
      yield this.audioEventBuffer.shift();
    }
  }
  // ======================== Buffer Notification ========================
  /**
   * Push an audio event and wake up the consumer in wrap()
   */
  pushAudioEvent(event) {
    this.audioEventBuffer.push(event);
    if (this.bufferNotify) {
      this.bufferNotify();
      this.bufferNotify = null;
    }
  }
  /**
   * Wait until a new event is pushed to the audio buffer
   */
  waitForBufferNotify() {
    return new Promise((resolve4) => {
      this.bufferNotify = resolve4;
    });
  }
  // ======================== Semaphore / Backpressure ========================
  waitForTTSSlot() {
    return new Promise((resolve4) => {
      this.slotWaiters.push(resolve4);
    });
  }
  releaseTTSSlot() {
    const waiter = this.slotWaiters.shift();
    if (waiter) waiter();
  }
  waitForQueueSlot() {
    return new Promise((resolve4) => {
      this.queueWaiters.push(resolve4);
    });
  }
  releaseQueueWaiter() {
    const waiter = this.queueWaiters.shift();
    if (waiter) waiter();
  }
  /** Resolve every parked waiter (TTS slot, queue slot, buffer notification). */
  releaseAllWaiters() {
    for (const waiter of this.slotWaiters) waiter();
    this.slotWaiters = [];
    for (const waiter of this.queueWaiters) waiter();
    this.queueWaiters = [];
    if (this.bufferNotify) {
      this.bufferNotify();
      this.bufferNotify = null;
    }
  }
  cleanup() {
    this.releaseAllWaiters();
  }
};
39451
+
39452
+ // src/capabilities/speech/AudioPlaybackQueue.ts
39453
/**
 * Re-orders audio chunk events so the callback receives them by ascending
 * `chunk_index`, regardless of the order in which TTS finished.
 *
 * Events may carry a `sub_index` (several events per chunk). Events for the
 * same chunk are kept together and delivered in arrival order.
 * Limitation: the queue cannot know when a chunk has received its last
 * sub-chunk, so a continuation that arrives after a later chunk has already
 * been released is delivered immediately rather than in strict order.
 */
var AudioPlaybackQueue = class {
  /** chunk_index -> events received for that chunk, in arrival order. */
  buffer = /* @__PURE__ */ new Map();
  nextPlayIndex = 0;
  onReady;
  /**
   * @param onReady - callback invoked with each event once it may be played
   */
  constructor(onReady) {
    this.onReady = onReady;
  }
  /**
   * Enqueue an audio chunk event. If it's the next expected chunk,
   * it (and any subsequent buffered chunks) are immediately delivered
   * to the callback in order.
   *
   * For a chunk that was already released: a continuation (`sub_index` > 0) is
   * delivered right away; anything else is a stale duplicate and is ignored,
   * since it could never be drained.
   */
  enqueue(event) {
    const index = event.chunk_index;
    const subIndex = event.sub_index ?? 0;
    if (index < this.nextPlayIndex) {
      if (subIndex > 0) {
        this.onReady(event);
      }
      return;
    }
    const parts = this.buffer.get(index) ?? [];
    // Same chunk and same sub-index replaces the earlier event.
    const existing = parts.findIndex((queued) => (queued.sub_index ?? 0) === subIndex);
    if (existing === -1) {
      parts.push(event);
    } else {
      parts[existing] = event;
    }
    this.buffer.set(index, parts);
    this.drain();
  }
  /**
   * Reset the queue (e.g., on interruption or new stream).
   */
  reset() {
    this.buffer.clear();
    this.nextPlayIndex = 0;
  }
  /**
   * Number of chunks currently buffered waiting for earlier chunks.
   */
  get pendingCount() {
    return this.buffer.size;
  }
  /**
   * The next chunk index expected for playback.
   */
  get nextExpectedIndex() {
    return this.nextPlayIndex;
  }
  // ======================== Private ========================
  /** Deliver every consecutive chunk starting at nextPlayIndex. */
  drain() {
    while (this.buffer.has(this.nextPlayIndex)) {
      const parts = this.buffer.get(this.nextPlayIndex);
      this.buffer.delete(this.nextPlayIndex);
      this.nextPlayIndex++;
      for (const part of parts) {
        this.onReady(part);
      }
    }
  }
};
39498
+
38716
39499
  // src/capabilities/search/SearchProvider.ts
38717
39500
  init_Connector();
38718
39501
 
@@ -55753,6 +56536,7 @@ exports.APPROVAL_STATE_VERSION = APPROVAL_STATE_VERSION;
55753
56536
  exports.Agent = Agent;
55754
56537
  exports.AgentContextNextGen = AgentContextNextGen;
55755
56538
  exports.ApproximateTokenEstimator = ApproximateTokenEstimator;
56539
+ exports.AudioPlaybackQueue = AudioPlaybackQueue;
55756
56540
  exports.BaseMediaProvider = BaseMediaProvider;
55757
56541
  exports.BasePluginNextGen = BasePluginNextGen;
55758
56542
  exports.BaseProvider = BaseProvider;
@@ -55847,6 +56631,7 @@ exports.STT_MODELS = STT_MODELS;
55847
56631
  exports.STT_MODEL_REGISTRY = STT_MODEL_REGISTRY;
55848
56632
  exports.ScrapeProvider = ScrapeProvider;
55849
56633
  exports.SearchProvider = SearchProvider;
56634
+ exports.SentenceChunkingStrategy = SentenceChunkingStrategy;
55850
56635
  exports.SerperProvider = SerperProvider;
55851
56636
  exports.Services = Services;
55852
56637
  exports.SimpleScheduler = SimpleScheduler;
@@ -55882,6 +56667,7 @@ exports.VIDEO_MODELS = VIDEO_MODELS;
55882
56667
  exports.VIDEO_MODEL_REGISTRY = VIDEO_MODEL_REGISTRY;
55883
56668
  exports.Vendor = Vendor;
55884
56669
  exports.VideoGeneration = VideoGeneration;
56670
+ exports.VoiceStream = VoiceStream;
55885
56671
  exports.WorkingMemory = WorkingMemory;
55886
56672
  exports.WorkingMemoryPluginNextGen = WorkingMemoryPluginNextGen;
55887
56673
  exports.addJitter = addJitter;
@@ -56070,6 +56856,9 @@ exports.grep = grep;
56070
56856
  exports.hasClipboardImage = hasClipboardImage;
56071
56857
  exports.hasVendorLogo = hasVendorLogo;
56072
56858
  exports.hydrateCustomTool = hydrateCustomTool;
56859
+ exports.isAudioChunkError = isAudioChunkError;
56860
+ exports.isAudioChunkReady = isAudioChunkReady;
56861
+ exports.isAudioStreamComplete = isAudioStreamComplete;
56073
56862
  exports.isBlockedCommand = isBlockedCommand;
56074
56863
  exports.isErrorEvent = isErrorEvent;
56075
56864
  exports.isExcludedExtension = isExcludedExtension;