@livekit/agents 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. package/dist/index.cjs +3 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.ts +2 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +2 -0
  6. package/dist/index.js.map +1 -1
  7. package/dist/llm/index.cjs +2 -0
  8. package/dist/llm/index.cjs.map +1 -1
  9. package/dist/llm/index.d.ts +1 -1
  10. package/dist/llm/index.d.ts.map +1 -1
  11. package/dist/llm/index.js +2 -0
  12. package/dist/llm/index.js.map +1 -1
  13. package/dist/llm/llm.cjs +47 -3
  14. package/dist/llm/llm.cjs.map +1 -1
  15. package/dist/llm/llm.d.ts +15 -2
  16. package/dist/llm/llm.d.ts.map +1 -1
  17. package/dist/llm/llm.js +46 -3
  18. package/dist/llm/llm.js.map +1 -1
  19. package/dist/metrics/base.cjs +44 -0
  20. package/dist/metrics/base.cjs.map +1 -0
  21. package/dist/metrics/base.d.ts +96 -0
  22. package/dist/metrics/base.d.ts.map +1 -0
  23. package/dist/metrics/base.js +20 -0
  24. package/dist/metrics/base.js.map +1 -0
  25. package/dist/metrics/index.cjs +35 -0
  26. package/dist/metrics/index.cjs.map +1 -0
  27. package/dist/metrics/index.d.ts +5 -0
  28. package/dist/metrics/index.d.ts.map +1 -0
  29. package/dist/metrics/index.js +9 -0
  30. package/dist/metrics/index.js.map +1 -0
  31. package/dist/metrics/usage_collector.cjs +53 -0
  32. package/dist/metrics/usage_collector.cjs.map +1 -0
  33. package/dist/metrics/usage_collector.d.ts +14 -0
  34. package/dist/metrics/usage_collector.d.ts.map +1 -0
  35. package/dist/metrics/usage_collector.js +29 -0
  36. package/dist/metrics/usage_collector.js.map +1 -0
  37. package/dist/metrics/utils.cjs +104 -0
  38. package/dist/metrics/utils.cjs.map +1 -0
  39. package/dist/metrics/utils.d.ts +10 -0
  40. package/dist/metrics/utils.d.ts.map +1 -0
  41. package/dist/metrics/utils.js +73 -0
  42. package/dist/metrics/utils.js.map +1 -0
  43. package/dist/multimodal/multimodal_agent.cjs +7 -13
  44. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  45. package/dist/multimodal/multimodal_agent.d.ts +1 -4
  46. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  47. package/dist/multimodal/multimodal_agent.js +7 -13
  48. package/dist/multimodal/multimodal_agent.js.map +1 -1
  49. package/dist/pipeline/index.cjs +2 -0
  50. package/dist/pipeline/index.cjs.map +1 -1
  51. package/dist/pipeline/index.d.ts +1 -1
  52. package/dist/pipeline/index.d.ts.map +1 -1
  53. package/dist/pipeline/index.js +3 -1
  54. package/dist/pipeline/index.js.map +1 -1
  55. package/dist/pipeline/pipeline_agent.cjs +166 -66
  56. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  57. package/dist/pipeline/pipeline_agent.d.ts +10 -4
  58. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  59. package/dist/pipeline/pipeline_agent.js +169 -69
  60. package/dist/pipeline/pipeline_agent.js.map +1 -1
  61. package/dist/pipeline/speech_handle.cjs +49 -1
  62. package/dist/pipeline/speech_handle.cjs.map +1 -1
  63. package/dist/pipeline/speech_handle.d.ts +12 -2
  64. package/dist/pipeline/speech_handle.d.ts.map +1 -1
  65. package/dist/pipeline/speech_handle.js +50 -2
  66. package/dist/pipeline/speech_handle.js.map +1 -1
  67. package/dist/stt/index.cjs.map +1 -1
  68. package/dist/stt/index.d.ts +1 -1
  69. package/dist/stt/index.d.ts.map +1 -1
  70. package/dist/stt/index.js.map +1 -1
  71. package/dist/stt/stream_adapter.cjs +15 -5
  72. package/dist/stt/stream_adapter.cjs.map +1 -1
  73. package/dist/stt/stream_adapter.d.ts +4 -1
  74. package/dist/stt/stream_adapter.d.ts.map +1 -1
  75. package/dist/stt/stream_adapter.js +15 -5
  76. package/dist/stt/stream_adapter.js.map +1 -1
  77. package/dist/stt/stt.cjs +46 -2
  78. package/dist/stt/stt.cjs.map +1 -1
  79. package/dist/stt/stt.d.ts +25 -3
  80. package/dist/stt/stt.d.ts.map +1 -1
  81. package/dist/stt/stt.js +46 -2
  82. package/dist/stt/stt.js.map +1 -1
  83. package/dist/tts/index.cjs +4 -2
  84. package/dist/tts/index.cjs.map +1 -1
  85. package/dist/tts/index.d.ts +1 -1
  86. package/dist/tts/index.d.ts.map +1 -1
  87. package/dist/tts/index.js +3 -1
  88. package/dist/tts/index.js.map +1 -1
  89. package/dist/tts/stream_adapter.cjs +14 -3
  90. package/dist/tts/stream_adapter.cjs.map +1 -1
  91. package/dist/tts/stream_adapter.d.ts +3 -0
  92. package/dist/tts/stream_adapter.d.ts.map +1 -1
  93. package/dist/tts/stream_adapter.js +15 -4
  94. package/dist/tts/stream_adapter.js.map +1 -1
  95. package/dist/tts/tts.cjs +109 -6
  96. package/dist/tts/tts.cjs.map +1 -1
  97. package/dist/tts/tts.d.ts +24 -1
  98. package/dist/tts/tts.d.ts.map +1 -1
  99. package/dist/tts/tts.js +107 -5
  100. package/dist/tts/tts.js.map +1 -1
  101. package/dist/vad.cjs +43 -2
  102. package/dist/vad.cjs.map +1 -1
  103. package/dist/vad.d.ts +21 -4
  104. package/dist/vad.d.ts.map +1 -1
  105. package/dist/vad.js +43 -2
  106. package/dist/vad.js.map +1 -1
  107. package/package.json +1 -1
  108. package/src/index.ts +2 -1
  109. package/src/llm/index.ts +2 -0
  110. package/src/llm/llm.ts +55 -3
  111. package/src/metrics/base.ts +127 -0
  112. package/src/metrics/index.ts +20 -0
  113. package/src/metrics/usage_collector.ts +40 -0
  114. package/src/metrics/utils.ts +100 -0
  115. package/src/multimodal/multimodal_agent.ts +12 -17
  116. package/src/pipeline/index.ts +1 -1
  117. package/src/pipeline/pipeline_agent.ts +206 -87
  118. package/src/pipeline/speech_handle.ts +67 -2
  119. package/src/stt/index.ts +2 -0
  120. package/src/stt/stream_adapter.ts +17 -5
  121. package/src/stt/stt.ts +67 -3
  122. package/src/tts/index.ts +2 -0
  123. package/src/tts/stream_adapter.ts +17 -4
  124. package/src/tts/tts.ts +127 -4
  125. package/src/vad.ts +61 -4
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/tts/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { AsyncIterableQueue, mergeFrames } from '../utils.js';\n\n/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */\nexport interface SynthesizedAudio {\n /** Request ID (one segment could be made up of multiple requests) */\n requestId: string;\n /** Segment ID, each segment is separated by a flush */\n segmentId: string;\n /** Synthesized audio frame */\n frame: AudioFrame;\n /** Current segment of the synthesized audio */\n deltaText?: string;\n}\n\n/**\n * Describes the capabilities of the TTS provider.\n *\n * @remarks\n * At present, only `streaming` is supplied to this interface, and the framework only supports\n * providers that do have a streaming endpoint.\n */\nexport interface TTSCapabilities {\n streaming: boolean;\n}\n\n/**\n * An instance of a text-to-speech adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child TTS class, which inherits this class's methods.\n */\nexport abstract class TTS {\n #capabilities: TTSCapabilities;\n #sampleRate: number;\n #numChannels: number;\n\n constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {\n this.#capabilities = capabilities;\n this.#sampleRate = sampleRate;\n this.#numChannels = numChannels;\n }\n\n /** Returns this TTS's capabilities */\n get capabilities(): TTSCapabilities {\n return this.#capabilities;\n }\n\n /** Returns the sample rate of audio frames returned by this TTS */\n get sampleRate(): number {\n return this.#sampleRate;\n }\n\n /** Returns the channel count of audio frames returned by this TTS */\n get numChannels(): number {\n return this.#numChannels;\n }\n\n /**\n * Receives text and returns synthesis in the form of a {@link ChunkedStream}\n */\n abstract synthesize(text: string): ChunkedStream;\n\n /**\n * Returns a {@link SynthesizeStream} that can be used to push text and receive audio data\n */\n abstract stream(): SynthesizeStream;\n}\n\n/**\n * An instance of a text-to-speech stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child SynthesizeStream class, which inherits this class's methods.\n */\nexport abstract class SynthesizeStream\n implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>\n{\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n static readonly END_OF_STREAM = Symbol('END_OF_STREAM');\n protected input = new AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected closed = false;\n\n /** Push a string of text to the TTS */\n pushText(text: string) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(text);\n }\n\n /** Flush the TTS, causing it to process all pending text */\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SynthesizeStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {\n return this.queue.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.input.close();\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SynthesizeStream {\n return this;\n }\n}\n\n/**\n * An instance of a text-to-speech response, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child ChunkedStream class, which inherits this class's methods.\n */\nexport abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {\n protected queue = new AsyncIterableQueue<SynthesizedAudio>();\n protected closed = false;\n\n /** Collect every frame into one in a single call */\n async collect(): Promise<AudioFrame> {\n const frames = [];\n for await (const event of this) {\n frames.push(event.frame);\n }\n return mergeFrames(frames);\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio>> {\n return this.queue.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): ChunkedStream {\n return this;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAIA,mBAAgD;AAgCzC,MAAe,IAAI;AAAA,EACxB;AAAA,EACA;AAAA,EACA;AAAA,EAEA,YAAY,YAAoB,aAAqB,cAA+B;AAClF,SAAK,gBAAgB;AACrB,SAAK,cAAc;AACnB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,aAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,cAAsB;AACxB,WAAO,KAAK;AAAA,EACd;AAWF;AAgBO,MAAe,iBAEtB;AAAA,EACE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EAClE,OAAgB,gBAAgB,OAAO,eAAe;AAAA,EAC5C,QAAQ,IAAI,gCAAoE;AAAA,EAChF,QAAQ,IAAI,gCAEpB;AAAA,EACQ,SAAS;AAAA;AAAA,EAGnB,SAAS,MAAc;AACrB,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,IAAI;AAAA,EACrB;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,iBAAiB,cAAc;AAAA,EAChD;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0F;AACxF,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAsB;AACzC,WAAO;AAAA,EACT;AACF;AAgBO,MAAe,cAAiE;AAAA,EAC3E,QAAQ,IAAI,gCAAqC;AAAA,EACjD,SAAS;AAAA;AAAA,EAGnB,MAAM,UAA+B;AACnC,UAAM,SAAS,CAAC;AAChB,qBAAiB,SAAS,MAAM;AAC9B,aAAO,KAAK,MAAM,KAAK;AAAA,IACzB;AACA,eAAO,0BAAY,MAAM;AAAA,EAC3B;AAAA,EAEA,OAAkD;AAChD,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAmB;AACtC,WAAO;AAAA,EACT;AACF;","names":[]}
1
+ {"version":3,"sources":["../../src/tts/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { TTSMetrics } from '../metrics/base.js';\nimport { AsyncIterableQueue, mergeFrames } from '../utils.js';\n\n/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */\nexport interface SynthesizedAudio {\n /** Request ID (one segment could be made up of multiple requests) */\n requestId: string;\n /** Segment ID, each segment is separated by a flush */\n segmentId: string;\n /** Synthesized audio frame */\n frame: AudioFrame;\n /** Current segment of the synthesized audio */\n deltaText?: string;\n /** Whether this is the last frame of the segment (streaming only) */\n final: boolean;\n}\n\n/**\n * Describes the capabilities of the TTS provider.\n *\n * @remarks\n * At present, only `streaming` is supplied to this interface, and the framework only supports\n * providers that do have a streaming endpoint.\n */\nexport interface TTSCapabilities {\n streaming: boolean;\n}\n\nexport enum TTSEvent {\n METRICS_COLLECTED,\n}\n\nexport type TTSCallbacks = {\n [TTSEvent.METRICS_COLLECTED]: (metrics: TTSMetrics) => void;\n};\n\n/**\n * An instance of a text-to-speech adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child TTS class, which inherits this class's methods.\n */\nexport abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCallbacks>) {\n #capabilities: TTSCapabilities;\n #sampleRate: number;\n #numChannels: number;\n abstract label: string;\n\n constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {\n super();\n this.#capabilities = capabilities;\n this.#sampleRate = sampleRate;\n this.#numChannels = numChannels;\n }\n\n /** Returns this TTS's capabilities */\n get capabilities(): TTSCapabilities {\n return this.#capabilities;\n }\n\n /** Returns the sample rate of audio frames returned by this TTS */\n get sampleRate(): number {\n return this.#sampleRate;\n }\n\n /** Returns the channel count of audio frames returned by this TTS */\n get numChannels(): number {\n return this.#numChannels;\n }\n\n /**\n * Receives text and returns synthesis in the form of a {@link ChunkedStream}\n */\n abstract synthesize(text: string): ChunkedStream;\n\n /**\n * Returns a {@link SynthesizeStream} that can be used to push text and receive audio data\n */\n abstract stream(): SynthesizeStream;\n}\n\n/**\n * An instance of a text-to-speech stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child SynthesizeStream class, which inherits this class's methods.\n */\nexport abstract class SynthesizeStream\n implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>\n{\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n static readonly END_OF_STREAM = Symbol('END_OF_STREAM');\n protected input = new AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected output = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected closed = false;\n abstract label: string;\n #tts: TTS;\n #metricsPendingTexts: string[] = [];\n #metricsText = '';\n #monitorMetricsTask?: Promise<void>;\n\n constructor(tts: TTS) {\n this.#tts = tts;\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint | undefined;\n let requestId = '';\n\n const emit = () => {\n if (this.#metricsPendingTexts.length) {\n const text = this.#metricsPendingTexts.shift()!;\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: Math.trunc(Number(ttfb! / BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n };\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n if (audio === SynthesizeStream.END_OF_STREAM) continue;\n requestId = audio.requestId;\n if (!ttfb) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n if (audio.final) {\n emit();\n }\n }\n\n if (requestId) {\n emit();\n }\n this.output.close();\n }\n\n /** Push a string of text to the TTS */\n pushText(text: string) {\n if (!this.#monitorMetricsTask) {\n this.#monitorMetricsTask = this.monitorMetrics();\n }\n this.#metricsText += text;\n\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(text);\n }\n\n /** Flush the TTS, causing it to process all pending text */\n flush() {\n if (this.#metricsText) {\n this.#metricsPendingTexts.push(this.#metricsText);\n this.#metricsText = '';\n }\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SynthesizeStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.input.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SynthesizeStream {\n return this;\n }\n}\n\n/**\n * An instance of a text-to-speech response, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child ChunkedStream class, which inherits this class's methods.\n */\nexport abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {\n protected queue = new AsyncIterableQueue<SynthesizedAudio>();\n protected output = new AsyncIterableQueue<SynthesizedAudio>();\n protected closed = false;\n abstract label: string;\n #text: string;\n #tts: TTS;\n\n constructor(text: string, tts: TTS) {\n this.#text = text;\n this.#tts = tts;\n\n this.monitorMetrics();\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint | undefined;\n let requestId = '';\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n requestId = audio.requestId;\n if (!ttfb) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n }\n this.output.close();\n\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: Math.trunc(Number(ttfb! / BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: this.#text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n\n /** Collect every frame into one in a single call */\n async collect(): Promise<AudioFrame> {\n const frames = [];\n for await (const event of this) {\n frames.push(event.frame);\n }\n return mergeFrames(frames);\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): ChunkedStream {\n return this;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAA6B;AAE7B,mBAAgD;AA2BzC,IAAK,WAAL,kBAAKA,cAAL;AACL,EAAAA,oBAAA;AADU,SAAAA;AAAA,GAAA;AAeL,MAAe,YAAa,gCAAsD;AAAA,EACvF;AAAA,EACA;AAAA,EACA;AAAA,EAGA,YAAY,YAAoB,aAAqB,cAA+B;AAClF,UAAM;AACN,SAAK,gBAAgB;AACrB,SAAK,cAAc;AACnB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,aAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,cAAsB;AACxB,WAAO,KAAK;AAAA,EACd;AAWF;AAgBO,MAAe,iBAEtB;AAAA,EACE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EAClE,OAAgB,gBAAgB,OAAO,eAAe;AAAA,EAC5C,QAAQ,IAAI,gCAAoE;AAAA,EAChF,QAAQ,IAAI,gCAEpB;AAAA,EACQ,SAAS,IAAI,gCAErB;AAAA,EACQ,SAAS;AAAA,EAEnB;AAAA,EACA,uBAAiC,CAAC;AAAA,EAClC,eAAe;AAAA,EACf;AAAA,EAEA,YAAY,KAAU;AACpB,SAAK,OAAO;AAAA,EACd;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI;AACJ,QAAI,YAAY;AAEhB,UAAM,OAAO,MAAM;AACjB,UAAI,KAAK,qBAAqB,QAAQ;AACpC,cAAM,OAAO,KAAK,qBAAqB,MAAM;AAC7C,cAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,cAAM,UAAsB;AAAA,UAC1B,WAAW,KAAK,IAAI;AAAA,UACpB;AAAA,UACA,MAAM,KAAK,MAAM,OAAO,OAAQ,OAAO,GAAO,CAAC,CAAC;AAAA,UAChD,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,UACvD,iBAAiB,KAAK;AAAA,UACtB;AAAA,UACA,WAAW;AAAA;AAAA,UACX,OAAO,KAAK;AAAA,UACZ,UAAU;AAAA,QACZ;AACA,aAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,MACpD;AAAA,IACF;AAEA,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,UAAU,iBAAiB,cAAe;AAC9C,kBAAY,MAAM;AAClB,UAAI,CAAC,MAAM;AACT,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAC7D,UAAI,MAAM,OAAO;AACf,aAAK;AAAA,MACP;AAAA,IACF;AAEA,QAAI,WAAW;AACb,WAAK;AAAA,IACP;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA;AAAA,EAGA,SAAS,MAAc;AACrB,QAAI,CAAC,KAAK,qBAAqB;AAC7B,WAAK,sBAAsB,KAAK,eAAe;AAAA,IACjD;AACA,SAAK,gBAAgB;AAErB,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,IAAI;AAAA,EACrB;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,cAAc;AACrB,WAAK,qBAAqB,KAAK,KAAK,YAAY;AAChD,WAAK,eAAe;AAAA,IACtB;AACA,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,iBAAiB,cAAc;AAAA,EAChD;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0F;AACxF,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAsB;AACzC,WAAO;AAAA,EACT;AACF;AAgBO,MAAe,cAAiE;AAAA,EAC3E,QAAQ,IAAI,gCAAqC;AAAA,EACjD,SAAS,IAAI,gCAAqC;AAAA,EAClD,SAAS;AAAA,EAEnB;AAAA,EACA;AAAA,EAEA,YAAY,MAAc,KAAU;AAClC,SAAK,QAAQ;AACb,SAAK,OAAO;AAEZ,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI;AACJ,QAAI,YAAY;AAEhB,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,kBAAY,MAAM;AAClB,UAAI,CAAC,MAAM;AACT,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAAA,IAC/D;AACA,SAAK,OAAO,MAAM;AAElB,UAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,UAAM,UAAsB;AAAA,MAC1B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,MAAM,KAAK,MAAM,OAAO,OAAQ,OAAO,GAAO,CAAC,CAAC;AAAA,MAChD,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,MACvD,iBAAiB,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,WAAW;AAAA;AAAA,MACX,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,IACZ;AACA,SAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,EACpD;AAAA;AAAA,EAGA,MAAM,UAA+B;AACnC,UAAM,SAAS,CAAC;AAChB,qBAAiB,SAAS,MAAM;AAC9B,aAAO,KAAK,MAAM,KAAK;AAAA,IACzB;AACA,eAAO,0BAAY,MAAM;AAAA,EAC3B;AAAA,EAEA,OAAkD;AAChD,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAmB;AACtC,WAAO;AAAA,EACT;AACF;","names":["TTSEvent"]}
package/dist/tts/tts.d.ts CHANGED
@@ -1,4 +1,6 @@
1
1
  import type { AudioFrame } from '@livekit/rtc-node';
2
+ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
3
+ import type { TTSMetrics } from '../metrics/base.js';
2
4
  import { AsyncIterableQueue } from '../utils.js';
3
5
  /** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */
4
6
  export interface SynthesizedAudio {
@@ -10,6 +12,8 @@ export interface SynthesizedAudio {
10
12
  frame: AudioFrame;
11
13
  /** Current segment of the synthesized audio */
12
14
  deltaText?: string;
15
+ /** Whether this is the last frame of the segment (streaming only) */
16
+ final: boolean;
13
17
  }
14
18
  /**
15
19
  * Describes the capabilities of the TTS provider.
@@ -21,6 +25,13 @@ export interface SynthesizedAudio {
21
25
  export interface TTSCapabilities {
22
26
  streaming: boolean;
23
27
  }
28
+ export declare enum TTSEvent {
29
+ METRICS_COLLECTED = 0
30
+ }
31
+ export type TTSCallbacks = {
32
+ [TTSEvent.METRICS_COLLECTED]: (metrics: TTSMetrics) => void;
33
+ };
34
+ declare const TTS_base: new () => TypedEmitter<TTSCallbacks>;
24
35
  /**
25
36
  * An instance of a text-to-speech adapter.
26
37
  *
@@ -28,8 +39,9 @@ export interface TTSCapabilities {
28
39
  * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
29
40
  * exports its own child TTS class, which inherits this class's methods.
30
41
  */
31
- export declare abstract class TTS {
42
+ export declare abstract class TTS extends TTS_base {
32
43
  #private;
44
+ abstract label: string;
33
45
  constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities);
34
46
  /** Returns this TTS's capabilities */
35
47
  get capabilities(): TTSCapabilities;
@@ -61,11 +73,16 @@ export declare abstract class TTS {
61
73
  * exports its own child SynthesizeStream class, which inherits this class's methods.
62
74
  */
63
75
  export declare abstract class SynthesizeStream implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM> {
76
+ #private;
64
77
  protected static readonly FLUSH_SENTINEL: unique symbol;
65
78
  static readonly END_OF_STREAM: unique symbol;
66
79
  protected input: AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>;
67
80
  protected queue: AsyncIterableQueue<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>;
81
+ protected output: AsyncIterableQueue<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>;
68
82
  protected closed: boolean;
83
+ abstract label: string;
84
+ constructor(tts: TTS);
85
+ protected monitorMetrics(): Promise<void>;
69
86
  /** Push a string of text to the TTS */
70
87
  pushText(text: string): void;
71
88
  /** Flush the TTS, causing it to process all pending text */
@@ -92,8 +109,13 @@ export declare abstract class SynthesizeStream implements AsyncIterableIterator<
92
109
  * exports its own child ChunkedStream class, which inherits this class's methods.
93
110
  */
94
111
  export declare abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {
112
+ #private;
95
113
  protected queue: AsyncIterableQueue<SynthesizedAudio>;
114
+ protected output: AsyncIterableQueue<SynthesizedAudio>;
96
115
  protected closed: boolean;
116
+ abstract label: string;
117
+ constructor(text: string, tts: TTS);
118
+ protected monitorMetrics(): Promise<void>;
97
119
  /** Collect every frame into one in a single call */
98
120
  collect(): Promise<AudioFrame>;
99
121
  next(): Promise<IteratorResult<SynthesizedAudio>>;
@@ -101,4 +123,5 @@ export declare abstract class ChunkedStream implements AsyncIterableIterator<Syn
101
123
  close(): void;
102
124
  [Symbol.asyncIterator](): ChunkedStream;
103
125
  }
126
+ export {};
104
127
  //# sourceMappingURL=tts.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../../src/tts/tts.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,kBAAkB,EAAe,MAAM,aAAa,CAAC;AAE9D,+EAA+E;AAC/E,MAAM,WAAW,gBAAgB;IAC/B,qEAAqE;IACrE,SAAS,EAAE,MAAM,CAAC;IAClB,uDAAuD;IACvD,SAAS,EAAE,MAAM,CAAC;IAClB,8BAA8B;IAC9B,KAAK,EAAE,UAAU,CAAC;IAClB,+CAA+C;IAC/C,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;;;;;GAMG;AACH,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;;;;;GAMG;AACH,8BAAsB,GAAG;;gBAKX,UAAU,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,YAAY,EAAE,eAAe;IAMlF,sCAAsC;IACtC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED,mEAAmE;IACnE,IAAI,UAAU,IAAI,MAAM,CAEvB;IAED,qEAAqE;IACrE,IAAI,WAAW,IAAI,MAAM,CAExB;IAED;;OAEG;IACH,QAAQ,CAAC,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,aAAa;IAEhD;;OAEG;IACH,QAAQ,CAAC,MAAM,IAAI,gBAAgB;CACpC;AAED;;;;;;;;;;;;;GAaG;AACH,8BAAsB,gBACpB,YAAW,qBAAqB,CAAC,gBAAgB,GAAG,OAAO,gBAAgB,CAAC,aAAa,CAAC;IAE1F,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,MAAM,CAAC,QAAQ,CAAC,aAAa,gBAA2B;IACxD,SAAS,CAAC,KAAK,sEAA6E;IAC5F,SAAS,CAAC,KAAK,+EAEX;IACJ,SAAS,CAAC,MAAM,UAAS;IAEzB,uCAAuC;IACvC,QAAQ,CAAC,IAAI,EAAE,MAAM;IAUrB,4DAA4D;IAC5D,KAAK;IAUL,2DAA2D;IAC3D,QAAQ;IAUR,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,gBAAgB,GAAG,OAAO,gBAAgB,CAAC,aAAa,CAAC,CAAC;IAIzF,wDAAwD;IACxD,KAAK;IAML,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,gBAAgB;CAG3C;AAED;;;;;;;;;;;;;GAaG;AACH,8BAAsB,aAAc,YAAW,qBAAqB,CAAC,gBAAgB,CAAC;IACpF,SAAS,CAAC,KAAK,uCAA8C;IAC7D,SAAS,CAAC,MAAM,UAAS;IAEzB,oDAAoD;IAC9C,OAAO,IAAI,OAAO,CAAC,UAAU,CAAC;IAQpC,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,gBAAgB,CAAC,CAAC;IAIjD,wDAAwD;IACxD,KAAK;IAKL,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,aAAa;CAGxC"}
1
+ {"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../../src/tts/tts.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEhF,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,kBAAkB,EAAe,MAAM,aAAa,CAAC;AAE9D,+EAA+E;AAC/E,MAAM,WAAW,gBAAgB;IAC/B,qEAAqE;IACrE,SAAS,EAAE,MAAM,CAAC;IAClB,uDAAuD;IACvD,SAAS,EAAE,MAAM,CAAC;IAClB,8BAA8B;IAC9B,KAAK,EAAE,UAAU,CAAC;IAClB,+CAA+C;IAC/C,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,qEAAqE;IACrE,KAAK,EAAE,OAAO,CAAC;CAChB;AAED;;;;;;GAMG;AACH,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,OAAO,CAAC;CACpB;AAED,oBAAY,QAAQ;IAClB,iBAAiB,IAAA;CAClB;AAED,MAAM,MAAM,YAAY,GAAG;IACzB,CAAC,QAAQ,CAAC,iBAAiB,CAAC,EAAE,CAAC,OAAO,EAAE,UAAU,KAAK,IAAI,CAAC;CAC7D,CAAC;kCAS2D,aAAa,YAAY,CAAC;AAPvF;;;;;;GAMG;AACH,8BAAsB,GAAI,SAAQ,QAAsD;;IAItF,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAEX,UAAU,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,YAAY,EAAE,eAAe;IAOlF,sCAAsC;IACtC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED,mEAAmE;IACnE,IAAI,UAAU,IAAI,MAAM,CAEvB;IAED,qEAAqE;IACrE,IAAI,WAAW,IAAI,MAAM,CAExB;IAED;;OAEG;IACH,QAAQ,CAAC,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,aAAa;IAEhD;;OAEG;IACH,QAAQ,CAAC,MAAM,IAAI,gBAAgB;CACpC;AAED;;;;;;;;;;;;;GAaG;AACH,8BAAsB,gBACpB,YAAW,qBAAqB,CAAC,gBAAgB,GAAG,OAAO,gBAAgB,CAAC,aAAa,CAAC;;IAE1F,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,MAAM,CAAC,QAAQ,CAAC,aAAa,gBAA2B;IACxD,SAAS,CAAC,KAAK,sEAA6E;IAC5F,SAAS,CAAC,KAAK,+EAEX;IACJ,SAAS,CAAC,MAAM,+EAEZ;IACJ,SAAS,CAAC,MAAM,UAAS;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAMX,GAAG,EAAE,GAAG;cAIJ,cAAc;IA4C9B,uCAAuC;IACvC,QAAQ,CAAC,IAAI,EAAE,MAAM;IAerB,4DAA4D;IAC5D,KAAK;IAcL,2DAA2D;IAC3D,QAAQ;IAUR,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,gBAAgB,GAAG,OAAO,gBAAgB,CAAC,aAAa,CAAC,CAAC;IAIzF,wDAAwD;IACxD,KAAK;IAML,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,gBAAgB;CAG3C;AAED;;;;;;;;;;;;;GAaG;AACH,8BAAsB,aAAc,YAAW,qBAAqB,CAAC,gBAAgB,CAAC;;IACpF,SAAS,CAAC,KAAK,uCAA8C;IAC7D,SAAS,CAAC,MAAM,uCAA8C;IAC9D,SAAS,CAAC,MAAM,UAAS;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAIX,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG;cAOlB,cAAc;IA+B9B,oDAAoD;IAC9C,OAAO,IAAI,OAAO,CAAC,UAAU,CAAC;IAQpC,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,gBAAgB,CAAC,CAAC;IAIjD,wDAAwD;IACxD,KAAK;IAML,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,aAAa;CAGxC"}
package/dist/tts/tts.js CHANGED
@@ -1,9 +1,15 @@
1
+ import { EventEmitter } from "node:events";
1
2
  import { AsyncIterableQueue, mergeFrames } from "../utils.js";
2
- class TTS {
3
+ var TTSEvent = /* @__PURE__ */ ((TTSEvent2) => {
4
+ TTSEvent2[TTSEvent2["METRICS_COLLECTED"] = 0] = "METRICS_COLLECTED";
5
+ return TTSEvent2;
6
+ })(TTSEvent || {});
7
+ class TTS extends EventEmitter {
3
8
  #capabilities;
4
9
  #sampleRate;
5
10
  #numChannels;
6
11
  constructor(sampleRate, numChannels, capabilities) {
12
+ super();
7
13
  this.#capabilities = capabilities;
8
14
  this.#sampleRate = sampleRate;
9
15
  this.#numChannels = numChannels;
@@ -26,9 +32,62 @@ class SynthesizeStream {
26
32
  static END_OF_STREAM = Symbol("END_OF_STREAM");
27
33
  input = new AsyncIterableQueue();
28
34
  queue = new AsyncIterableQueue();
35
+ output = new AsyncIterableQueue();
29
36
  closed = false;
37
+ #tts;
38
+ #metricsPendingTexts = [];
39
+ #metricsText = "";
40
+ #monitorMetricsTask;
41
+ constructor(tts) {
42
+ this.#tts = tts;
43
+ }
44
+ async monitorMetrics() {
45
+ const startTime = process.hrtime.bigint();
46
+ let audioDuration = 0;
47
+ let ttfb;
48
+ let requestId = "";
49
+ const emit = () => {
50
+ if (this.#metricsPendingTexts.length) {
51
+ const text = this.#metricsPendingTexts.shift();
52
+ const duration = process.hrtime.bigint() - startTime;
53
+ const metrics = {
54
+ timestamp: Date.now(),
55
+ requestId,
56
+ ttfb: Math.trunc(Number(ttfb / BigInt(1e6))),
57
+ duration: Math.trunc(Number(duration / BigInt(1e6))),
58
+ charactersCount: text.length,
59
+ audioDuration,
60
+ cancelled: false,
61
+ // XXX(nbsp)
62
+ label: this.label,
63
+ streamed: false
64
+ };
65
+ this.#tts.emit(0 /* METRICS_COLLECTED */, metrics);
66
+ }
67
+ };
68
+ for await (const audio of this.queue) {
69
+ this.output.put(audio);
70
+ if (audio === SynthesizeStream.END_OF_STREAM) continue;
71
+ requestId = audio.requestId;
72
+ if (!ttfb) {
73
+ ttfb = process.hrtime.bigint() - startTime;
74
+ }
75
+ audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
76
+ if (audio.final) {
77
+ emit();
78
+ }
79
+ }
80
+ if (requestId) {
81
+ emit();
82
+ }
83
+ this.output.close();
84
+ }
30
85
  /** Push a string of text to the TTS */
31
86
  pushText(text) {
87
+ if (!this.#monitorMetricsTask) {
88
+ this.#monitorMetricsTask = this.monitorMetrics();
89
+ }
90
+ this.#metricsText += text;
32
91
  if (this.input.closed) {
33
92
  throw new Error("Input is closed");
34
93
  }
@@ -39,6 +98,10 @@ class SynthesizeStream {
39
98
  }
40
99
  /** Flush the TTS, causing it to process all pending text */
41
100
  flush() {
101
+ if (this.#metricsText) {
102
+ this.#metricsPendingTexts.push(this.#metricsText);
103
+ this.#metricsText = "";
104
+ }
42
105
  if (this.input.closed) {
43
106
  throw new Error("Input is closed");
44
107
  }
@@ -58,12 +121,12 @@ class SynthesizeStream {
58
121
  this.input.close();
59
122
  }
60
123
  next() {
61
- return this.queue.next();
124
+ return this.output.next();
62
125
  }
63
126
  /** Close both the input and output of the TTS stream */
64
127
  close() {
65
128
  this.input.close();
66
- this.queue.close();
129
+ this.output.close();
67
130
  this.closed = true;
68
131
  }
69
132
  [Symbol.asyncIterator]() {
@@ -72,7 +135,44 @@ class SynthesizeStream {
72
135
  }
73
136
  class ChunkedStream {
74
137
  queue = new AsyncIterableQueue();
138
+ output = new AsyncIterableQueue();
75
139
  closed = false;
140
+ #text;
141
+ #tts;
142
+ constructor(text, tts) {
143
+ this.#text = text;
144
+ this.#tts = tts;
145
+ this.monitorMetrics();
146
+ }
147
+ async monitorMetrics() {
148
+ const startTime = process.hrtime.bigint();
149
+ let audioDuration = 0;
150
+ let ttfb;
151
+ let requestId = "";
152
+ for await (const audio of this.queue) {
153
+ this.output.put(audio);
154
+ requestId = audio.requestId;
155
+ if (!ttfb) {
156
+ ttfb = process.hrtime.bigint() - startTime;
157
+ }
158
+ audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
159
+ }
160
+ this.output.close();
161
+ const duration = process.hrtime.bigint() - startTime;
162
+ const metrics = {
163
+ timestamp: Date.now(),
164
+ requestId,
165
+ ttfb: Math.trunc(Number(ttfb / BigInt(1e6))),
166
+ duration: Math.trunc(Number(duration / BigInt(1e6))),
167
+ charactersCount: this.#text.length,
168
+ audioDuration,
169
+ cancelled: false,
170
+ // XXX(nbsp)
171
+ label: this.label,
172
+ streamed: false
173
+ };
174
+ this.#tts.emit(0 /* METRICS_COLLECTED */, metrics);
175
+ }
76
176
  /** Collect every frame into one in a single call */
77
177
  async collect() {
78
178
  const frames = [];
@@ -82,11 +182,12 @@ class ChunkedStream {
82
182
  return mergeFrames(frames);
83
183
  }
84
184
  next() {
85
- return this.queue.next();
185
+ return this.output.next();
86
186
  }
87
187
  /** Close both the input and output of the TTS stream */
88
188
  close() {
89
189
  this.queue.close();
190
+ this.output.close();
90
191
  this.closed = true;
91
192
  }
92
193
  [Symbol.asyncIterator]() {
@@ -96,6 +197,7 @@ class ChunkedStream {
96
197
  export {
97
198
  ChunkedStream,
98
199
  SynthesizeStream,
99
- TTS
200
+ TTS,
201
+ TTSEvent
100
202
  };
101
203
  //# sourceMappingURL=tts.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/tts/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { AsyncIterableQueue, mergeFrames } from '../utils.js';\n\n/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */\nexport interface SynthesizedAudio {\n /** Request ID (one segment could be made up of multiple requests) */\n requestId: string;\n /** Segment ID, each segment is separated by a flush */\n segmentId: string;\n /** Synthesized audio frame */\n frame: AudioFrame;\n /** Current segment of the synthesized audio */\n deltaText?: string;\n}\n\n/**\n * Describes the capabilities of the TTS provider.\n *\n * @remarks\n * At present, only `streaming` is supplied to this interface, and the framework only supports\n * providers that do have a streaming endpoint.\n */\nexport interface TTSCapabilities {\n streaming: boolean;\n}\n\n/**\n * An instance of a text-to-speech adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child TTS class, which inherits this class's methods.\n */\nexport abstract class TTS {\n #capabilities: TTSCapabilities;\n #sampleRate: number;\n #numChannels: number;\n\n constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {\n this.#capabilities = capabilities;\n this.#sampleRate = sampleRate;\n this.#numChannels = numChannels;\n }\n\n /** Returns this TTS's capabilities */\n get capabilities(): TTSCapabilities {\n return this.#capabilities;\n }\n\n /** Returns the sample rate of audio frames returned by this TTS */\n get sampleRate(): number {\n return this.#sampleRate;\n }\n\n /** Returns the channel count of audio frames returned by this TTS */\n get numChannels(): number {\n return this.#numChannels;\n }\n\n /**\n * Receives text and returns synthesis in the form of a {@link ChunkedStream}\n */\n abstract synthesize(text: string): ChunkedStream;\n\n /**\n * Returns a {@link SynthesizeStream} that can be used to push text and receive audio data\n */\n abstract stream(): SynthesizeStream;\n}\n\n/**\n * An instance of a text-to-speech stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child SynthesizeStream class, which inherits this class's methods.\n */\nexport abstract class SynthesizeStream\n implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>\n{\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n static readonly END_OF_STREAM = Symbol('END_OF_STREAM');\n protected input = new AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected closed = false;\n\n /** Push a string of text to the TTS */\n pushText(text: string) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(text);\n }\n\n /** Flush the TTS, causing it to process all pending text */\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SynthesizeStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {\n return this.queue.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.input.close();\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SynthesizeStream {\n return this;\n }\n}\n\n/**\n * An instance of a text-to-speech response, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child ChunkedStream class, which inherits this class's methods.\n */\nexport abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {\n protected queue = new AsyncIterableQueue<SynthesizedAudio>();\n protected closed = false;\n\n /** Collect every frame into one in a single call */\n async collect(): Promise<AudioFrame> {\n const frames = [];\n for await (const event of this) {\n frames.push(event.frame);\n }\n return mergeFrames(frames);\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio>> {\n return this.queue.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): ChunkedStream {\n return this;\n }\n}\n"],"mappings":"AAIA,SAAS,oBAAoB,mBAAmB;AAgCzC,MAAe,IAAI;AAAA,EACxB;AAAA,EACA;AAAA,EACA;AAAA,EAEA,YAAY,YAAoB,aAAqB,cAA+B;AAClF,SAAK,gBAAgB;AACrB,SAAK,cAAc;AACnB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,aAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,cAAsB;AACxB,WAAO,KAAK;AAAA,EACd;AAWF;AAgBO,MAAe,iBAEtB;AAAA,EACE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EAClE,OAAgB,gBAAgB,OAAO,eAAe;AAAA,EAC5C,QAAQ,IAAI,mBAAoE;AAAA,EAChF,QAAQ,IAAI,mBAEpB;AAAA,EACQ,SAAS;AAAA;AAAA,EAGnB,SAAS,MAAc;AACrB,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,IAAI;AAAA,EACrB;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,iBAAiB,cAAc;AAAA,EAChD;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0F;AACxF,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAsB;AACzC,WAAO;AAAA,EACT;AACF;AAgBO,MAAe,cAAiE;AAAA,EAC3E,QAAQ,IAAI,mBAAqC;AAAA,EACjD,SAAS;AAAA;AAAA,EAGnB,MAAM,UAA+B;AACnC,UAAM,SAAS,CAAC;AAChB,qBAAiB,SAAS,MAAM;AAC9B,aAAO,KAAK,MAAM,KAAK;AAAA,IACzB;AACA,WAAO,YAAY,MAAM;AAAA,EAC3B;AAAA,EAEA,OAAkD;AAChD,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAmB;AACtC,WAAO;AAAA,EACT;AACF;","names":[]}
1
+ {"version":3,"sources":["../../src/tts/tts.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { TTSMetrics } from '../metrics/base.js';\nimport { AsyncIterableQueue, mergeFrames } from '../utils.js';\n\n/** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */\nexport interface SynthesizedAudio {\n /** Request ID (one segment could be made up of multiple requests) */\n requestId: string;\n /** Segment ID, each segment is separated by a flush */\n segmentId: string;\n /** Synthesized audio frame */\n frame: AudioFrame;\n /** Current segment of the synthesized audio */\n deltaText?: string;\n /** Whether this is the last frame of the segment (streaming only) */\n final: boolean;\n}\n\n/**\n * Describes the capabilities of the TTS provider.\n *\n * @remarks\n * At present, only `streaming` is supplied to this interface, and the framework only supports\n * providers that do have a streaming endpoint.\n */\nexport interface TTSCapabilities {\n streaming: boolean;\n}\n\nexport enum TTSEvent {\n METRICS_COLLECTED,\n}\n\nexport type TTSCallbacks = {\n [TTSEvent.METRICS_COLLECTED]: (metrics: TTSMetrics) => void;\n};\n\n/**\n * An instance of a text-to-speech adapter.\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child TTS class, which inherits this class's methods.\n */\nexport abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCallbacks>) {\n #capabilities: TTSCapabilities;\n #sampleRate: number;\n #numChannels: number;\n abstract label: string;\n\n constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {\n super();\n this.#capabilities = capabilities;\n this.#sampleRate = sampleRate;\n this.#numChannels = numChannels;\n }\n\n /** Returns this TTS's capabilities */\n get capabilities(): TTSCapabilities {\n return this.#capabilities;\n }\n\n /** Returns the sample rate of audio frames returned by this TTS */\n get sampleRate(): number {\n return this.#sampleRate;\n }\n\n /** Returns the channel count of audio frames returned by this TTS */\n get numChannels(): number {\n return this.#numChannels;\n }\n\n /**\n * Receives text and returns synthesis in the form of a {@link ChunkedStream}\n */\n abstract synthesize(text: string): ChunkedStream;\n\n /**\n * Returns a {@link SynthesizeStream} that can be used to push text and receive audio data\n */\n abstract stream(): SynthesizeStream;\n}\n\n/**\n * An instance of a text-to-speech stream, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child SynthesizeStream class, which inherits this class's methods.\n */\nexport abstract class SynthesizeStream\n implements AsyncIterableIterator<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>\n{\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n static readonly END_OF_STREAM = Symbol('END_OF_STREAM');\n protected input = new AsyncIterableQueue<string | typeof SynthesizeStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected output = new AsyncIterableQueue<\n SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM\n >();\n protected closed = false;\n abstract label: string;\n #tts: TTS;\n #metricsPendingTexts: string[] = [];\n #metricsText = '';\n #monitorMetricsTask?: Promise<void>;\n\n constructor(tts: TTS) {\n this.#tts = tts;\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint | undefined;\n let requestId = '';\n\n const emit = () => {\n if (this.#metricsPendingTexts.length) {\n const text = this.#metricsPendingTexts.shift()!;\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: Math.trunc(Number(ttfb! / BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n };\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n if (audio === SynthesizeStream.END_OF_STREAM) continue;\n requestId = audio.requestId;\n if (!ttfb) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n if (audio.final) {\n emit();\n }\n }\n\n if (requestId) {\n emit();\n }\n this.output.close();\n }\n\n /** Push a string of text to the TTS */\n pushText(text: string) {\n if (!this.#monitorMetricsTask) {\n this.#monitorMetricsTask = this.monitorMetrics();\n }\n this.#metricsText += text;\n\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(text);\n }\n\n /** Flush the TTS, causing it to process all pending text */\n flush() {\n if (this.#metricsText) {\n this.#metricsPendingTexts.push(this.#metricsText);\n this.#metricsText = '';\n }\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(SynthesizeStream.FLUSH_SENTINEL);\n }\n\n /** Mark the input as ended and forbid additional pushes */\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.input.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): SynthesizeStream {\n return this;\n }\n}\n\n/**\n * An instance of a text-to-speech response, as an asynchronous iterable iterator.\n *\n * @example Looping through frames\n * ```ts\n * for await (const event of stream) {\n * await source.captureFrame(event.frame);\n * }\n * ```\n *\n * @remarks\n * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that\n * exports its own child ChunkedStream class, which inherits this class's methods.\n */\nexport abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {\n protected queue = new AsyncIterableQueue<SynthesizedAudio>();\n protected output = new AsyncIterableQueue<SynthesizedAudio>();\n protected closed = false;\n abstract label: string;\n #text: string;\n #tts: TTS;\n\n constructor(text: string, tts: TTS) {\n this.#text = text;\n this.#tts = tts;\n\n this.monitorMetrics();\n }\n\n protected async monitorMetrics() {\n const startTime = process.hrtime.bigint();\n let audioDuration = 0;\n let ttfb: bigint | undefined;\n let requestId = '';\n\n for await (const audio of this.queue) {\n this.output.put(audio);\n requestId = audio.requestId;\n if (!ttfb) {\n ttfb = process.hrtime.bigint() - startTime;\n }\n audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;\n }\n this.output.close();\n\n const duration = process.hrtime.bigint() - startTime;\n const metrics: TTSMetrics = {\n timestamp: Date.now(),\n requestId,\n ttfb: Math.trunc(Number(ttfb! / BigInt(1000000))),\n duration: Math.trunc(Number(duration / BigInt(1000000))),\n charactersCount: this.#text.length,\n audioDuration,\n cancelled: false, // XXX(nbsp)\n label: this.label,\n streamed: false,\n };\n this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);\n }\n\n /** Collect every frame into one in a single call */\n async collect(): Promise<AudioFrame> {\n const frames = [];\n for await (const event of this) {\n frames.push(event.frame);\n }\n return mergeFrames(frames);\n }\n\n next(): Promise<IteratorResult<SynthesizedAudio>> {\n return this.output.next();\n }\n\n /** Close both the input and output of the TTS stream */\n close() {\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): ChunkedStream {\n return this;\n }\n}\n"],"mappings":"AAKA,SAAS,oBAAoB;AAE7B,SAAS,oBAAoB,mBAAmB;AA2BzC,IAAK,WAAL,kBAAKA,cAAL;AACL,EAAAA,oBAAA;AADU,SAAAA;AAAA,GAAA;AAeL,MAAe,YAAa,aAAsD;AAAA,EACvF;AAAA,EACA;AAAA,EACA;AAAA,EAGA,YAAY,YAAoB,aAAqB,cAA+B;AAClF,UAAM;AACN,SAAK,gBAAgB;AACrB,SAAK,cAAc;AACnB,SAAK,eAAe;AAAA,EACtB;AAAA;AAAA,EAGA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,aAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA;AAAA,EAGA,IAAI,cAAsB;AACxB,WAAO,KAAK;AAAA,EACd;AAWF;AAgBO,MAAe,iBAEtB;AAAA,EACE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EAClE,OAAgB,gBAAgB,OAAO,eAAe;AAAA,EAC5C,QAAQ,IAAI,mBAAoE;AAAA,EAChF,QAAQ,IAAI,mBAEpB;AAAA,EACQ,SAAS,IAAI,mBAErB;AAAA,EACQ,SAAS;AAAA,EAEnB;AAAA,EACA,uBAAiC,CAAC;AAAA,EAClC,eAAe;AAAA,EACf;AAAA,EAEA,YAAY,KAAU;AACpB,SAAK,OAAO;AAAA,EACd;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI;AACJ,QAAI,YAAY;AAEhB,UAAM,OAAO,MAAM;AACjB,UAAI,KAAK,qBAAqB,QAAQ;AACpC,cAAM,OAAO,KAAK,qBAAqB,MAAM;AAC7C,cAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,cAAM,UAAsB;AAAA,UAC1B,WAAW,KAAK,IAAI;AAAA,UACpB;AAAA,UACA,MAAM,KAAK,MAAM,OAAO,OAAQ,OAAO,GAAO,CAAC,CAAC;AAAA,UAChD,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,UACvD,iBAAiB,KAAK;AAAA,UACtB;AAAA,UACA,WAAW;AAAA;AAAA,UACX,OAAO,KAAK;AAAA,UACZ,UAAU;AAAA,QACZ;AACA,aAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,MACpD;AAAA,IACF;AAEA,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,UAAI,UAAU,iBAAiB,cAAe;AAC9C,kBAAY,MAAM;AAClB,UAAI,CAAC,MAAM;AACT,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAC7D,UAAI,MAAM,OAAO;AACf,aAAK;AAAA,MACP;AAAA,IACF;AAEA,QAAI,WAAW;AACb,WAAK;AAAA,IACP;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA;AAAA,EAGA,SAAS,MAAc;AACrB,QAAI,CAAC,KAAK,qBAAqB;AAC7B,WAAK,sBAAsB,KAAK,eAAe;AAAA,IACjD;AACA,SAAK,gBAAgB;AAErB,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,IAAI;AAAA,EACrB;AAAA;AAAA,EAGA,QAAQ;AACN,QAAI,KAAK,cAAc;AACrB,WAAK,qBAAqB,KAAK,KAAK,YAAY;AAChD,WAAK,eAAe;AAAA,IACtB;AACA,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,iBAAiB,cAAc;AAAA,EAChD;AAAA;AAAA,EAGA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0F;AACxF,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAsB;AACzC,WAAO;AAAA,EACT;AACF;AAgBO,MAAe,cAAiE;AAAA,EAC3E,QAAQ,IAAI,mBAAqC;AAAA,EACjD,SAAS,IAAI,mBAAqC;AAAA,EAClD,SAAS;AAAA,EAEnB;AAAA,EACA;AAAA,EAEA,YAAY,MAAc,KAAU;AAClC,SAAK,QAAQ;AACb,SAAK,OAAO;AAEZ,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,UAAM,YAAY,QAAQ,OAAO,OAAO;AACxC,QAAI,gBAAgB;AACpB,QAAI;AACJ,QAAI,YAAY;AAEhB,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,kBAAY,MAAM;AAClB,UAAI,CAAC,MAAM;AACT,eAAO,QAAQ,OAAO,OAAO,IAAI;AAAA,MACnC;AACA,uBAAiB,MAAM,MAAM,oBAAoB,MAAM,MAAM;AAAA,IAC/D;AACA,SAAK,OAAO,MAAM;AAElB,UAAM,WAAW,QAAQ,OAAO,OAAO,IAAI;AAC3C,UAAM,UAAsB;AAAA,MAC1B,WAAW,KAAK,IAAI;AAAA,MACpB;AAAA,MACA,MAAM,KAAK,MAAM,OAAO,OAAQ,OAAO,GAAO,CAAC,CAAC;AAAA,MAChD,UAAU,KAAK,MAAM,OAAO,WAAW,OAAO,GAAO,CAAC,CAAC;AAAA,MACvD,iBAAiB,KAAK,MAAM;AAAA,MAC5B;AAAA,MACA,WAAW;AAAA;AAAA,MACX,OAAO,KAAK;AAAA,MACZ,UAAU;AAAA,IACZ;AACA,SAAK,KAAK,KAAK,2BAA4B,OAAO;AAAA,EACpD;AAAA;AAAA,EAGA,MAAM,UAA+B;AACnC,UAAM,SAAS,CAAC;AAChB,qBAAiB,SAAS,MAAM;AAC9B,aAAO,KAAK,MAAM,KAAK;AAAA,IACzB;AACA,WAAO,YAAY,MAAM;AAAA,EAC3B;AAAA,EAEA,OAAkD;AAChD,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA;AAAA,EAGA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAmB;AACtC,WAAO;AAAA,EACT;AACF;","names":["TTSEvent"]}
package/dist/vad.cjs CHANGED
@@ -23,16 +23,19 @@ __export(vad_exports, {
23
23
  VADStream: () => VADStream
24
24
  });
25
25
  module.exports = __toCommonJS(vad_exports);
26
+ var import_node_events = require("node:events");
26
27
  var import_utils = require("./utils.cjs");
27
28
  var VADEventType = /* @__PURE__ */ ((VADEventType2) => {
28
29
  VADEventType2[VADEventType2["START_OF_SPEECH"] = 0] = "START_OF_SPEECH";
29
30
  VADEventType2[VADEventType2["INFERENCE_DONE"] = 1] = "INFERENCE_DONE";
30
31
  VADEventType2[VADEventType2["END_OF_SPEECH"] = 2] = "END_OF_SPEECH";
32
+ VADEventType2[VADEventType2["METRICS_COLLECTED"] = 3] = "METRICS_COLLECTED";
31
33
  return VADEventType2;
32
34
  })(VADEventType || {});
33
- class VAD {
35
+ class VAD extends import_node_events.EventEmitter {
34
36
  #capabilities;
35
37
  constructor(capabilities) {
38
+ super();
36
39
  this.#capabilities = capabilities;
37
40
  }
38
41
  get capabilities() {
@@ -43,7 +46,44 @@ class VADStream {
43
46
  static FLUSH_SENTINEL = Symbol("FLUSH_SENTINEL");
44
47
  input = new import_utils.AsyncIterableQueue();
45
48
  queue = new import_utils.AsyncIterableQueue();
49
+ output = new import_utils.AsyncIterableQueue();
46
50
  closed = false;
51
+ #vad;
52
+ #lastActivityTime = BigInt(0);
53
+ constructor(vad) {
54
+ this.#vad = vad;
55
+ this.monitorMetrics();
56
+ }
57
+ async monitorMetrics() {
58
+ let inferenceDurationTotal = 0;
59
+ let inferenceCount = 0;
60
+ for await (const event of this.queue) {
61
+ this.output.put(event);
62
+ switch (event.type) {
63
+ case 0 /* START_OF_SPEECH */:
64
+ inferenceCount++;
65
+ if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {
66
+ this.#vad.emit(3 /* METRICS_COLLECTED */, {
67
+ timestamp: Date.now(),
68
+ idleTime: Math.trunc(
69
+ Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1e6))
70
+ ),
71
+ inferenceDurationTotal,
72
+ inferenceCount,
73
+ label: this.#vad.label
74
+ });
75
+ inferenceCount = 0;
76
+ inferenceDurationTotal = 0;
77
+ }
78
+ break;
79
+ case 1 /* INFERENCE_DONE */:
80
+ case 2 /* END_OF_SPEECH */:
81
+ this.#lastActivityTime = process.hrtime.bigint();
82
+ break;
83
+ }
84
+ }
85
+ this.output.close();
86
+ }
47
87
  pushFrame(frame) {
48
88
  if (this.input.closed) {
49
89
  throw new Error("Input is closed");
@@ -72,11 +112,12 @@ class VADStream {
72
112
  this.input.close();
73
113
  }
74
114
  next() {
75
- return this.queue.next();
115
+ return this.output.next();
76
116
  }
77
117
  close() {
78
118
  this.input.close();
79
119
  this.queue.close();
120
+ this.output.close();
80
121
  this.closed = true;
81
122
  }
82
123
  [Symbol.asyncIterator]() {
package/dist/vad.cjs.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { AsyncIterableQueue } from './utils.js';\n\nexport enum VADEventType {\n START_OF_SPEECH,\n INFERENCE_DONE,\n END_OF_SPEECH,\n}\n\nexport interface VADEvent {\n /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */\n type: VADEventType;\n /**\n * Index of the audio sample where the event occurred, relative to the inference sample rate.\n */\n samplesIndex: number;\n /** Timestamp when the event was fired. */\n timestamp: number;\n /** Duration of the detected speech segment in seconds. */\n speechDuration: number;\n /** Duration of the silence segment preceding or following the speech, in seconds. */\n silenceDuration: number;\n /**\n * List of audio frames associated with the speech.\n *\n * @remarks\n * - For `start_of_speech` events, this contains the audio chunks that triggered the detection.\n * - For `inference_done` events, this contains the audio chunks that were processed.\n * - For `end_of_speech` events, this contains the complete user speech.\n */\n frames: AudioFrame[];\n /** Probability that speech is present (only for `INFERENCE_DONE` events). */\n probability: number;\n /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */\n inferenceDuration: number;\n /** Indicates whether speech was detected in the frames. */\n speaking: boolean;\n}\n\nexport interface VADCapabilities {\n updateInterval: number;\n}\n\nexport abstract class VAD {\n #capabilities: VADCapabilities;\n constructor(capabilities: VADCapabilities) {\n this.#capabilities = capabilities;\n }\n\n get capabilities(): VADCapabilities {\n return this.#capabilities;\n }\n\n /**\n * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.\n */\n abstract stream(): VADStream;\n}\n\nexport abstract class VADStream implements AsyncIterableIterator<VADEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<VADEvent>();\n protected closed = false;\n\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(frame);\n }\n\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(VADStream.FLUSH_SENTINEL);\n }\n\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<VADEvent>> {\n return this.queue.next();\n }\n\n close() {\n this.input.close();\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): VADStream {\n return this;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAIA,mBAAmC;AAE5B,IAAK,eAAL,kBAAKA,kBAAL;AACL,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AAHU,SAAAA;AAAA,GAAA;AAwCL,MAAe,IAAI;AAAA,EACxB;AAAA,EACA,YAAY,cAA+B;AACzC,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAMF;AAEO,MAAe,UAAqD;AAAA,EACzE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,gCAAiE;AAAA,EAC7E,QAAQ,IAAI,gCAA6B;AAAA,EACzC,SAAS;AAAA,EAEnB,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,KAAK;AAAA,EACtB;AAAA,EAEA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,UAAU,cAAc;AAAA,EACzC;AAAA,EAEA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0C;AACxC,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA,EAEA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAe;AAClC,WAAO;AAAA,EACT;AACF;","names":["VADEventType"]}
1
+ {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { VADMetrics } from './metrics/base.js';\nimport { AsyncIterableQueue } from './utils.js';\n\nexport enum VADEventType {\n START_OF_SPEECH,\n INFERENCE_DONE,\n END_OF_SPEECH,\n METRICS_COLLECTED,\n}\n\nexport interface VADEvent {\n /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */\n type: VADEventType;\n /**\n * Index of the audio sample where the event occurred, relative to the inference sample rate.\n */\n samplesIndex: number;\n /** Timestamp when the event was fired. */\n timestamp: number;\n /** Duration of the speech segment. */\n speechDuration: number;\n /** Duration of the silence segment. */\n silenceDuration: number;\n /**\n * List of audio frames associated with the speech.\n *\n * @remarks\n * - For `start_of_speech` events, this contains the audio chunks that triggered the detection.\n * - For `inference_done` events, this contains the audio chunks that were processed.\n * - For `end_of_speech` events, this contains the complete user speech.\n */\n frames: AudioFrame[];\n /** Probability that speech is present (only for `INFERENCE_DONE` events). */\n probability: number;\n /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */\n inferenceDuration: number;\n /** Indicates whether speech was detected in the frames. */\n speaking: boolean;\n /** Threshold used to detect silence. */\n rawAccumulatedSilence: number;\n /** Threshold used to detect speech. */\n rawAccumulatedSpeech: number;\n}\n\nexport interface VADCapabilities {\n updateInterval: number;\n}\n\nexport type VADCallbacks = {\n [VADEventType.METRICS_COLLECTED]: (metrics: VADMetrics) => void;\n};\n\nexport abstract class VAD extends (EventEmitter as new () => TypedEmitter<VADCallbacks>) {\n #capabilities: VADCapabilities;\n abstract label: string;\n\n constructor(capabilities: VADCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n get capabilities(): VADCapabilities {\n return this.#capabilities;\n }\n\n /**\n * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.\n */\n abstract stream(): VADStream;\n}\n\nexport abstract class VADStream implements AsyncIterableIterator<VADEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<VADEvent>();\n protected output = new AsyncIterableQueue<VADEvent>();\n protected closed = false;\n #vad: VAD;\n #lastActivityTime = BigInt(0);\n\n constructor(vad: VAD) {\n this.#vad = vad;\n this.monitorMetrics();\n }\n\n protected async monitorMetrics() {\n let inferenceDurationTotal = 0;\n let inferenceCount = 0;\n\n for await (const event of this.queue) {\n this.output.put(event);\n switch (event.type) {\n case VADEventType.START_OF_SPEECH:\n inferenceCount++;\n if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {\n this.#vad.emit(VADEventType.METRICS_COLLECTED, {\n timestamp: Date.now(),\n idleTime: Math.trunc(\n Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),\n ),\n inferenceDurationTotal,\n inferenceCount,\n label: this.#vad.label,\n });\n\n inferenceCount = 0;\n inferenceDurationTotal = 0;\n }\n break;\n case VADEventType.INFERENCE_DONE:\n case VADEventType.END_OF_SPEECH:\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n }\n }\n this.output.close();\n }\n\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(frame);\n }\n\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(VADStream.FLUSH_SENTINEL);\n }\n\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<VADEvent>> {\n return this.output.next();\n }\n\n close() {\n this.input.close();\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): VADStream {\n return this;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAA6B;AAE7B,mBAAmC;AAE5B,IAAK,eAAL,kBAAKA,kBAAL;AACL,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AAJU,SAAAA;AAAA,GAAA;AAiDL,MAAe,YAAa,gCAAsD;AAAA,EACvF;AAAA,EAGA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAMF;AAEO,MAAe,UAAqD;AAAA,EACzE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,gCAAiE;AAAA,EAC7E,QAAQ,IAAI,gCAA6B;AAAA,EACzC,SAAS,IAAI,gCAA6B;AAAA,EAC1C,SAAS;AAAA,EACnB;AAAA,EACA,oBAAoB,OAAO,CAAC;AAAA,EAE5B,YAAY,KAAU;AACpB,SAAK,OAAO;AACZ,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,QAAI,yBAAyB;AAC7B,QAAI,iBAAiB;AAErB,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,cAAQ,MAAM,MAAM;AAAA,QAClB,KAAK;AACH;AACA,cAAI,kBAAkB,IAAI,KAAK,KAAK,aAAa,gBAAgB;AAC/D,iBAAK,KAAK,KAAK,2BAAgC;AAAA,cAC7C,WAAW,KAAK,IAAI;AAAA,cACpB,UAAU,KAAK;AAAA,gBACb,QAAQ,QAAQ,OAAO,OAAO,IAAI,KAAK,qBAAqB,OAAO,GAAO,CAAC;AAAA,cAC7E;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,KAAK,KAAK;AAAA,YACnB,CAAC;AAED,6BAAiB;AACjB,qCAAyB;AAAA,UAC3B;AACA;AAAA,QACF,KAAK;AAAA,QACL,KAAK;AACH,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,MACJ;AAAA,IACF;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA,EAEA,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,KAAK;AAAA,EACtB;AAAA,EAEA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,UAAU,cAAc;AAAA,EACzC;AAAA,EAEA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0C;AACxC,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA,EAEA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAe;AAClC,WAAO;AAAA,EACT;AACF;","names":["VADEventType"]}
package/dist/vad.d.ts CHANGED
@@ -1,9 +1,12 @@
1
1
  import type { AudioFrame } from '@livekit/rtc-node';
2
+ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
3
+ import type { VADMetrics } from './metrics/base.js';
2
4
  import { AsyncIterableQueue } from './utils.js';
3
5
  export declare enum VADEventType {
4
6
  START_OF_SPEECH = 0,
5
7
  INFERENCE_DONE = 1,
6
- END_OF_SPEECH = 2
8
+ END_OF_SPEECH = 2,
9
+ METRICS_COLLECTED = 3
7
10
  }
8
11
  export interface VADEvent {
9
12
  /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */
@@ -14,9 +17,9 @@ export interface VADEvent {
14
17
  samplesIndex: number;
15
18
  /** Timestamp when the event was fired. */
16
19
  timestamp: number;
17
- /** Duration of the detected speech segment in seconds. */
20
+ /** Duration of the speech segment. */
18
21
  speechDuration: number;
19
- /** Duration of the silence segment preceding or following the speech, in seconds. */
22
+ /** Duration of the silence segment. */
20
23
  silenceDuration: number;
21
24
  /**
22
25
  * List of audio frames associated with the speech.
@@ -33,12 +36,21 @@ export interface VADEvent {
33
36
  inferenceDuration: number;
34
37
  /** Indicates whether speech was detected in the frames. */
35
38
  speaking: boolean;
39
+ /** Threshold used to detect silence. */
40
+ rawAccumulatedSilence: number;
41
+ /** Threshold used to detect speech. */
42
+ rawAccumulatedSpeech: number;
36
43
  }
37
44
  export interface VADCapabilities {
38
45
  updateInterval: number;
39
46
  }
40
- export declare abstract class VAD {
47
+ export type VADCallbacks = {
48
+ [VADEventType.METRICS_COLLECTED]: (metrics: VADMetrics) => void;
49
+ };
50
+ declare const VAD_base: new () => TypedEmitter<VADCallbacks>;
51
+ export declare abstract class VAD extends VAD_base {
41
52
  #private;
53
+ abstract label: string;
42
54
  constructor(capabilities: VADCapabilities);
43
55
  get capabilities(): VADCapabilities;
44
56
  /**
@@ -47,10 +59,14 @@ export declare abstract class VAD {
47
59
  abstract stream(): VADStream;
48
60
  }
49
61
  export declare abstract class VADStream implements AsyncIterableIterator<VADEvent> {
62
+ #private;
50
63
  protected static readonly FLUSH_SENTINEL: unique symbol;
51
64
  protected input: AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;
52
65
  protected queue: AsyncIterableQueue<VADEvent>;
66
+ protected output: AsyncIterableQueue<VADEvent>;
53
67
  protected closed: boolean;
68
+ constructor(vad: VAD);
69
+ protected monitorMetrics(): Promise<void>;
54
70
  pushFrame(frame: AudioFrame): void;
55
71
  flush(): void;
56
72
  endInput(): void;
@@ -58,4 +74,5 @@ export declare abstract class VADStream implements AsyncIterableIterator<VADEven
58
74
  close(): void;
59
75
  [Symbol.asyncIterator](): VADStream;
60
76
  }
77
+ export {};
61
78
  //# sourceMappingURL=vad.d.ts.map
package/dist/vad.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAEhD,oBAAY,YAAY;IACtB,eAAe,IAAA;IACf,cAAc,IAAA;IACd,aAAa,IAAA;CACd;AAED,MAAM,WAAW,QAAQ;IACvB,oFAAoF;IACpF,IAAI,EAAE,YAAY,CAAC;IACnB;;OAEG;IACH,YAAY,EAAE,MAAM,CAAC;IACrB,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;IAClB,0DAA0D;IAC1D,cAAc,EAAE,MAAM,CAAC;IACvB,qFAAqF;IACrF,eAAe,EAAE,MAAM,CAAC;IACxB;;;;;;;OAOG;IACH,MAAM,EAAE,UAAU,EAAE,CAAC;IACrB,6EAA6E;IAC7E,WAAW,EAAE,MAAM,CAAC;IACpB,0FAA0F;IAC1F,iBAAiB,EAAE,MAAM,CAAC;IAC1B,2DAA2D;IAC3D,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,eAAe;IAC9B,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,8BAAsB,GAAG;;gBAEX,YAAY,EAAE,eAAe;IAIzC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED;;OAEG;IACH,QAAQ,CAAC,MAAM,IAAI,SAAS;CAC7B;AAED,8BAAsB,SAAU,YAAW,qBAAqB,CAAC,QAAQ,CAAC;IACxE,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,SAAS,CAAC,KAAK,mEAA0E;IACzF,SAAS,CAAC,KAAK,+BAAsC;IACrD,SAAS,CAAC,MAAM,UAAS;IAEzB,SAAS,CAAC,KAAK,EAAE,UAAU;IAU3B,KAAK;IAUL,QAAQ;IAUR,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC;IAIzC,KAAK;IAML,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,SAAS;CAGpC"}
1
+ {"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEhF,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAEhD,oBAAY,YAAY;IACtB,eAAe,IAAA;IACf,cAAc,IAAA;IACd,aAAa,IAAA;IACb,iBAAiB,IAAA;CAClB;AAED,MAAM,WAAW,QAAQ;IACvB,oFAAoF;IACpF,IAAI,EAAE,YAAY,CAAC;IACnB;;OAEG;IACH,YAAY,EAAE,MAAM,CAAC;IACrB,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;IAClB,sCAAsC;IACtC,cAAc,EAAE,MAAM,CAAC;IACvB,uCAAuC;IACvC,eAAe,EAAE,MAAM,CAAC;IACxB;;;;;;;OAOG;IACH,MAAM,EAAE,UAAU,EAAE,CAAC;IACrB,6EAA6E;IAC7E,WAAW,EAAE,MAAM,CAAC;IACpB,0FAA0F;IAC1F,iBAAiB,EAAE,MAAM,CAAC;IAC1B,2DAA2D;IAC3D,QAAQ,EAAE,OAAO,CAAC;IAClB,wCAAwC;IACxC,qBAAqB,EAAE,MAAM,CAAC;IAC9B,uCAAuC;IACvC,oBAAoB,EAAE,MAAM,CAAC;CAC9B;AAED,MAAM,WAAW,eAAe;IAC9B,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,MAAM,YAAY,GAAG;IACzB,CAAC,YAAY,CAAC,iBAAiB,CAAC,EAAE,CAAC,OAAO,EAAE,UAAU,KAAK,IAAI,CAAC;CACjE,CAAC;kCAE2D,aAAa,YAAY,CAAC;AAAvF,8BAAsB,GAAI,SAAQ,QAAsD;;IAEtF,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAEX,YAAY,EAAE,eAAe;IAKzC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED;;OAEG;IACH,QAAQ,CAAC,MAAM,IAAI,SAAS;CAC7B;AAED,8BAAsB,SAAU,YAAW,qBAAqB,CAAC,QAAQ,CAAC;;IACxE,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,SAAS,CAAC,KAAK,mEAA0E;IACzF,SAAS,CAAC,KAAK,+BAAsC;IACrD,SAAS,CAAC,MAAM,+BAAsC;IACtD,SAAS,CAAC,MAAM,UAAS;gBAIb,GAAG,EAAE,GAAG;cAKJ,cAAc;IAiC9B,SAAS,CAAC,KAAK,EAAE,UAAU;IAU3B,KAAK;IAUL,QAAQ;IAUR,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC;IAIzC,KAAK;IAOL,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,SAAS;CAGpC"}
package/dist/vad.js CHANGED
@@ -1,13 +1,16 @@
1
+ import { EventEmitter } from "node:events";
1
2
  import { AsyncIterableQueue } from "./utils.js";
2
3
  var VADEventType = /* @__PURE__ */ ((VADEventType2) => {
3
4
  VADEventType2[VADEventType2["START_OF_SPEECH"] = 0] = "START_OF_SPEECH";
4
5
  VADEventType2[VADEventType2["INFERENCE_DONE"] = 1] = "INFERENCE_DONE";
5
6
  VADEventType2[VADEventType2["END_OF_SPEECH"] = 2] = "END_OF_SPEECH";
7
+ VADEventType2[VADEventType2["METRICS_COLLECTED"] = 3] = "METRICS_COLLECTED";
6
8
  return VADEventType2;
7
9
  })(VADEventType || {});
8
- class VAD {
10
+ class VAD extends EventEmitter {
9
11
  #capabilities;
10
12
  constructor(capabilities) {
13
+ super();
11
14
  this.#capabilities = capabilities;
12
15
  }
13
16
  get capabilities() {
@@ -18,7 +21,44 @@ class VADStream {
18
21
  static FLUSH_SENTINEL = Symbol("FLUSH_SENTINEL");
19
22
  input = new AsyncIterableQueue();
20
23
  queue = new AsyncIterableQueue();
24
+ output = new AsyncIterableQueue();
21
25
  closed = false;
26
+ #vad;
27
+ #lastActivityTime = BigInt(0);
28
+ constructor(vad) {
29
+ this.#vad = vad;
30
+ this.monitorMetrics();
31
+ }
32
+ async monitorMetrics() {
33
+ let inferenceDurationTotal = 0;
34
+ let inferenceCount = 0;
35
+ for await (const event of this.queue) {
36
+ this.output.put(event);
37
+ switch (event.type) {
38
+ case 0 /* START_OF_SPEECH */:
39
+ inferenceCount++;
40
+ if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {
41
+ this.#vad.emit(3 /* METRICS_COLLECTED */, {
42
+ timestamp: Date.now(),
43
+ idleTime: Math.trunc(
44
+ Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1e6))
45
+ ),
46
+ inferenceDurationTotal,
47
+ inferenceCount,
48
+ label: this.#vad.label
49
+ });
50
+ inferenceCount = 0;
51
+ inferenceDurationTotal = 0;
52
+ }
53
+ break;
54
+ case 1 /* INFERENCE_DONE */:
55
+ case 2 /* END_OF_SPEECH */:
56
+ this.#lastActivityTime = process.hrtime.bigint();
57
+ break;
58
+ }
59
+ }
60
+ this.output.close();
61
+ }
22
62
  pushFrame(frame) {
23
63
  if (this.input.closed) {
24
64
  throw new Error("Input is closed");
@@ -47,11 +87,12 @@ class VADStream {
47
87
  this.input.close();
48
88
  }
49
89
  next() {
50
- return this.queue.next();
90
+ return this.output.next();
51
91
  }
52
92
  close() {
53
93
  this.input.close();
54
94
  this.queue.close();
95
+ this.output.close();
55
96
  this.closed = true;
56
97
  }
57
98
  [Symbol.asyncIterator]() {