@livekit/agents 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. package/dist/index.cjs +3 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.ts +2 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +2 -0
  6. package/dist/index.js.map +1 -1
  7. package/dist/llm/index.cjs +2 -0
  8. package/dist/llm/index.cjs.map +1 -1
  9. package/dist/llm/index.d.ts +1 -1
  10. package/dist/llm/index.d.ts.map +1 -1
  11. package/dist/llm/index.js +2 -0
  12. package/dist/llm/index.js.map +1 -1
  13. package/dist/llm/llm.cjs +47 -3
  14. package/dist/llm/llm.cjs.map +1 -1
  15. package/dist/llm/llm.d.ts +15 -2
  16. package/dist/llm/llm.d.ts.map +1 -1
  17. package/dist/llm/llm.js +46 -3
  18. package/dist/llm/llm.js.map +1 -1
  19. package/dist/metrics/base.cjs +44 -0
  20. package/dist/metrics/base.cjs.map +1 -0
  21. package/dist/metrics/base.d.ts +96 -0
  22. package/dist/metrics/base.d.ts.map +1 -0
  23. package/dist/metrics/base.js +20 -0
  24. package/dist/metrics/base.js.map +1 -0
  25. package/dist/metrics/index.cjs +35 -0
  26. package/dist/metrics/index.cjs.map +1 -0
  27. package/dist/metrics/index.d.ts +5 -0
  28. package/dist/metrics/index.d.ts.map +1 -0
  29. package/dist/metrics/index.js +9 -0
  30. package/dist/metrics/index.js.map +1 -0
  31. package/dist/metrics/usage_collector.cjs +53 -0
  32. package/dist/metrics/usage_collector.cjs.map +1 -0
  33. package/dist/metrics/usage_collector.d.ts +14 -0
  34. package/dist/metrics/usage_collector.d.ts.map +1 -0
  35. package/dist/metrics/usage_collector.js +29 -0
  36. package/dist/metrics/usage_collector.js.map +1 -0
  37. package/dist/metrics/utils.cjs +104 -0
  38. package/dist/metrics/utils.cjs.map +1 -0
  39. package/dist/metrics/utils.d.ts +10 -0
  40. package/dist/metrics/utils.d.ts.map +1 -0
  41. package/dist/metrics/utils.js +73 -0
  42. package/dist/metrics/utils.js.map +1 -0
  43. package/dist/multimodal/multimodal_agent.cjs +7 -13
  44. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  45. package/dist/multimodal/multimodal_agent.d.ts +1 -4
  46. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  47. package/dist/multimodal/multimodal_agent.js +7 -13
  48. package/dist/multimodal/multimodal_agent.js.map +1 -1
  49. package/dist/pipeline/index.cjs +2 -0
  50. package/dist/pipeline/index.cjs.map +1 -1
  51. package/dist/pipeline/index.d.ts +1 -1
  52. package/dist/pipeline/index.d.ts.map +1 -1
  53. package/dist/pipeline/index.js +3 -1
  54. package/dist/pipeline/index.js.map +1 -1
  55. package/dist/pipeline/pipeline_agent.cjs +166 -66
  56. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  57. package/dist/pipeline/pipeline_agent.d.ts +10 -4
  58. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  59. package/dist/pipeline/pipeline_agent.js +169 -69
  60. package/dist/pipeline/pipeline_agent.js.map +1 -1
  61. package/dist/pipeline/speech_handle.cjs +49 -1
  62. package/dist/pipeline/speech_handle.cjs.map +1 -1
  63. package/dist/pipeline/speech_handle.d.ts +12 -2
  64. package/dist/pipeline/speech_handle.d.ts.map +1 -1
  65. package/dist/pipeline/speech_handle.js +50 -2
  66. package/dist/pipeline/speech_handle.js.map +1 -1
  67. package/dist/stt/index.cjs.map +1 -1
  68. package/dist/stt/index.d.ts +1 -1
  69. package/dist/stt/index.d.ts.map +1 -1
  70. package/dist/stt/index.js.map +1 -1
  71. package/dist/stt/stream_adapter.cjs +15 -5
  72. package/dist/stt/stream_adapter.cjs.map +1 -1
  73. package/dist/stt/stream_adapter.d.ts +4 -1
  74. package/dist/stt/stream_adapter.d.ts.map +1 -1
  75. package/dist/stt/stream_adapter.js +15 -5
  76. package/dist/stt/stream_adapter.js.map +1 -1
  77. package/dist/stt/stt.cjs +46 -2
  78. package/dist/stt/stt.cjs.map +1 -1
  79. package/dist/stt/stt.d.ts +25 -3
  80. package/dist/stt/stt.d.ts.map +1 -1
  81. package/dist/stt/stt.js +46 -2
  82. package/dist/stt/stt.js.map +1 -1
  83. package/dist/tts/index.cjs +4 -2
  84. package/dist/tts/index.cjs.map +1 -1
  85. package/dist/tts/index.d.ts +1 -1
  86. package/dist/tts/index.d.ts.map +1 -1
  87. package/dist/tts/index.js +3 -1
  88. package/dist/tts/index.js.map +1 -1
  89. package/dist/tts/stream_adapter.cjs +14 -3
  90. package/dist/tts/stream_adapter.cjs.map +1 -1
  91. package/dist/tts/stream_adapter.d.ts +3 -0
  92. package/dist/tts/stream_adapter.d.ts.map +1 -1
  93. package/dist/tts/stream_adapter.js +15 -4
  94. package/dist/tts/stream_adapter.js.map +1 -1
  95. package/dist/tts/tts.cjs +109 -6
  96. package/dist/tts/tts.cjs.map +1 -1
  97. package/dist/tts/tts.d.ts +24 -1
  98. package/dist/tts/tts.d.ts.map +1 -1
  99. package/dist/tts/tts.js +107 -5
  100. package/dist/tts/tts.js.map +1 -1
  101. package/dist/vad.cjs +43 -2
  102. package/dist/vad.cjs.map +1 -1
  103. package/dist/vad.d.ts +21 -4
  104. package/dist/vad.d.ts.map +1 -1
  105. package/dist/vad.js +43 -2
  106. package/dist/vad.js.map +1 -1
  107. package/package.json +1 -1
  108. package/src/index.ts +2 -1
  109. package/src/llm/index.ts +2 -0
  110. package/src/llm/llm.ts +55 -3
  111. package/src/metrics/base.ts +127 -0
  112. package/src/metrics/index.ts +20 -0
  113. package/src/metrics/usage_collector.ts +40 -0
  114. package/src/metrics/utils.ts +100 -0
  115. package/src/multimodal/multimodal_agent.ts +12 -17
  116. package/src/pipeline/index.ts +1 -1
  117. package/src/pipeline/pipeline_agent.ts +206 -87
  118. package/src/pipeline/speech_handle.ts +67 -2
  119. package/src/stt/index.ts +2 -0
  120. package/src/stt/stream_adapter.ts +17 -5
  121. package/src/stt/stt.ts +67 -3
  122. package/src/tts/index.ts +2 -0
  123. package/src/tts/stream_adapter.ts +17 -4
  124. package/src/tts/tts.ts +127 -4
  125. package/src/vad.ts +61 -4
package/src/stt/stt.ts CHANGED
@@ -2,6 +2,9 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
+ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
+ import { EventEmitter } from 'node:events';
7
+ import type { STTMetrics } from '../metrics/base.js';
5
8
  import type { AudioBuffer } from '../utils.js';
6
9
  import { AsyncIterableQueue } from '../utils.js';
7
10
 
@@ -27,6 +30,9 @@ export enum SpeechEventType {
27
30
  * The first alternative is a combination of all the previous FINAL_TRANSCRIPT events.
28
31
  */
29
32
  END_OF_SPEECH = 3,
33
+ /** Usage event, emitted periodically to indicate usage metrics. */
34
+ RECOGNITION_USAGE = 4,
35
+ METRICS_COLLECTED = 5,
30
36
  }
31
37
 
32
38
  /** SpeechData contains metadata about this {@link SpeechEvent}. */
@@ -38,10 +44,16 @@ export interface SpeechData {
38
44
  confidence: number;
39
45
  }
40
46
 
47
+ export interface RecognitionUsage {
48
+ audioDuration: number;
49
+ }
50
+
41
51
  /** SpeechEvent is a packet of speech-to-text data. */
42
52
  export interface SpeechEvent {
43
53
  type: SpeechEventType;
44
54
  alternatives?: [SpeechData, ...SpeechData[]];
55
+ requestId?: string;
56
+ recognitionUsage?: RecognitionUsage;
45
57
  }
46
58
 
47
59
  /**
@@ -55,6 +67,10 @@ export interface STTCapabilities {
55
67
  interimResults: boolean;
56
68
  }
57
69
 
70
+ export type STTCallbacks = {
71
+ [SpeechEventType.METRICS_COLLECTED]: (metrics: STTMetrics) => void;
72
+ };
73
+
58
74
  /**
59
75
  * An instance of a speech-to-text adapter.
60
76
  *
@@ -62,10 +78,12 @@ export interface STTCapabilities {
62
78
  * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
63
79
  * exports its own child STT class, which inherits this class's methods.
64
80
  */
65
- export abstract class STT {
81
+ export abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCallbacks>) {
82
+ abstract label: string;
66
83
  #capabilities: STTCapabilities;
67
84
 
68
85
  constructor(capabilities: STTCapabilities) {
86
+ super();
69
87
  this.#capabilities = capabilities;
70
88
  }
71
89
 
@@ -75,7 +93,24 @@ export abstract class STT {
75
93
  }
76
94
 
77
95
  /** Receives an audio buffer and returns transcription in the form of a {@link SpeechEvent} */
78
- abstract recognize(frame: AudioBuffer): Promise<SpeechEvent>;
96
+ async recognize(frame: AudioBuffer): Promise<SpeechEvent> {
97
+ const startTime = process.hrtime.bigint();
98
+ const event = await this._recognize(frame);
99
+ const duration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));
100
+ this.emit(SpeechEventType.METRICS_COLLECTED, {
101
+ requestId: event.requestId ?? '',
102
+ timestamp: Date.now(),
103
+ duration,
104
+ label: this.label,
105
+ audioDuration: Array.isArray(frame)
106
+ ? frame.reduce((sum, a) => sum + a.samplesPerChannel / a.sampleRate, 0)
107
+ : frame.samplesPerChannel / frame.sampleRate,
108
+ streamed: false,
109
+ });
110
+ return event;
111
+ }
112
+
113
+ protected abstract _recognize(frame: AudioBuffer): Promise<SpeechEvent>;
79
114
 
80
115
  /**
81
116
  * Returns a {@link SpeechStream} that can be used to push audio frames and receive
@@ -103,8 +138,36 @@ export abstract class STT {
103
138
  export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent> {
104
139
  protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
105
140
  protected input = new AsyncIterableQueue<AudioFrame | typeof SpeechStream.FLUSH_SENTINEL>();
141
+ protected output = new AsyncIterableQueue<SpeechEvent>();
106
142
  protected queue = new AsyncIterableQueue<SpeechEvent>();
143
+ abstract label: string;
107
144
  protected closed = false;
145
+ #stt: STT;
146
+
147
+ constructor(stt: STT) {
148
+ this.#stt = stt;
149
+ this.monitorMetrics();
150
+ }
151
+
152
+ protected async monitorMetrics() {
153
+ const startTime = process.hrtime.bigint();
154
+
155
+ for await (const event of this.queue) {
156
+ this.output.put(event);
157
+ if (event.type !== SpeechEventType.RECOGNITION_USAGE) continue;
158
+ const duration = process.hrtime.bigint() - startTime;
159
+ const metrics: STTMetrics = {
160
+ timestamp: Date.now(),
161
+ requestId: event.requestId!,
162
+ duration: Math.trunc(Number(duration / BigInt(1000000))),
163
+ label: this.label,
164
+ audioDuration: event.recognitionUsage!.audioDuration,
165
+ streamed: true,
166
+ };
167
+ this.#stt.emit(SpeechEventType.METRICS_COLLECTED, metrics);
168
+ }
169
+ this.output.close();
170
+ }
108
171
 
109
172
  /** Push an audio frame to the STT */
110
173
  pushFrame(frame: AudioFrame) {
@@ -140,13 +203,14 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
140
203
  }
141
204
 
142
205
  next(): Promise<IteratorResult<SpeechEvent>> {
143
- return this.queue.next();
206
+ return this.output.next();
144
207
  }
145
208
 
146
209
  /** Close both the input and output of the STT stream */
147
210
  close() {
148
211
  this.input.close();
149
212
  this.queue.close();
213
+ this.output.close();
150
214
  this.closed = true;
151
215
  }
152
216
 
package/src/tts/index.ts CHANGED
@@ -4,7 +4,9 @@
4
4
  export {
5
5
  type SynthesizedAudio,
6
6
  type TTSCapabilities,
7
+ type TTSCallbacks,
7
8
  TTS,
9
+ TTSEvent,
8
10
  SynthesizeStream,
9
11
  ChunkedStream,
10
12
  } from './tts.js';
@@ -3,16 +3,23 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { SentenceStream, SentenceTokenizer } from '../tokenize/index.js';
5
5
  import type { ChunkedStream } from './tts.js';
6
- import { SynthesizeStream, TTS } from './tts.js';
6
+ import { SynthesizeStream, TTS, TTSEvent } from './tts.js';
7
7
 
8
8
  export class StreamAdapter extends TTS {
9
9
  #tts: TTS;
10
10
  #sentenceTokenizer: SentenceTokenizer;
11
+ label: string;
11
12
 
12
13
  constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {
13
14
  super(tts.sampleRate, tts.numChannels, { streaming: true });
14
15
  this.#tts = tts;
15
16
  this.#sentenceTokenizer = sentenceTokenizer;
17
+ this.label = this.#tts.label;
18
+ this.label = `tts.StreamAdapter<${this.#tts.label}>`;
19
+
20
+ this.#tts.on(TTSEvent.METRICS_COLLECTED, (metrics) => {
21
+ this.emit(TTSEvent.METRICS_COLLECTED, metrics);
22
+ });
16
23
  }
17
24
 
18
25
  synthesize(text: string): ChunkedStream {
@@ -27,15 +34,21 @@ export class StreamAdapter extends TTS {
27
34
  export class StreamAdapterWrapper extends SynthesizeStream {
28
35
  #tts: TTS;
29
36
  #sentenceStream: SentenceStream;
37
+ label: string;
30
38
 
31
39
  constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {
32
- super();
40
+ super(tts);
33
41
  this.#tts = tts;
34
42
  this.#sentenceStream = sentenceTokenizer.stream();
43
+ this.label = `tts.StreamAdapterWrapper<${this.#tts.label}>`;
35
44
 
36
45
  this.#run();
37
46
  }
38
47
 
48
+ async monitorMetrics() {
49
+ return; // do nothing
50
+ }
51
+
39
52
  async #run() {
40
53
  const forwardInput = async () => {
41
54
  for await (const input of this.input) {
@@ -52,10 +65,10 @@ export class StreamAdapterWrapper extends SynthesizeStream {
52
65
  const synthesize = async () => {
53
66
  for await (const ev of this.#sentenceStream) {
54
67
  for await (const audio of this.#tts.synthesize(ev.token)) {
55
- this.queue.put(audio);
68
+ this.output.put(audio);
56
69
  }
57
70
  }
58
- this.queue.put(SynthesizeStream.END_OF_STREAM);
71
+ this.output.put(SynthesizeStream.END_OF_STREAM);
59
72
  };
60
73
 
61
74
  Promise.all([forwardInput(), synthesize()]);
package/src/tts/tts.ts CHANGED
@@ -2,6 +2,9 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
+ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
+ import { EventEmitter } from 'node:events';
7
+ import type { TTSMetrics } from '../metrics/base.js';
5
8
  import { AsyncIterableQueue, mergeFrames } from '../utils.js';
6
9
 
7
10
  /** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */
@@ -14,6 +17,8 @@ export interface SynthesizedAudio {
14
17
  frame: AudioFrame;
15
18
  /** Current segment of the synthesized audio */
16
19
  deltaText?: string;
20
+ /** Whether this is the last frame of the segment (streaming only) */
21
+ final: boolean;
17
22
  }
18
23
 
19
24
  /**
@@ -27,6 +32,14 @@ export interface TTSCapabilities {
27
32
  streaming: boolean;
28
33
  }
29
34
 
35
+ export enum TTSEvent {
36
+ METRICS_COLLECTED,
37
+ }
38
+
39
+ export type TTSCallbacks = {
40
+ [TTSEvent.METRICS_COLLECTED]: (metrics: TTSMetrics) => void;
41
+ };
42
+
30
43
  /**
31
44
  * An instance of a text-to-speech adapter.
32
45
  *
@@ -34,12 +47,14 @@ export interface TTSCapabilities {
34
47
  * This class is abstract, and as such cannot be used directly. Instead, use a provider plugin that
35
48
  * exports its own child TTS class, which inherits this class's methods.
36
49
  */
37
- export abstract class TTS {
50
+ export abstract class TTS extends (EventEmitter as new () => TypedEmitter<TTSCallbacks>) {
38
51
  #capabilities: TTSCapabilities;
39
52
  #sampleRate: number;
40
53
  #numChannels: number;
54
+ abstract label: string;
41
55
 
42
56
  constructor(sampleRate: number, numChannels: number, capabilities: TTSCapabilities) {
57
+ super();
43
58
  this.#capabilities = capabilities;
44
59
  this.#sampleRate = sampleRate;
45
60
  this.#numChannels = numChannels;
@@ -94,10 +109,71 @@ export abstract class SynthesizeStream
94
109
  protected queue = new AsyncIterableQueue<
95
110
  SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM
96
111
  >();
112
+ protected output = new AsyncIterableQueue<
113
+ SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM
114
+ >();
97
115
  protected closed = false;
116
+ abstract label: string;
117
+ #tts: TTS;
118
+ #metricsPendingTexts: string[] = [];
119
+ #metricsText = '';
120
+ #monitorMetricsTask?: Promise<void>;
121
+
122
+ constructor(tts: TTS) {
123
+ this.#tts = tts;
124
+ }
125
+
126
+ protected async monitorMetrics() {
127
+ const startTime = process.hrtime.bigint();
128
+ let audioDuration = 0;
129
+ let ttfb: bigint | undefined;
130
+ let requestId = '';
131
+
132
+ const emit = () => {
133
+ if (this.#metricsPendingTexts.length) {
134
+ const text = this.#metricsPendingTexts.shift()!;
135
+ const duration = process.hrtime.bigint() - startTime;
136
+ const metrics: TTSMetrics = {
137
+ timestamp: Date.now(),
138
+ requestId,
139
+ ttfb: Math.trunc(Number(ttfb! / BigInt(1000000))),
140
+ duration: Math.trunc(Number(duration / BigInt(1000000))),
141
+ charactersCount: text.length,
142
+ audioDuration,
143
+ cancelled: false, // XXX(nbsp)
144
+ label: this.label,
145
+ streamed: false,
146
+ };
147
+ this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);
148
+ }
149
+ };
150
+
151
+ for await (const audio of this.queue) {
152
+ this.output.put(audio);
153
+ if (audio === SynthesizeStream.END_OF_STREAM) continue;
154
+ requestId = audio.requestId;
155
+ if (!ttfb) {
156
+ ttfb = process.hrtime.bigint() - startTime;
157
+ }
158
+ audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
159
+ if (audio.final) {
160
+ emit();
161
+ }
162
+ }
163
+
164
+ if (requestId) {
165
+ emit();
166
+ }
167
+ this.output.close();
168
+ }
98
169
 
99
170
  /** Push a string of text to the TTS */
100
171
  pushText(text: string) {
172
+ if (!this.#monitorMetricsTask) {
173
+ this.#monitorMetricsTask = this.monitorMetrics();
174
+ }
175
+ this.#metricsText += text;
176
+
101
177
  if (this.input.closed) {
102
178
  throw new Error('Input is closed');
103
179
  }
@@ -109,6 +185,10 @@ export abstract class SynthesizeStream
109
185
 
110
186
  /** Flush the TTS, causing it to process all pending text */
111
187
  flush() {
188
+ if (this.#metricsText) {
189
+ this.#metricsPendingTexts.push(this.#metricsText);
190
+ this.#metricsText = '';
191
+ }
112
192
  if (this.input.closed) {
113
193
  throw new Error('Input is closed');
114
194
  }
@@ -130,13 +210,13 @@ export abstract class SynthesizeStream
130
210
  }
131
211
 
132
212
  next(): Promise<IteratorResult<SynthesizedAudio | typeof SynthesizeStream.END_OF_STREAM>> {
133
- return this.queue.next();
213
+ return this.output.next();
134
214
  }
135
215
 
136
216
  /** Close both the input and output of the TTS stream */
137
217
  close() {
138
218
  this.input.close();
139
- this.queue.close();
219
+ this.output.close();
140
220
  this.closed = true;
141
221
  }
142
222
 
@@ -161,7 +241,49 @@ export abstract class SynthesizeStream
161
241
  */
162
242
  export abstract class ChunkedStream implements AsyncIterableIterator<SynthesizedAudio> {
163
243
  protected queue = new AsyncIterableQueue<SynthesizedAudio>();
244
+ protected output = new AsyncIterableQueue<SynthesizedAudio>();
164
245
  protected closed = false;
246
+ abstract label: string;
247
+ #text: string;
248
+ #tts: TTS;
249
+
250
+ constructor(text: string, tts: TTS) {
251
+ this.#text = text;
252
+ this.#tts = tts;
253
+
254
+ this.monitorMetrics();
255
+ }
256
+
257
+ protected async monitorMetrics() {
258
+ const startTime = process.hrtime.bigint();
259
+ let audioDuration = 0;
260
+ let ttfb: bigint | undefined;
261
+ let requestId = '';
262
+
263
+ for await (const audio of this.queue) {
264
+ this.output.put(audio);
265
+ requestId = audio.requestId;
266
+ if (!ttfb) {
267
+ ttfb = process.hrtime.bigint() - startTime;
268
+ }
269
+ audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
270
+ }
271
+ this.output.close();
272
+
273
+ const duration = process.hrtime.bigint() - startTime;
274
+ const metrics: TTSMetrics = {
275
+ timestamp: Date.now(),
276
+ requestId,
277
+ ttfb: Math.trunc(Number(ttfb! / BigInt(1000000))),
278
+ duration: Math.trunc(Number(duration / BigInt(1000000))),
279
+ charactersCount: this.#text.length,
280
+ audioDuration,
281
+ cancelled: false, // XXX(nbsp)
282
+ label: this.label,
283
+ streamed: false,
284
+ };
285
+ this.#tts.emit(TTSEvent.METRICS_COLLECTED, metrics);
286
+ }
165
287
 
166
288
  /** Collect every frame into one in a single call */
167
289
  async collect(): Promise<AudioFrame> {
@@ -173,12 +295,13 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
173
295
  }
174
296
 
175
297
  next(): Promise<IteratorResult<SynthesizedAudio>> {
176
- return this.queue.next();
298
+ return this.output.next();
177
299
  }
178
300
 
179
301
  /** Close both the input and output of the TTS stream */
180
302
  close() {
181
303
  this.queue.close();
304
+ this.output.close();
182
305
  this.closed = true;
183
306
  }
184
307
 
package/src/vad.ts CHANGED
@@ -2,12 +2,16 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
+ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
+ import { EventEmitter } from 'node:events';
7
+ import type { VADMetrics } from './metrics/base.js';
5
8
  import { AsyncIterableQueue } from './utils.js';
6
9
 
7
10
  export enum VADEventType {
8
11
  START_OF_SPEECH,
9
12
  INFERENCE_DONE,
10
13
  END_OF_SPEECH,
14
+ METRICS_COLLECTED,
11
15
  }
12
16
 
13
17
  export interface VADEvent {
@@ -19,9 +23,9 @@ export interface VADEvent {
19
23
  samplesIndex: number;
20
24
  /** Timestamp when the event was fired. */
21
25
  timestamp: number;
22
- /** Duration of the detected speech segment in seconds. */
26
+ /** Duration of the speech segment. */
23
27
  speechDuration: number;
24
- /** Duration of the silence segment preceding or following the speech, in seconds. */
28
+ /** Duration of the silence segment. */
25
29
  silenceDuration: number;
26
30
  /**
27
31
  * List of audio frames associated with the speech.
@@ -38,15 +42,26 @@ export interface VADEvent {
38
42
  inferenceDuration: number;
39
43
  /** Indicates whether speech was detected in the frames. */
40
44
  speaking: boolean;
45
+ /** Threshold used to detect silence. */
46
+ rawAccumulatedSilence: number;
47
+ /** Threshold used to detect speech. */
48
+ rawAccumulatedSpeech: number;
41
49
  }
42
50
 
43
51
  export interface VADCapabilities {
44
52
  updateInterval: number;
45
53
  }
46
54
 
47
- export abstract class VAD {
55
+ export type VADCallbacks = {
56
+ [VADEventType.METRICS_COLLECTED]: (metrics: VADMetrics) => void;
57
+ };
58
+
59
+ export abstract class VAD extends (EventEmitter as new () => TypedEmitter<VADCallbacks>) {
48
60
  #capabilities: VADCapabilities;
61
+ abstract label: string;
62
+
49
63
  constructor(capabilities: VADCapabilities) {
64
+ super();
50
65
  this.#capabilities = capabilities;
51
66
  }
52
67
 
@@ -64,7 +79,48 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
64
79
  protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');
65
80
  protected input = new AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();
66
81
  protected queue = new AsyncIterableQueue<VADEvent>();
82
+ protected output = new AsyncIterableQueue<VADEvent>();
67
83
  protected closed = false;
84
+ #vad: VAD;
85
+ #lastActivityTime = BigInt(0);
86
+
87
+ constructor(vad: VAD) {
88
+ this.#vad = vad;
89
+ this.monitorMetrics();
90
+ }
91
+
92
+ protected async monitorMetrics() {
93
+ let inferenceDurationTotal = 0;
94
+ let inferenceCount = 0;
95
+
96
+ for await (const event of this.queue) {
97
+ this.output.put(event);
98
+ switch (event.type) {
99
+ case VADEventType.START_OF_SPEECH:
100
+ inferenceCount++;
101
+ if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {
102
+ this.#vad.emit(VADEventType.METRICS_COLLECTED, {
103
+ timestamp: Date.now(),
104
+ idleTime: Math.trunc(
105
+ Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),
106
+ ),
107
+ inferenceDurationTotal,
108
+ inferenceCount,
109
+ label: this.#vad.label,
110
+ });
111
+
112
+ inferenceCount = 0;
113
+ inferenceDurationTotal = 0;
114
+ }
115
+ break;
116
+ case VADEventType.INFERENCE_DONE:
117
+ case VADEventType.END_OF_SPEECH:
118
+ this.#lastActivityTime = process.hrtime.bigint();
119
+ break;
120
+ }
121
+ }
122
+ this.output.close();
123
+ }
68
124
 
69
125
  pushFrame(frame: AudioFrame) {
70
126
  if (this.input.closed) {
@@ -97,12 +153,13 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
97
153
  }
98
154
 
99
155
  next(): Promise<IteratorResult<VADEvent>> {
100
- return this.queue.next();
156
+ return this.output.next();
101
157
  }
102
158
 
103
159
  close() {
104
160
  this.input.close();
105
161
  this.queue.close();
162
+ this.output.close();
106
163
  this.closed = true;
107
164
  }
108
165