@livekit/agents 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. package/dist/index.cjs +3 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.ts +2 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +2 -0
  6. package/dist/index.js.map +1 -1
  7. package/dist/llm/index.cjs +2 -0
  8. package/dist/llm/index.cjs.map +1 -1
  9. package/dist/llm/index.d.ts +1 -1
  10. package/dist/llm/index.d.ts.map +1 -1
  11. package/dist/llm/index.js +2 -0
  12. package/dist/llm/index.js.map +1 -1
  13. package/dist/llm/llm.cjs +47 -3
  14. package/dist/llm/llm.cjs.map +1 -1
  15. package/dist/llm/llm.d.ts +15 -2
  16. package/dist/llm/llm.d.ts.map +1 -1
  17. package/dist/llm/llm.js +46 -3
  18. package/dist/llm/llm.js.map +1 -1
  19. package/dist/metrics/base.cjs +44 -0
  20. package/dist/metrics/base.cjs.map +1 -0
  21. package/dist/metrics/base.d.ts +96 -0
  22. package/dist/metrics/base.d.ts.map +1 -0
  23. package/dist/metrics/base.js +20 -0
  24. package/dist/metrics/base.js.map +1 -0
  25. package/dist/metrics/index.cjs +35 -0
  26. package/dist/metrics/index.cjs.map +1 -0
  27. package/dist/metrics/index.d.ts +5 -0
  28. package/dist/metrics/index.d.ts.map +1 -0
  29. package/dist/metrics/index.js +9 -0
  30. package/dist/metrics/index.js.map +1 -0
  31. package/dist/metrics/usage_collector.cjs +53 -0
  32. package/dist/metrics/usage_collector.cjs.map +1 -0
  33. package/dist/metrics/usage_collector.d.ts +14 -0
  34. package/dist/metrics/usage_collector.d.ts.map +1 -0
  35. package/dist/metrics/usage_collector.js +29 -0
  36. package/dist/metrics/usage_collector.js.map +1 -0
  37. package/dist/metrics/utils.cjs +104 -0
  38. package/dist/metrics/utils.cjs.map +1 -0
  39. package/dist/metrics/utils.d.ts +10 -0
  40. package/dist/metrics/utils.d.ts.map +1 -0
  41. package/dist/metrics/utils.js +73 -0
  42. package/dist/metrics/utils.js.map +1 -0
  43. package/dist/multimodal/multimodal_agent.cjs +7 -13
  44. package/dist/multimodal/multimodal_agent.cjs.map +1 -1
  45. package/dist/multimodal/multimodal_agent.d.ts +1 -4
  46. package/dist/multimodal/multimodal_agent.d.ts.map +1 -1
  47. package/dist/multimodal/multimodal_agent.js +7 -13
  48. package/dist/multimodal/multimodal_agent.js.map +1 -1
  49. package/dist/pipeline/agent_output.cjs +9 -2
  50. package/dist/pipeline/agent_output.cjs.map +1 -1
  51. package/dist/pipeline/agent_output.d.ts +1 -0
  52. package/dist/pipeline/agent_output.d.ts.map +1 -1
  53. package/dist/pipeline/agent_output.js +9 -2
  54. package/dist/pipeline/agent_output.js.map +1 -1
  55. package/dist/pipeline/index.cjs +2 -0
  56. package/dist/pipeline/index.cjs.map +1 -1
  57. package/dist/pipeline/index.d.ts +1 -1
  58. package/dist/pipeline/index.d.ts.map +1 -1
  59. package/dist/pipeline/index.js +3 -1
  60. package/dist/pipeline/index.js.map +1 -1
  61. package/dist/pipeline/pipeline_agent.cjs +168 -70
  62. package/dist/pipeline/pipeline_agent.cjs.map +1 -1
  63. package/dist/pipeline/pipeline_agent.d.ts +10 -4
  64. package/dist/pipeline/pipeline_agent.d.ts.map +1 -1
  65. package/dist/pipeline/pipeline_agent.js +171 -73
  66. package/dist/pipeline/pipeline_agent.js.map +1 -1
  67. package/dist/pipeline/speech_handle.cjs +49 -1
  68. package/dist/pipeline/speech_handle.cjs.map +1 -1
  69. package/dist/pipeline/speech_handle.d.ts +12 -2
  70. package/dist/pipeline/speech_handle.d.ts.map +1 -1
  71. package/dist/pipeline/speech_handle.js +50 -2
  72. package/dist/pipeline/speech_handle.js.map +1 -1
  73. package/dist/stt/index.cjs.map +1 -1
  74. package/dist/stt/index.d.ts +1 -1
  75. package/dist/stt/index.d.ts.map +1 -1
  76. package/dist/stt/index.js.map +1 -1
  77. package/dist/stt/stream_adapter.cjs +15 -5
  78. package/dist/stt/stream_adapter.cjs.map +1 -1
  79. package/dist/stt/stream_adapter.d.ts +4 -1
  80. package/dist/stt/stream_adapter.d.ts.map +1 -1
  81. package/dist/stt/stream_adapter.js +15 -5
  82. package/dist/stt/stream_adapter.js.map +1 -1
  83. package/dist/stt/stt.cjs +46 -2
  84. package/dist/stt/stt.cjs.map +1 -1
  85. package/dist/stt/stt.d.ts +25 -3
  86. package/dist/stt/stt.d.ts.map +1 -1
  87. package/dist/stt/stt.js +46 -2
  88. package/dist/stt/stt.js.map +1 -1
  89. package/dist/tts/index.cjs +4 -2
  90. package/dist/tts/index.cjs.map +1 -1
  91. package/dist/tts/index.d.ts +1 -1
  92. package/dist/tts/index.d.ts.map +1 -1
  93. package/dist/tts/index.js +3 -1
  94. package/dist/tts/index.js.map +1 -1
  95. package/dist/tts/stream_adapter.cjs +14 -3
  96. package/dist/tts/stream_adapter.cjs.map +1 -1
  97. package/dist/tts/stream_adapter.d.ts +3 -0
  98. package/dist/tts/stream_adapter.d.ts.map +1 -1
  99. package/dist/tts/stream_adapter.js +15 -4
  100. package/dist/tts/stream_adapter.js.map +1 -1
  101. package/dist/tts/tts.cjs +109 -6
  102. package/dist/tts/tts.cjs.map +1 -1
  103. package/dist/tts/tts.d.ts +24 -1
  104. package/dist/tts/tts.d.ts.map +1 -1
  105. package/dist/tts/tts.js +107 -5
  106. package/dist/tts/tts.js.map +1 -1
  107. package/dist/vad.cjs +43 -2
  108. package/dist/vad.cjs.map +1 -1
  109. package/dist/vad.d.ts +21 -4
  110. package/dist/vad.d.ts.map +1 -1
  111. package/dist/vad.js +43 -2
  112. package/dist/vad.js.map +1 -1
  113. package/package.json +1 -1
  114. package/src/index.ts +2 -1
  115. package/src/llm/index.ts +2 -0
  116. package/src/llm/llm.ts +55 -3
  117. package/src/metrics/base.ts +127 -0
  118. package/src/metrics/index.ts +20 -0
  119. package/src/metrics/usage_collector.ts +40 -0
  120. package/src/metrics/utils.ts +100 -0
  121. package/src/multimodal/multimodal_agent.ts +12 -17
  122. package/src/pipeline/agent_output.ts +14 -7
  123. package/src/pipeline/index.ts +1 -1
  124. package/src/pipeline/pipeline_agent.ts +210 -95
  125. package/src/pipeline/speech_handle.ts +67 -2
  126. package/src/stt/index.ts +2 -0
  127. package/src/stt/stream_adapter.ts +17 -5
  128. package/src/stt/stt.ts +67 -3
  129. package/src/tts/index.ts +2 -0
  130. package/src/tts/stream_adapter.ts +17 -4
  131. package/src/tts/tts.ts +127 -4
  132. package/src/vad.ts +61 -4
package/dist/vad.cjs CHANGED
@@ -23,16 +23,19 @@ __export(vad_exports, {
23
23
  VADStream: () => VADStream
24
24
  });
25
25
  module.exports = __toCommonJS(vad_exports);
26
+ var import_node_events = require("node:events");
26
27
  var import_utils = require("./utils.cjs");
27
28
  var VADEventType = /* @__PURE__ */ ((VADEventType2) => {
28
29
  VADEventType2[VADEventType2["START_OF_SPEECH"] = 0] = "START_OF_SPEECH";
29
30
  VADEventType2[VADEventType2["INFERENCE_DONE"] = 1] = "INFERENCE_DONE";
30
31
  VADEventType2[VADEventType2["END_OF_SPEECH"] = 2] = "END_OF_SPEECH";
32
+ VADEventType2[VADEventType2["METRICS_COLLECTED"] = 3] = "METRICS_COLLECTED";
31
33
  return VADEventType2;
32
34
  })(VADEventType || {});
33
- class VAD {
35
+ class VAD extends import_node_events.EventEmitter {
34
36
  #capabilities;
35
37
  constructor(capabilities) {
38
+ super();
36
39
  this.#capabilities = capabilities;
37
40
  }
38
41
  get capabilities() {
@@ -43,7 +46,44 @@ class VADStream {
43
46
  static FLUSH_SENTINEL = Symbol("FLUSH_SENTINEL");
44
47
  input = new import_utils.AsyncIterableQueue();
45
48
  queue = new import_utils.AsyncIterableQueue();
49
+ output = new import_utils.AsyncIterableQueue();
46
50
  closed = false;
51
+ #vad;
52
+ #lastActivityTime = BigInt(0);
53
+ constructor(vad) {
54
+ this.#vad = vad;
55
+ this.monitorMetrics();
56
+ }
57
+ async monitorMetrics() {
58
+ let inferenceDurationTotal = 0;
59
+ let inferenceCount = 0;
60
+ for await (const event of this.queue) {
61
+ this.output.put(event);
62
+ switch (event.type) {
63
+ case 0 /* START_OF_SPEECH */:
64
+ inferenceCount++;
65
+ if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {
66
+ this.#vad.emit(3 /* METRICS_COLLECTED */, {
67
+ timestamp: Date.now(),
68
+ idleTime: Math.trunc(
69
+ Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1e6))
70
+ ),
71
+ inferenceDurationTotal,
72
+ inferenceCount,
73
+ label: this.#vad.label
74
+ });
75
+ inferenceCount = 0;
76
+ inferenceDurationTotal = 0;
77
+ }
78
+ break;
79
+ case 1 /* INFERENCE_DONE */:
80
+ case 2 /* END_OF_SPEECH */:
81
+ this.#lastActivityTime = process.hrtime.bigint();
82
+ break;
83
+ }
84
+ }
85
+ this.output.close();
86
+ }
47
87
  pushFrame(frame) {
48
88
  if (this.input.closed) {
49
89
  throw new Error("Input is closed");
@@ -72,11 +112,12 @@ class VADStream {
72
112
  this.input.close();
73
113
  }
74
114
  next() {
75
- return this.queue.next();
115
+ return this.output.next();
76
116
  }
77
117
  close() {
78
118
  this.input.close();
79
119
  this.queue.close();
120
+ this.output.close();
80
121
  this.closed = true;
81
122
  }
82
123
  [Symbol.asyncIterator]() {
package/dist/vad.cjs.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { AsyncIterableQueue } from './utils.js';\n\nexport enum VADEventType {\n START_OF_SPEECH,\n INFERENCE_DONE,\n END_OF_SPEECH,\n}\n\nexport interface VADEvent {\n /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */\n type: VADEventType;\n /**\n * Index of the audio sample where the event occurred, relative to the inference sample rate.\n */\n samplesIndex: number;\n /** Timestamp when the event was fired. */\n timestamp: number;\n /** Duration of the detected speech segment in seconds. */\n speechDuration: number;\n /** Duration of the silence segment preceding or following the speech, in seconds. */\n silenceDuration: number;\n /**\n * List of audio frames associated with the speech.\n *\n * @remarks\n * - For `start_of_speech` events, this contains the audio chunks that triggered the detection.\n * - For `inference_done` events, this contains the audio chunks that were processed.\n * - For `end_of_speech` events, this contains the complete user speech.\n */\n frames: AudioFrame[];\n /** Probability that speech is present (only for `INFERENCE_DONE` events). */\n probability: number;\n /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */\n inferenceDuration: number;\n /** Indicates whether speech was detected in the frames. 
*/\n speaking: boolean;\n}\n\nexport interface VADCapabilities {\n updateInterval: number;\n}\n\nexport abstract class VAD {\n #capabilities: VADCapabilities;\n constructor(capabilities: VADCapabilities) {\n this.#capabilities = capabilities;\n }\n\n get capabilities(): VADCapabilities {\n return this.#capabilities;\n }\n\n /**\n * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.\n */\n abstract stream(): VADStream;\n}\n\nexport abstract class VADStream implements AsyncIterableIterator<VADEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<VADEvent>();\n protected closed = false;\n\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(frame);\n }\n\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(VADStream.FLUSH_SENTINEL);\n }\n\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<VADEvent>> {\n return this.queue.next();\n }\n\n close() {\n this.input.close();\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): VADStream {\n return this;\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAIA,mBAAmC;AAE5B,IAAK,eAAL,kBAAKA,kBAAL;AACL,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AAHU,SAAAA;AAAA,GAAA;AAwCL,MAAe,IAAI;AAAA,EACxB;AAAA,EACA,YAAY,cAA+B;AACzC,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAMF;AAEO,MAAe,UAAqD;AAAA,EACzE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,gCAAiE;AAAA,EAC7E,QAAQ,IAAI,gCAA6B;AAAA,EACzC,SAAS;AAAA,EAEnB,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,KAAK;AAAA,EACtB;AAAA,EAEA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,UAAU,cAAc;AAAA,EACzC;AAAA,EAEA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0C;AACxC,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA,EAEA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAe;AAClC,WAAO;AAAA,EACT;AACF;","names":["VADEventType"]}
1
+ {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { VADMetrics } from './metrics/base.js';\nimport { AsyncIterableQueue } from './utils.js';\n\nexport enum VADEventType {\n START_OF_SPEECH,\n INFERENCE_DONE,\n END_OF_SPEECH,\n METRICS_COLLECTED,\n}\n\nexport interface VADEvent {\n /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */\n type: VADEventType;\n /**\n * Index of the audio sample where the event occurred, relative to the inference sample rate.\n */\n samplesIndex: number;\n /** Timestamp when the event was fired. */\n timestamp: number;\n /** Duration of the speech segment. */\n speechDuration: number;\n /** Duration of the silence segment. */\n silenceDuration: number;\n /**\n * List of audio frames associated with the speech.\n *\n * @remarks\n * - For `start_of_speech` events, this contains the audio chunks that triggered the detection.\n * - For `inference_done` events, this contains the audio chunks that were processed.\n * - For `end_of_speech` events, this contains the complete user speech.\n */\n frames: AudioFrame[];\n /** Probability that speech is present (only for `INFERENCE_DONE` events). */\n probability: number;\n /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */\n inferenceDuration: number;\n /** Indicates whether speech was detected in the frames. */\n speaking: boolean;\n /** Threshold used to detect silence. */\n rawAccumulatedSilence: number;\n /** Threshold used to detect speech. 
*/\n rawAccumulatedSpeech: number;\n}\n\nexport interface VADCapabilities {\n updateInterval: number;\n}\n\nexport type VADCallbacks = {\n [VADEventType.METRICS_COLLECTED]: (metrics: VADMetrics) => void;\n};\n\nexport abstract class VAD extends (EventEmitter as new () => TypedEmitter<VADCallbacks>) {\n #capabilities: VADCapabilities;\n abstract label: string;\n\n constructor(capabilities: VADCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n get capabilities(): VADCapabilities {\n return this.#capabilities;\n }\n\n /**\n * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.\n */\n abstract stream(): VADStream;\n}\n\nexport abstract class VADStream implements AsyncIterableIterator<VADEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<VADEvent>();\n protected output = new AsyncIterableQueue<VADEvent>();\n protected closed = false;\n #vad: VAD;\n #lastActivityTime = BigInt(0);\n\n constructor(vad: VAD) {\n this.#vad = vad;\n this.monitorMetrics();\n }\n\n protected async monitorMetrics() {\n let inferenceDurationTotal = 0;\n let inferenceCount = 0;\n\n for await (const event of this.queue) {\n this.output.put(event);\n switch (event.type) {\n case VADEventType.START_OF_SPEECH:\n inferenceCount++;\n if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {\n this.#vad.emit(VADEventType.METRICS_COLLECTED, {\n timestamp: Date.now(),\n idleTime: Math.trunc(\n Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),\n ),\n inferenceDurationTotal,\n inferenceCount,\n label: this.#vad.label,\n });\n\n inferenceCount = 0;\n inferenceDurationTotal = 0;\n }\n break;\n case VADEventType.INFERENCE_DONE:\n case VADEventType.END_OF_SPEECH:\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n }\n }\n this.output.close();\n 
}\n\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(frame);\n }\n\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(VADStream.FLUSH_SENTINEL);\n }\n\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<VADEvent>> {\n return this.output.next();\n }\n\n close() {\n this.input.close();\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): VADStream {\n return this;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAKA,yBAA6B;AAE7B,mBAAmC;AAE5B,IAAK,eAAL,kBAAKA,kBAAL;AACL,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AAJU,SAAAA;AAAA,GAAA;AAiDL,MAAe,YAAa,gCAAsD;AAAA,EACvF;AAAA,EAGA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAMF;AAEO,MAAe,UAAqD;AAAA,EACzE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,gCAAiE;AAAA,EAC7E,QAAQ,IAAI,gCAA6B;AAAA,EACzC,SAAS,IAAI,gCAA6B;AAAA,EAC1C,SAAS;AAAA,EACnB;AAAA,EACA,oBAAoB,OAAO,CAAC;AAAA,EAE5B,YAAY,KAAU;AACpB,SAAK,OAAO;AACZ,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,QAAI,yBAAyB;AAC7B,QAAI,iBAAiB;AAErB,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,cAAQ,MAAM,MAAM;AAAA,QAClB,KAAK;AACH;AACA,cAAI,kBAAkB,IAAI,KAAK,KAAK,aAAa,gBAAgB;AAC/D,iBAAK,KAAK,KAAK,2BAAgC;AAAA,cAC7C,WAAW,KAAK,IAAI;AAAA,cACpB,UAAU,KAAK;AAAA,gBACb,QAAQ,QAAQ,OAAO,OAAO,IAAI,KAAK,qBAAqB,OAAO,GAAO,CAAC;AAAA,cAC7E;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,KAAK,KAAK;AAAA,YACnB,CAAC;AAED,6BAAiB;AACjB,qCAAyB;AAAA,UAC3B;AACA;AAAA,QACF,KAAK;AAAA,QACL,KAAK;AACH,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,MACJ;AAAA,IACF;AACA,SAAK,OAAO,MAAM;AAAA,EAC
pB;AAAA,EAEA,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,KAAK;AAAA,EACtB;AAAA,EAEA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,UAAU,cAAc;AAAA,EACzC;AAAA,EAEA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0C;AACxC,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA,EAEA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAe;AAClC,WAAO;AAAA,EACT;AACF;","names":["VADEventType"]}
package/dist/vad.d.ts CHANGED
@@ -1,9 +1,12 @@
1
1
  import type { AudioFrame } from '@livekit/rtc-node';
2
+ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
3
+ import type { VADMetrics } from './metrics/base.js';
2
4
  import { AsyncIterableQueue } from './utils.js';
3
5
  export declare enum VADEventType {
4
6
  START_OF_SPEECH = 0,
5
7
  INFERENCE_DONE = 1,
6
- END_OF_SPEECH = 2
8
+ END_OF_SPEECH = 2,
9
+ METRICS_COLLECTED = 3
7
10
  }
8
11
  export interface VADEvent {
9
12
  /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */
@@ -14,9 +17,9 @@ export interface VADEvent {
14
17
  samplesIndex: number;
15
18
  /** Timestamp when the event was fired. */
16
19
  timestamp: number;
17
- /** Duration of the detected speech segment in seconds. */
20
+ /** Duration of the speech segment. */
18
21
  speechDuration: number;
19
- /** Duration of the silence segment preceding or following the speech, in seconds. */
22
+ /** Duration of the silence segment. */
20
23
  silenceDuration: number;
21
24
  /**
22
25
  * List of audio frames associated with the speech.
@@ -33,12 +36,21 @@ export interface VADEvent {
33
36
  inferenceDuration: number;
34
37
  /** Indicates whether speech was detected in the frames. */
35
38
  speaking: boolean;
39
+ /** Threshold used to detect silence. */
40
+ rawAccumulatedSilence: number;
41
+ /** Threshold used to detect speech. */
42
+ rawAccumulatedSpeech: number;
36
43
  }
37
44
  export interface VADCapabilities {
38
45
  updateInterval: number;
39
46
  }
40
- export declare abstract class VAD {
47
+ export type VADCallbacks = {
48
+ [VADEventType.METRICS_COLLECTED]: (metrics: VADMetrics) => void;
49
+ };
50
+ declare const VAD_base: new () => TypedEmitter<VADCallbacks>;
51
+ export declare abstract class VAD extends VAD_base {
41
52
  #private;
53
+ abstract label: string;
42
54
  constructor(capabilities: VADCapabilities);
43
55
  get capabilities(): VADCapabilities;
44
56
  /**
@@ -47,10 +59,14 @@ export declare abstract class VAD {
47
59
  abstract stream(): VADStream;
48
60
  }
49
61
  export declare abstract class VADStream implements AsyncIterableIterator<VADEvent> {
62
+ #private;
50
63
  protected static readonly FLUSH_SENTINEL: unique symbol;
51
64
  protected input: AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>;
52
65
  protected queue: AsyncIterableQueue<VADEvent>;
66
+ protected output: AsyncIterableQueue<VADEvent>;
53
67
  protected closed: boolean;
68
+ constructor(vad: VAD);
69
+ protected monitorMetrics(): Promise<void>;
54
70
  pushFrame(frame: AudioFrame): void;
55
71
  flush(): void;
56
72
  endInput(): void;
@@ -58,4 +74,5 @@ export declare abstract class VADStream implements AsyncIterableIterator<VADEven
58
74
  close(): void;
59
75
  [Symbol.asyncIterator](): VADStream;
60
76
  }
77
+ export {};
61
78
  //# sourceMappingURL=vad.d.ts.map
package/dist/vad.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAEhD,oBAAY,YAAY;IACtB,eAAe,IAAA;IACf,cAAc,IAAA;IACd,aAAa,IAAA;CACd;AAED,MAAM,WAAW,QAAQ;IACvB,oFAAoF;IACpF,IAAI,EAAE,YAAY,CAAC;IACnB;;OAEG;IACH,YAAY,EAAE,MAAM,CAAC;IACrB,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;IAClB,0DAA0D;IAC1D,cAAc,EAAE,MAAM,CAAC;IACvB,qFAAqF;IACrF,eAAe,EAAE,MAAM,CAAC;IACxB;;;;;;;OAOG;IACH,MAAM,EAAE,UAAU,EAAE,CAAC;IACrB,6EAA6E;IAC7E,WAAW,EAAE,MAAM,CAAC;IACpB,0FAA0F;IAC1F,iBAAiB,EAAE,MAAM,CAAC;IAC1B,2DAA2D;IAC3D,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,eAAe;IAC9B,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,8BAAsB,GAAG;;gBAEX,YAAY,EAAE,eAAe;IAIzC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED;;OAEG;IACH,QAAQ,CAAC,MAAM,IAAI,SAAS;CAC7B;AAED,8BAAsB,SAAU,YAAW,qBAAqB,CAAC,QAAQ,CAAC;IACxE,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,SAAS,CAAC,KAAK,mEAA0E;IACzF,SAAS,CAAC,KAAK,+BAAsC;IACrD,SAAS,CAAC,MAAM,UAAS;IAEzB,SAAS,CAAC,KAAK,EAAE,UAAU;IAU3B,KAAK;IAUL,QAAQ;IAUR,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC;IAIzC,KAAK;IAML,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,SAAS;CAGpC"}
1
+ {"version":3,"file":"vad.d.ts","sourceRoot":"","sources":["../src/vad.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,KAAK,EAAE,iBAAiB,IAAI,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAEhF,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAEhD,oBAAY,YAAY;IACtB,eAAe,IAAA;IACf,cAAc,IAAA;IACd,aAAa,IAAA;IACb,iBAAiB,IAAA;CAClB;AAED,MAAM,WAAW,QAAQ;IACvB,oFAAoF;IACpF,IAAI,EAAE,YAAY,CAAC;IACnB;;OAEG;IACH,YAAY,EAAE,MAAM,CAAC;IACrB,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;IAClB,sCAAsC;IACtC,cAAc,EAAE,MAAM,CAAC;IACvB,uCAAuC;IACvC,eAAe,EAAE,MAAM,CAAC;IACxB;;;;;;;OAOG;IACH,MAAM,EAAE,UAAU,EAAE,CAAC;IACrB,6EAA6E;IAC7E,WAAW,EAAE,MAAM,CAAC;IACpB,0FAA0F;IAC1F,iBAAiB,EAAE,MAAM,CAAC;IAC1B,2DAA2D;IAC3D,QAAQ,EAAE,OAAO,CAAC;IAClB,wCAAwC;IACxC,qBAAqB,EAAE,MAAM,CAAC;IAC9B,uCAAuC;IACvC,oBAAoB,EAAE,MAAM,CAAC;CAC9B;AAED,MAAM,WAAW,eAAe;IAC9B,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,MAAM,YAAY,GAAG;IACzB,CAAC,YAAY,CAAC,iBAAiB,CAAC,EAAE,CAAC,OAAO,EAAE,UAAU,KAAK,IAAI,CAAC;CACjE,CAAC;kCAE2D,aAAa,YAAY,CAAC;AAAvF,8BAAsB,GAAI,SAAQ,QAAsD;;IAEtF,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAEX,YAAY,EAAE,eAAe;IAKzC,IAAI,YAAY,IAAI,eAAe,CAElC;IAED;;OAEG;IACH,QAAQ,CAAC,MAAM,IAAI,SAAS;CAC7B;AAED,8BAAsB,SAAU,YAAW,qBAAqB,CAAC,QAAQ,CAAC;;IACxE,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,cAAc,gBAA4B;IACpE,SAAS,CAAC,KAAK,mEAA0E;IACzF,SAAS,CAAC,KAAK,+BAAsC;IACrD,SAAS,CAAC,MAAM,+BAAsC;IACtD,SAAS,CAAC,MAAM,UAAS;gBAIb,GAAG,EAAE,GAAG;cAKJ,cAAc;IAiC9B,SAAS,CAAC,KAAK,EAAE,UAAU;IAU3B,KAAK;IAUL,QAAQ;IAUR,IAAI,IAAI,OAAO,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC;IAIzC,KAAK;IAOL,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,SAAS;CAGpC"}
package/dist/vad.js CHANGED
@@ -1,13 +1,16 @@
1
+ import { EventEmitter } from "node:events";
1
2
  import { AsyncIterableQueue } from "./utils.js";
2
3
  var VADEventType = /* @__PURE__ */ ((VADEventType2) => {
3
4
  VADEventType2[VADEventType2["START_OF_SPEECH"] = 0] = "START_OF_SPEECH";
4
5
  VADEventType2[VADEventType2["INFERENCE_DONE"] = 1] = "INFERENCE_DONE";
5
6
  VADEventType2[VADEventType2["END_OF_SPEECH"] = 2] = "END_OF_SPEECH";
7
+ VADEventType2[VADEventType2["METRICS_COLLECTED"] = 3] = "METRICS_COLLECTED";
6
8
  return VADEventType2;
7
9
  })(VADEventType || {});
8
- class VAD {
10
+ class VAD extends EventEmitter {
9
11
  #capabilities;
10
12
  constructor(capabilities) {
13
+ super();
11
14
  this.#capabilities = capabilities;
12
15
  }
13
16
  get capabilities() {
@@ -18,7 +21,44 @@ class VADStream {
18
21
  static FLUSH_SENTINEL = Symbol("FLUSH_SENTINEL");
19
22
  input = new AsyncIterableQueue();
20
23
  queue = new AsyncIterableQueue();
24
+ output = new AsyncIterableQueue();
21
25
  closed = false;
26
+ #vad;
27
+ #lastActivityTime = BigInt(0);
28
+ constructor(vad) {
29
+ this.#vad = vad;
30
+ this.monitorMetrics();
31
+ }
32
+ async monitorMetrics() {
33
+ let inferenceDurationTotal = 0;
34
+ let inferenceCount = 0;
35
+ for await (const event of this.queue) {
36
+ this.output.put(event);
37
+ switch (event.type) {
38
+ case 0 /* START_OF_SPEECH */:
39
+ inferenceCount++;
40
+ if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {
41
+ this.#vad.emit(3 /* METRICS_COLLECTED */, {
42
+ timestamp: Date.now(),
43
+ idleTime: Math.trunc(
44
+ Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1e6))
45
+ ),
46
+ inferenceDurationTotal,
47
+ inferenceCount,
48
+ label: this.#vad.label
49
+ });
50
+ inferenceCount = 0;
51
+ inferenceDurationTotal = 0;
52
+ }
53
+ break;
54
+ case 1 /* INFERENCE_DONE */:
55
+ case 2 /* END_OF_SPEECH */:
56
+ this.#lastActivityTime = process.hrtime.bigint();
57
+ break;
58
+ }
59
+ }
60
+ this.output.close();
61
+ }
22
62
  pushFrame(frame) {
23
63
  if (this.input.closed) {
24
64
  throw new Error("Input is closed");
@@ -47,11 +87,12 @@ class VADStream {
47
87
  this.input.close();
48
88
  }
49
89
  next() {
50
- return this.queue.next();
90
+ return this.output.next();
51
91
  }
52
92
  close() {
53
93
  this.input.close();
54
94
  this.queue.close();
95
+ this.output.close();
55
96
  this.closed = true;
56
97
  }
57
98
  [Symbol.asyncIterator]() {
package/dist/vad.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport { AsyncIterableQueue } from './utils.js';\n\nexport enum VADEventType {\n START_OF_SPEECH,\n INFERENCE_DONE,\n END_OF_SPEECH,\n}\n\nexport interface VADEvent {\n /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */\n type: VADEventType;\n /**\n * Index of the audio sample where the event occurred, relative to the inference sample rate.\n */\n samplesIndex: number;\n /** Timestamp when the event was fired. */\n timestamp: number;\n /** Duration of the detected speech segment in seconds. */\n speechDuration: number;\n /** Duration of the silence segment preceding or following the speech, in seconds. */\n silenceDuration: number;\n /**\n * List of audio frames associated with the speech.\n *\n * @remarks\n * - For `start_of_speech` events, this contains the audio chunks that triggered the detection.\n * - For `inference_done` events, this contains the audio chunks that were processed.\n * - For `end_of_speech` events, this contains the complete user speech.\n */\n frames: AudioFrame[];\n /** Probability that speech is present (only for `INFERENCE_DONE` events). */\n probability: number;\n /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */\n inferenceDuration: number;\n /** Indicates whether speech was detected in the frames. 
*/\n speaking: boolean;\n}\n\nexport interface VADCapabilities {\n updateInterval: number;\n}\n\nexport abstract class VAD {\n #capabilities: VADCapabilities;\n constructor(capabilities: VADCapabilities) {\n this.#capabilities = capabilities;\n }\n\n get capabilities(): VADCapabilities {\n return this.#capabilities;\n }\n\n /**\n * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.\n */\n abstract stream(): VADStream;\n}\n\nexport abstract class VADStream implements AsyncIterableIterator<VADEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<VADEvent>();\n protected closed = false;\n\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(frame);\n }\n\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(VADStream.FLUSH_SENTINEL);\n }\n\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<VADEvent>> {\n return this.queue.next();\n }\n\n close() {\n this.input.close();\n this.queue.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): VADStream {\n return this;\n 
}\n}\n"],"mappings":"AAIA,SAAS,0BAA0B;AAE5B,IAAK,eAAL,kBAAKA,kBAAL;AACL,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AAHU,SAAAA;AAAA,GAAA;AAwCL,MAAe,IAAI;AAAA,EACxB;AAAA,EACA,YAAY,cAA+B;AACzC,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAMF;AAEO,MAAe,UAAqD;AAAA,EACzE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,mBAAiE;AAAA,EAC7E,QAAQ,IAAI,mBAA6B;AAAA,EACzC,SAAS;AAAA,EAEnB,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,KAAK;AAAA,EACtB;AAAA,EAEA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,UAAU,cAAc;AAAA,EACzC;AAAA,EAEA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0C;AACxC,WAAO,KAAK,MAAM,KAAK;AAAA,EACzB;AAAA,EAEA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAe;AAClC,WAAO;AAAA,EACT;AACF;","names":["VADEventType"]}
1
+ {"version":3,"sources":["../src/vad.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport type { AudioFrame } from '@livekit/rtc-node';\nimport type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';\nimport { EventEmitter } from 'node:events';\nimport type { VADMetrics } from './metrics/base.js';\nimport { AsyncIterableQueue } from './utils.js';\n\nexport enum VADEventType {\n START_OF_SPEECH,\n INFERENCE_DONE,\n END_OF_SPEECH,\n METRICS_COLLECTED,\n}\n\nexport interface VADEvent {\n /** Type of the VAD event (e.g., start of speech, end of speech, inference done). */\n type: VADEventType;\n /**\n * Index of the audio sample where the event occurred, relative to the inference sample rate.\n */\n samplesIndex: number;\n /** Timestamp when the event was fired. */\n timestamp: number;\n /** Duration of the speech segment. */\n speechDuration: number;\n /** Duration of the silence segment. */\n silenceDuration: number;\n /**\n * List of audio frames associated with the speech.\n *\n * @remarks\n * - For `start_of_speech` events, this contains the audio chunks that triggered the detection.\n * - For `inference_done` events, this contains the audio chunks that were processed.\n * - For `end_of_speech` events, this contains the complete user speech.\n */\n frames: AudioFrame[];\n /** Probability that speech is present (only for `INFERENCE_DONE` events). */\n probability: number;\n /** Time taken to perform the inference, in seconds (only for `INFERENCE_DONE` events). */\n inferenceDuration: number;\n /** Indicates whether speech was detected in the frames. */\n speaking: boolean;\n /** Threshold used to detect silence. */\n rawAccumulatedSilence: number;\n /** Threshold used to detect speech. 
*/\n rawAccumulatedSpeech: number;\n}\n\nexport interface VADCapabilities {\n updateInterval: number;\n}\n\nexport type VADCallbacks = {\n [VADEventType.METRICS_COLLECTED]: (metrics: VADMetrics) => void;\n};\n\nexport abstract class VAD extends (EventEmitter as new () => TypedEmitter<VADCallbacks>) {\n #capabilities: VADCapabilities;\n abstract label: string;\n\n constructor(capabilities: VADCapabilities) {\n super();\n this.#capabilities = capabilities;\n }\n\n get capabilities(): VADCapabilities {\n return this.#capabilities;\n }\n\n /**\n * Returns a {@link VADStream} that can be used to push audio frames and receive VAD events.\n */\n abstract stream(): VADStream;\n}\n\nexport abstract class VADStream implements AsyncIterableIterator<VADEvent> {\n protected static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL');\n protected input = new AsyncIterableQueue<AudioFrame | typeof VADStream.FLUSH_SENTINEL>();\n protected queue = new AsyncIterableQueue<VADEvent>();\n protected output = new AsyncIterableQueue<VADEvent>();\n protected closed = false;\n #vad: VAD;\n #lastActivityTime = BigInt(0);\n\n constructor(vad: VAD) {\n this.#vad = vad;\n this.monitorMetrics();\n }\n\n protected async monitorMetrics() {\n let inferenceDurationTotal = 0;\n let inferenceCount = 0;\n\n for await (const event of this.queue) {\n this.output.put(event);\n switch (event.type) {\n case VADEventType.START_OF_SPEECH:\n inferenceCount++;\n if (inferenceCount >= 1 / this.#vad.capabilities.updateInterval) {\n this.#vad.emit(VADEventType.METRICS_COLLECTED, {\n timestamp: Date.now(),\n idleTime: Math.trunc(\n Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),\n ),\n inferenceDurationTotal,\n inferenceCount,\n label: this.#vad.label,\n });\n\n inferenceCount = 0;\n inferenceDurationTotal = 0;\n }\n break;\n case VADEventType.INFERENCE_DONE:\n case VADEventType.END_OF_SPEECH:\n this.#lastActivityTime = process.hrtime.bigint();\n break;\n }\n }\n this.output.close();\n 
}\n\n pushFrame(frame: AudioFrame) {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(frame);\n }\n\n flush() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.put(VADStream.FLUSH_SENTINEL);\n }\n\n endInput() {\n if (this.input.closed) {\n throw new Error('Input is closed');\n }\n if (this.closed) {\n throw new Error('Stream is closed');\n }\n this.input.close();\n }\n\n next(): Promise<IteratorResult<VADEvent>> {\n return this.output.next();\n }\n\n close() {\n this.input.close();\n this.queue.close();\n this.output.close();\n this.closed = true;\n }\n\n [Symbol.asyncIterator](): VADStream {\n return this;\n }\n}\n"],"mappings":"AAKA,SAAS,oBAAoB;AAE7B,SAAS,0BAA0B;AAE5B,IAAK,eAAL,kBAAKA,kBAAL;AACL,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AACA,EAAAA,4BAAA;AAJU,SAAAA;AAAA,GAAA;AAiDL,MAAe,YAAa,aAAsD;AAAA,EACvF;AAAA,EAGA,YAAY,cAA+B;AACzC,UAAM;AACN,SAAK,gBAAgB;AAAA,EACvB;AAAA,EAEA,IAAI,eAAgC;AAClC,WAAO,KAAK;AAAA,EACd;AAMF;AAEO,MAAe,UAAqD;AAAA,EACzE,OAA0B,iBAAiB,OAAO,gBAAgB;AAAA,EACxD,QAAQ,IAAI,mBAAiE;AAAA,EAC7E,QAAQ,IAAI,mBAA6B;AAAA,EACzC,SAAS,IAAI,mBAA6B;AAAA,EAC1C,SAAS;AAAA,EACnB;AAAA,EACA,oBAAoB,OAAO,CAAC;AAAA,EAE5B,YAAY,KAAU;AACpB,SAAK,OAAO;AACZ,SAAK,eAAe;AAAA,EACtB;AAAA,EAEA,MAAgB,iBAAiB;AAC/B,QAAI,yBAAyB;AAC7B,QAAI,iBAAiB;AAErB,qBAAiB,SAAS,KAAK,OAAO;AACpC,WAAK,OAAO,IAAI,KAAK;AACrB,cAAQ,MAAM,MAAM;AAAA,QAClB,KAAK;AACH;AACA,cAAI,kBAAkB,IAAI,KAAK,KAAK,aAAa,gBAAgB;AAC/D,iBAAK,KAAK,KAAK,2BAAgC;AAAA,cAC7C,WAAW,KAAK,IAAI;AAAA,cACpB,UAAU,KAAK;AAAA,gBACb,QAAQ,QAAQ,OAAO,OAAO,IAAI,KAAK,qBAAqB,OAAO,GAAO,CAAC;AAAA,cAC7E;AAAA,cACA;AAAA,cACA;AAAA,cACA,OAAO,KAAK,KAAK;AAAA,YACnB,CAAC;AAED,6BAAiB;AACjB,qCAAyB;AAAA,UAC3B;AACA;AAAA,QACF,KAAK;AAAA,QACL,KAAK;AACH,eAAK,oBAAoB,QAAQ,OAAO,OAAO;AAC/C;AAAA,MACJ;AAAA,IACF;AACA,SAAK,OAAO,MAAM;AAAA,EACpB;AAAA,EAEA,UAAU,OAAmB;AAC3B,QAAI,KAAK,MAAM
,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,KAAK;AAAA,EACtB;AAAA,EAEA,QAAQ;AACN,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,IAAI,UAAU,cAAc;AAAA,EACzC;AAAA,EAEA,WAAW;AACT,QAAI,KAAK,MAAM,QAAQ;AACrB,YAAM,IAAI,MAAM,iBAAiB;AAAA,IACnC;AACA,QAAI,KAAK,QAAQ;AACf,YAAM,IAAI,MAAM,kBAAkB;AAAA,IACpC;AACA,SAAK,MAAM,MAAM;AAAA,EACnB;AAAA,EAEA,OAA0C;AACxC,WAAO,KAAK,OAAO,KAAK;AAAA,EAC1B;AAAA,EAEA,QAAQ;AACN,SAAK,MAAM,MAAM;AACjB,SAAK,MAAM,MAAM;AACjB,SAAK,OAAO,MAAM;AAClB,SAAK,SAAS;AAAA,EAChB;AAAA,EAEA,CAAC,OAAO,aAAa,IAAe;AAClC,WAAO;AAAA,EACT;AACF;","names":["VADEventType"]}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@livekit/agents",
3
- "version": "0.5.1",
3
+ "version": "0.6.0",
4
4
  "description": "LiveKit Agents - Node.js",
5
5
  "main": "dist/index.js",
6
6
  "require": "dist/index.cjs",
package/src/index.ts CHANGED
@@ -11,6 +11,7 @@
11
11
  */
12
12
  import * as cli from './cli.js';
13
13
  import * as llm from './llm/index.js';
14
+ import * as metrics from './metrics/index.js';
14
15
  import * as multimodal from './multimodal/index.js';
15
16
  import * as pipeline from './pipeline/index.js';
16
17
  import * as stt from './stt/index.js';
@@ -28,4 +29,4 @@ export * from './generator.js';
28
29
  export * from './audio.js';
29
30
  export * from './transcription.js';
30
31
 
31
- export { cli, stt, tts, llm, pipeline, multimodal, tokenize };
32
+ export { cli, stt, tts, llm, pipeline, multimodal, tokenize, metrics };
package/src/llm/index.ts CHANGED
@@ -25,6 +25,8 @@ export {
25
25
  type CompletionUsage,
26
26
  type Choice,
27
27
  type ChatChunk,
28
+ type LLMCallbacks,
29
+ LLMEvent,
28
30
  LLM,
29
31
  LLMStream,
30
32
  } from './llm.js';
package/src/llm/llm.ts CHANGED
@@ -1,6 +1,9 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
+ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
5
+ import { EventEmitter } from 'node:events';
6
+ import type { LLMMetrics } from '../metrics/base.js';
4
7
  import { AsyncIterableQueue } from '../utils.js';
5
8
  import type { ChatContext, ChatRole } from './chat_context.js';
6
9
  import type { FunctionCallInfo, FunctionContext } from './function_context.js';
@@ -28,7 +31,15 @@ export interface ChatChunk {
28
31
  usage?: CompletionUsage;
29
32
  }
30
33
 
31
- export abstract class LLM {
34
/** Events emitted by an {@link LLM} instance. */
+ export enum LLMEvent {
35
/** Emitted by an LLMStream's metrics monitor once its chunk queue has ended. */
+ METRICS_COLLECTED,
36
+ }
37
+
38
/** Listener signatures for each {@link LLMEvent}, used to type the LLM event emitter. */
+ export type LLMCallbacks = {
39
+ [LLMEvent.METRICS_COLLECTED]: (metrics: LLMMetrics) => void;
40
+ };
41
+
42
+ export abstract class LLM extends (EventEmitter as new () => TypedEmitter<LLMCallbacks>) {
32
43
  /**
33
44
  * Returns a {@link LLMStream} that can be used to push text and receive LLM responses.
34
45
  */
@@ -48,16 +59,56 @@ export abstract class LLM {
48
59
  }
49
60
 
50
61
  export abstract class LLMStream implements AsyncIterableIterator<ChatChunk> {
62
+ protected output = new AsyncIterableQueue<ChatChunk>();
51
63
  protected queue = new AsyncIterableQueue<ChatChunk>();
52
64
  protected closed = false;
53
65
  protected _functionCalls: FunctionCallInfo[] = [];
66
+ abstract label: string;
54
67
 
68
+ #llm: LLM;
55
69
  #chatCtx: ChatContext;
56
70
  #fncCtx?: FunctionContext;
57
71
 
58
- constructor(chatCtx: ChatContext, fncCtx?: FunctionContext) {
72
+ constructor(llm: LLM, chatCtx: ChatContext, fncCtx?: FunctionContext) {
73
+ this.#llm = llm;
59
74
  this.#chatCtx = chatCtx;
60
75
  this.#fncCtx = fncCtx;
76
+ this.monitorMetrics();
77
+ }
78
+
79
/**
 * Relays chunks from the internal `queue` to the public `output` queue and,
 * once `queue` ends, emits a single {@link LLMEvent.METRICS_COLLECTED} event on
 * the owning LLM.
 *
 * Timing starts when this method is invoked (from the constructor):
 * - `ttft` is the time until the first chunk arrives, in whole milliseconds,
 *   or `-1` when the stream ended without producing any chunk.
 * - `duration` is the time until `queue` ended, in whole milliseconds.
 * - `tokensPerSecond` is completion tokens divided by the fractional duration
 *   in seconds, or `0` when no time elapsed.
 *
 * Token counts come from the last chunk that carried `usage`; they default to 0.
 */
protected async monitorMetrics() {
  const NS_PER_MS = BigInt(1000000);
  const startTime = process.hrtime.bigint();
  let ttft: bigint | undefined;
  let requestId = '';
  let usage: CompletionUsage | undefined;

  for await (const ev of this.queue) {
    this.output.put(ev);
    requestId = ev.requestId;
    // Compare against undefined explicitly: a bigint of 0 is falsy.
    if (ttft === undefined) {
      ttft = process.hrtime.bigint() - startTime;
    }
    if (ev.usage) {
      usage = ev.usage;
    }
  }
  this.output.close();

  const duration = process.hrtime.bigint() - startTime;
  // Keep fractional seconds. Integer-dividing the bigint down to whole seconds
  // yields 0 for any sub-second stream, which made the rate Infinity or NaN.
  const durationSeconds = Number(duration) / 1e9;
  const completionTokens = usage?.completionTokens ?? 0;

  const metrics: LLMMetrics = {
    timestamp: Date.now(),
    requestId,
    // `undefined / bigint` throws a TypeError, so an empty stream used to
    // reject here and never report metrics. BigInt division already truncates.
    ttft: ttft === undefined ? -1 : Number(ttft / NS_PER_MS),
    duration: Number(duration / NS_PER_MS),
    cancelled: false, // XXX(nbsp)
    label: this.label,
    completionTokens,
    promptTokens: usage?.promptTokens ?? 0,
    totalTokens: usage?.totalTokens ?? 0,
    tokensPerSecond: durationSeconds > 0 ? completionTokens / durationSeconds : 0,
  };
  this.#llm.emit(LLMEvent.METRICS_COLLECTED, metrics);
}
62
113
 
63
114
  /** List of called functions from this stream. */
@@ -88,10 +139,11 @@ export abstract class LLMStream implements AsyncIterableIterator<ChatChunk> {
88
139
  }
89
140
 
90
141
  next(): Promise<IteratorResult<ChatChunk>> {
91
- return this.queue.next();
142
+ return this.output.next();
92
143
  }
93
144
 
94
145
  /** Closes the public output queue and the internal chunk queue, then marks the stream as closed. */
  close() {
146
+ this.output.close();
95
147
  this.queue.close();
96
148
  this.closed = true;
97
149
  }
@@ -0,0 +1,127 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
/** Metrics reported for one LLM stream via `LLMEvent.METRICS_COLLECTED`. */
+ export interface LLMMetrics {
6
/** Request identifier; LLMStream reports the `requestId` of the last chunk it received. */
+ requestId: string;
7
/** Wall-clock time at which the metrics were assembled (`Date.now()`, ms since the Unix epoch). */
+ timestamp: number;
8
/** Time from stream creation to the first chunk, in milliseconds. */
+ ttft: number;
9
/** Time from stream creation until the chunk queue ended, in milliseconds. */
+ duration: number;
10
/** Label of the stream that produced the metrics. */
+ label: string;
11
/** Whether the request was cancelled. LLMStream currently always reports `false`. */
+ cancelled: boolean;
12
/** Completion tokens from the last reported usage; 0 when no usage was reported. */
+ completionTokens: number;
13
/** Prompt tokens from the last reported usage; 0 when no usage was reported. */
+ promptTokens: number;
14
/** Total tokens from the last reported usage; 0 when no usage was reported. */
+ totalTokens: number;
15
/** Completion tokens generated per second of stream duration. */
+ tokensPerSecond: number;
16
/** NOTE(review): not populated by LLMStream in this file — presumably set by callers on failure; confirm. */
+ error?: Error;
17
+ }
18
+
19
/**
 * Metrics describing one STT (presumably speech-to-text) request.
 *
 * NOTE(review): the producer of these values is not visible in this file; units
 * of `duration` and `audioDuration` should be confirmed against the STT code.
 */
+ export interface STTMetrics {
20
+ requestId: string;
21
+ timestamp: number;
22
+ duration: number;
23
+ label: string;
24
/** Summed into `UsageSummary.sttAudioDuration` by `UsageCollector.collect`. */
+ audioDuration: number;
25
+ streamed: boolean;
26
+ error?: Error;
27
+ }
28
+
29
/**
 * Metrics describing one TTS (presumably text-to-speech) request.
 *
 * NOTE(review): the producer of these values is not visible in this file; units
 * of `ttfb`, `duration` and `audioDuration` should be confirmed against the TTS code.
 */
+ export interface TTSMetrics {
30
+ requestId: string;
31
+ timestamp: number;
32
/** NOTE(review): presumably "time to first byte" of synthesized audio — confirm. */
+ ttfb: number;
33
+ duration: number;
34
+ label: string;
35
+ audioDuration: number;
36
+ cancelled: boolean;
37
/** Summed into `UsageSummary.ttsCharactersCount` by `UsageCollector.collect`. */
+ charactersCount: number;
38
+ streamed: boolean;
39
+ error?: Error;
40
+ }
41
+
42
/** Metrics emitted by a VAD via `VADEventType.METRICS_COLLECTED`. */
+ export interface VADMetrics {
43
/** Wall-clock time at which the metrics were emitted (`Date.now()`, ms since the Unix epoch). */
+ timestamp: number;
44
/** Whole milliseconds elapsed since the stream last recorded activity. */
+ idleTime: number;
45
/** Inference time attributed to the reporting window; the stream resets it after each emission. */
+ inferenceDurationTotal: number;
46
/** Counter value that triggered this emission; the stream resets it after each emission. */
+ inferenceCount: number;
47
/** Label of the VAD that produced the metrics. */
+ label: string;
48
+ }
49
+
50
/**
 * Pipeline metrics about end-of-utterance handling: how long it took to decide the
 * user's turn had ended and to obtain the transcript.
 *
 * NOTE(review): units of the delay fields are not established in this file — confirm.
 */
+ export interface PipelineEOUMetrics {
51
+ /**
52
+ * Unique identifier shared across different metrics to combine related STT, LLM, and TTS metrics
53
+ */
54
+ sequenceId: string;
55
+ /** Timestamp of when the event was recorded */
56
+ timestamp: number;
57
+ /** Amount of time between the end of speech from VAD and the decision to end the user's turn */
58
+ endOfUtteranceDelay: number;
59
+ /**
60
+ * Time taken to obtain the transcript after the end of the user's speech.
61
+ *
62
+ * @remarks
63
+ * May be 0 if the transcript was already available.
64
+ */
65
+ transcriptionDelay: number;
66
+ }
67
+
68
/** {@link LLMMetrics} extended with a `sequenceId` for correlating related pipeline metrics. */
+ export interface PipelineLLMMetrics extends LLMMetrics {
69
+ /**
70
+ * Unique identifier shared across different metrics to combine related STT, LLM, and TTS metrics
71
+ */
72
+ sequenceId: string;
73
+ }
74
+
75
/** {@link TTSMetrics} extended with a `sequenceId` for correlating related pipeline metrics. */
+ export interface PipelineTTSMetrics extends TTSMetrics {
76
+ /**
77
+ * Unique identifier shared across different metrics to combine related STT, LLM, and TTS metrics
78
+ */
79
+ sequenceId: string;
80
+ }
81
+
82
/** Pipeline-level STT metrics; identical in shape to {@link STTMetrics} (no `sequenceId` is added). */
+ export type PipelineSTTMetrics = STTMetrics;
83
/** Pipeline-level VAD metrics; identical in shape to {@link VADMetrics} (no `sequenceId` is added). */
+ export type PipelineVADMetrics = VADMetrics;
84
+
85
/**
 * Error type for multimodal LLM failures that carries optional structured
 * details (`type`, `reason`, `code`) alongside the standard message.
 *
 * All details are optional; calling the constructor with no arguments produces
 * an error with an empty message and every detail left `undefined`.
 */
export class MultimodalLLMError extends Error {
  /** Optional error category supplied by the caller. */
  type?: string;
  /** Optional explanation supplied by the caller. */
  reason?: string;
  /** Optional error code supplied by the caller. */
  code?: string;

  /**
   * @param details - Optional `type`, `reason`, `code` and `message` for the error.
   * @param options - Standard `Error` options (e.g. `cause`), forwarded to `Error`.
   */
  constructor(
    {
      type,
      reason,
      code,
      message,
    }: { type?: string; reason?: string; code?: string; message?: string } = {},
    options?: ErrorOptions,
  ) {
    super(message, options);
    // Without this the instance reports itself as a plain "Error" in logs,
    // `toString()` output and name-based checks.
    this.name = 'MultimodalLLMError';
    this.type = type;
    this.reason = reason;
    this.code = code;
  }
}
104
+
105
/**
 * {@link LLMMetrics} extended with per-modality token breakdowns for input and output.
 *
 * NOTE(review): the producer of these values is not visible in this file — confirm
 * how the breakdowns relate to `promptTokens` / `completionTokens`.
 */
+ export interface MultimodalLLMMetrics extends LLMMetrics {
106
/** Breakdown of input tokens into cached, text and audio counts. */
+ inputTokenDetails: {
107
+ cachedTokens: number;
108
+ textTokens: number;
109
+ audioTokens: number;
110
+ };
111
/** Breakdown of output tokens into text and audio counts. */
+ outputTokenDetails: {
112
+ textTokens: number;
113
+ audioTokens: number;
114
+ };
115
+ }
116
+
117
/**
 * Union of every metrics shape declared in this module; this is the type accepted
 * by `UsageCollector.collect`.
 */
+ export type AgentMetrics =
118
+ | STTMetrics
119
+ | LLMMetrics
120
+ | TTSMetrics
121
+ | VADMetrics
122
+ | PipelineSTTMetrics
123
+ | PipelineEOUMetrics
124
+ | PipelineLLMMetrics
125
+ | PipelineTTSMetrics
126
+ | PipelineVADMetrics
127
+ | MultimodalLLMMetrics;
@@ -0,0 +1,20 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+
5
+ export type {
6
+ AgentMetrics,
7
+ STTMetrics,
8
+ LLMMetrics,
9
+ TTSMetrics,
10
+ VADMetrics,
11
+ PipelineSTTMetrics,
12
+ PipelineEOUMetrics,
13
+ PipelineLLMMetrics,
14
+ PipelineTTSMetrics,
15
+ PipelineVADMetrics,
16
+ MultimodalLLMMetrics,
17
+ } from './base.js';
18
+ export { MultimodalLLMError } from './base.js';
19
+ export { type UsageSummary, UsageCollector } from './usage_collector.js';
20
+ export { logMetrics } from './utils.js';
@@ -0,0 +1,40 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { AgentMetrics } from './base.js';
5
+ import { isLLMMetrics, isSTTMetrics, isTTSMetrics } from './utils.js';
6
+
7
/** Running usage totals accumulated by {@link UsageCollector}. */
+ export interface UsageSummary {
8
/** Sum of `promptTokens` over all collected LLM metrics. */
+ llmPromptTokens: number;
9
/** Sum of `completionTokens` over all collected LLM metrics. */
+ llmCompletionTokens: number;
10
/** Sum of `charactersCount` over all collected TTS metrics. */
+ ttsCharactersCount: number;
11
/** Sum of `audioDuration` over all collected STT metrics. */
+ sttAudioDuration: number;
12
+ }
13
+
14
/**
 * Accumulates LLM token counts, TTS character counts and STT audio duration
 * from a stream of agent metrics into a {@link UsageSummary}.
 */
+ export class UsageCollector {
15
+ #summary: UsageSummary;
16
+
17
/** Starts with every total at zero. */
+ constructor() {
18
+ this.#summary = {
19
+ llmPromptTokens: 0,
20
+ llmCompletionTokens: 0,
21
+ ttsCharactersCount: 0,
22
+ sttAudioDuration: 0,
23
+ };
24
+ }
25
+
26
/**
 * Adds one metrics object to the running totals.
 *
 * The guards are checked in order (LLM, then TTS, then STT) and only the first
 * match is applied; metrics matching none of them are ignored.
 *
 * @param metrics - Any agent metrics object.
 */
+ collect(metrics: AgentMetrics) {
27
+ if (isLLMMetrics(metrics)) {
28
+ this.#summary.llmPromptTokens += metrics.promptTokens;
29
+ this.#summary.llmCompletionTokens += metrics.completionTokens;
30
+ } else if (isTTSMetrics(metrics)) {
31
+ this.#summary.ttsCharactersCount += metrics.charactersCount;
32
+ } else if (isSTTMetrics(metrics)) {
33
+ this.#summary.sttAudioDuration += metrics.audioDuration;
34
+ }
35
+ }
36
+
37
/** Returns a shallow copy of the totals, so callers cannot mutate the collector's state. */
+ get summary(): UsageSummary {
38
+ return { ...this.#summary };
39
+ }
40
+ }