@livekit/agents-plugin-silero 0.4.6 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,17 @@
1
+ <!--
2
+ SPDX-FileCopyrightText: 2024 LiveKit, Inc.
3
+
4
+ SPDX-License-Identifier: Apache-2.0
5
+ -->
6
+ # Silero plugin for LiveKit Agents
7
+
8
+ The Agents Framework is designed for building realtime, programmable
9
+ participants that run on servers. Use it to create conversational, multi-modal
10
+ voice agents that can see, hear, and understand.
11
+
12
+ This package contains the Silero plugin, providing voice activity detection.
13
+ Refer to the [documentation](https://docs.livekit.io/agents/overview/) for
14
+ information on how to use it, or browse the [API
15
+ reference](https://docs.livekit.io/agents-js/modules/plugins_agents_plugin_silero.html).
16
+ See the [repository](https://github.com/livekit/agents-js) for more information
17
+ about the framework as a whole.
package/dist/index.cjs ADDED
@@ -0,0 +1,31 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+ var src_exports = {};
20
+ __export(src_exports, {
21
+ VAD: () => import_vad.VAD,
22
+ VADStream: () => import_vad.VADStream
23
+ });
24
+ module.exports = __toCommonJS(src_exports);
25
+ var import_vad = require("./vad.cjs");
26
+ // Annotate the CommonJS export names for ESM import in node:
27
+ 0 && (module.exports = {
28
+ VAD,
29
+ VADStream
30
+ });
31
+ //# sourceMappingURL=index.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nexport { VAD, VADStream } from './vad.js';\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAGA,iBAA+B;","names":[]}
package/dist/index.js CHANGED
@@ -1,5 +1,6 @@
1
- // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
- //
3
- // SPDX-License-Identifier: Apache-2.0
4
- export { VAD, VADStream } from './vad.js';
1
+ import { VAD, VADStream } from "./vad.js";
2
+ export {
3
+ VAD,
4
+ VADStream
5
+ };
5
6
  //# sourceMappingURL=index.js.map
package/dist/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AACtC,OAAO,EAAE,GAAG,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC"}
1
+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nexport { VAD, VADStream } from './vad.js';\n"],"mappings":"AAGA,SAAS,KAAK,iBAAiB;","names":[]}
@@ -0,0 +1,95 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+ var onnx_model_exports = {};
20
+ __export(onnx_model_exports, {
21
+ OnnxModel: () => OnnxModel,
22
+ newInferenceSession: () => newInferenceSession
23
+ });
24
+ module.exports = __toCommonJS(onnx_model_exports);
25
+ var getImportMetaUrl = () => typeof document === "undefined" ? new URL(`file:${__filename}`).href : document.currentScript && document.currentScript.src || new URL("main.js", document.baseURI).href;
26
+ var importMetaUrl = /* @__PURE__ */ getImportMetaUrl();
27
+ var import_node_url = require("node:url");
28
+ var import_onnxruntime_node = require("onnxruntime-node");
29
/**
 * Create an ONNX Runtime inference session for the `silero_vad.onnx` file
 * that sits next to this module.
 *
 * The session is configured for single-threaded, sequential execution.
 *
 * @param forceCPU - When truthy, restrict execution to the `cpu` provider;
 *   otherwise the runtime's default provider selection is used.
 * @returns The value returned by `InferenceSession.create`.
 */
const newInferenceSession = (forceCPU) => {
  // Resolve the model relative to this module and convert the file URL to a
  // filesystem path.
  const modelUrl = new URL("silero_vad.onnx", importMetaUrl);
  const modelPath = (0, import_node_url.fileURLToPath)(modelUrl.href);
  const sessionOptions = {
    interOpNumThreads: 1,
    intraOpNumThreads: 1,
    executionMode: "sequential",
    executionProviders: forceCPU ? [{ name: "cpu" }] : void 0
  };
  return import_onnxruntime_node.InferenceSession.create(modelPath, sessionOptions);
};
37
/**
 * Stateful wrapper that feeds fixed-size audio windows to an ONNX inference
 * session.
 *
 * Every call to `run` sends the model one window of samples prefixed with the
 * trailing `contextSize` samples of the previous window.
 */
class OnnxModel {
  #session;
  #sampleRate;
  #windowSizeSamples;
  #contextSize;
  #sampleRateNd;
  #context;
  #rnnState;
  #inputBuffer;

  /**
   * @param session - Object exposing an async `run(feeds)` method.
   * @param sampleRate - Sample rate in Hz; only 8000 and 16000 are supported.
   * @throws {Error} If `sampleRate` is neither 8000 nor 16000.
   */
  constructor(session, sampleRate) {
    this.#session = session;
    this.#sampleRate = sampleRate;
    switch (sampleRate) {
      case 8e3:
        this.#windowSizeSamples = 256;
        this.#contextSize = 32;
        break;
      case 16e3:
        this.#windowSizeSamples = 512;
        this.#contextSize = 64;
        break;
      default:
        // Without this guard the sizes stay undefined and every buffer below
        // is silently created with length 0.
        throw new Error(
          `unsupported sample rate ${String(sampleRate)}, expected 8000 or 16000`
        );
    }
    this.#sampleRateNd = BigInt64Array.from([BigInt(sampleRate)]);
    this.#context = new Float32Array(this.#contextSize);
    // NOTE(review): this state tensor is sent on every call but no model
    // output is ever written back into it, so it stays all zeros. Confirm
    // whether the model's state output is meant to be carried between calls.
    this.#rnnState = new Float32Array(2 * 1 * 128);
    this.#inputBuffer = new Float32Array(this.#contextSize + this.#windowSizeSamples);
  }

  /** Sample rate passed to the constructor, in Hz. */
  get sampleRate() {
    return this.#sampleRate;
  }

  /** Number of new samples consumed by each `run` call. */
  get windowSizeSamples() {
    return this.#windowSizeSamples;
  }

  /** Number of samples carried over from the previous window. */
  get contextSize() {
    return this.#contextSize;
  }

  /**
   * Run one inference step.
   *
   * @param x - Window of samples; it is written into the input buffer right
   *   after the context, so it must not be longer than `windowSizeSamples`.
   * @returns The first element of the session's `output` tensor.
   */
  async run(x) {
    this.#inputBuffer.set(this.#context, 0);
    this.#inputBuffer.set(x, this.#contextSize);
    const result = await this.#session.run({
      input: new import_onnxruntime_node.Tensor("float32", this.#inputBuffer, [
        1,
        this.#contextSize + this.#windowSizeSamples
      ]),
      state: new import_onnxruntime_node.Tensor("float32", this.#rnnState, [2, 1, 128]),
      sr: new import_onnxruntime_node.Tensor("int64", this.#sampleRateNd)
    });
    // Keep a copy of the LAST `contextSize` samples for the next call. Taking
    // a view of the first `contextSize` samples would alias the region the
    // context itself is copied into, so the context would never advance.
    this.#context.set(this.#inputBuffer.subarray(this.#windowSizeSamples));
    return result.output.data.at(0);
  }
}
90
+ // Annotate the CommonJS export names for ESM import in node:
91
+ 0 && (module.exports = {
92
+ OnnxModel,
93
+ newInferenceSession
94
+ });
95
+ //# sourceMappingURL=onnx_model.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/onnx_model.ts","../../../node_modules/.pnpm/tsup@8.3.5_@microsoft+api-extractor@7.43.7_@types+node@22.5.5__postcss@8.4.38_tsx@4.19.2_typescript@5.4.5/node_modules/tsup/assets/cjs_shims.js"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { fileURLToPath } from 'node:url';\nimport { InferenceSession, Tensor } from 'onnxruntime-node';\n\nexport type SampleRate = 8000 | 16000;\n\nexport const newInferenceSession = (forceCPU: boolean) => {\n return InferenceSession.create(fileURLToPath(new URL('silero_vad.onnx', import.meta.url).href), {\n interOpNumThreads: 1,\n intraOpNumThreads: 1,\n executionMode: 'sequential',\n executionProviders: forceCPU ? [{ name: 'cpu' }] : undefined,\n });\n};\n\nexport class OnnxModel {\n #session: InferenceSession;\n #sampleRate: number;\n #windowSizeSamples: number;\n #contextSize: number;\n #sampleRateNd: BigInt64Array;\n #context: Float32Array;\n // #state: Float32Array;\n #rnnState: Float32Array;\n #inputBuffer: Float32Array;\n\n constructor(session: InferenceSession, sampleRate: SampleRate) {\n this.#session = session;\n this.#sampleRate = sampleRate;\n\n switch (sampleRate) {\n case 8000:\n this.#windowSizeSamples = 256;\n this.#contextSize = 32;\n break;\n case 16000:\n this.#windowSizeSamples = 512;\n this.#contextSize = 64;\n break;\n }\n\n this.#sampleRateNd = BigInt64Array.from([BigInt(sampleRate)]);\n this.#context = new Float32Array(this.#contextSize);\n this.#rnnState = new Float32Array(2 * 1 * 128);\n this.#inputBuffer = new Float32Array(this.#contextSize + this.#windowSizeSamples);\n }\n\n get sampleRate(): number {\n return this.#sampleRate;\n }\n\n get windowSizeSamples(): number {\n return this.#windowSizeSamples;\n }\n\n get contextSize(): number {\n return this.#contextSize;\n }\n\n async run(x: Float32Array): Promise<number> {\n this.#inputBuffer.set(this.#context, 0);\n this.#inputBuffer.set(x, 
this.#contextSize);\n\n return await this.#session\n .run({\n input: new Tensor('float32', this.#inputBuffer, [\n 1,\n this.#contextSize + this.#windowSizeSamples,\n ]),\n state: new Tensor('float32', this.#rnnState, [2, 1, 128]),\n sr: new Tensor('int64', this.#sampleRateNd),\n })\n .then((result) => {\n // this.#state = result.output.data as Float32Array,\n this.#context = this.#inputBuffer.subarray(0, this.#contextSize);\n return (result.output!.data as Float32Array).at(0)!;\n });\n }\n}\n","// Shim globals in cjs bundle\n// There's a weird bug that esbuild will always inject importMetaUrl\n// if we export it as `const importMetaUrl = ... __filename ...`\n// But using a function will not cause this issue\n\nconst getImportMetaUrl = () =>\n typeof document === 'undefined'\n ? new URL(`file:${__filename}`).href\n : (document.currentScript && document.currentScript.src) ||\n new URL('main.js', document.baseURI).href\n\nexport const importMetaUrl = /* @__PURE__ */ getImportMetaUrl()\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;ACKA,IAAM,mBAAmB,MACvB,OAAO,aAAa,cAChB,IAAI,IAAI,QAAQ,UAAU,EAAE,EAAE,OAC7B,SAAS,iBAAiB,SAAS,cAAc,OAClD,IAAI,IAAI,WAAW,SAAS,OAAO,EAAE;AAEpC,IAAM,gBAAgC,iCAAiB;ADR9D,sBAA8B;AAC9B,8BAAyC;AAIlC,MAAM,sBAAsB,CAAC,aAAsB;AACxD,SAAO,yCAAiB,WAAO,+BAAc,IAAI,IAAI,mBAAmB,aAAe,EAAE,IAAI,GAAG;AAAA,IAC9F,mBAAmB;AAAA,IACnB,mBAAmB;AAAA,IACnB,eAAe;AAAA,IACf,oBAAoB,WAAW,CAAC,EAAE,MAAM,MAAM,CAAC,IAAI;AAAA,EACrD,CAAC;AACH;AAEO,MAAM,UAAU;AAAA,EACrB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAEA;AAAA,EACA;AAAA,EAEA,YAAY,SAA2B,YAAwB;AAC7D,SAAK,WAAW;AAChB,SAAK,cAAc;AAEnB,YAAQ,YAAY;AAAA,MAClB,KAAK;AACH,aAAK,qBAAqB;AAC1B,aAAK,eAAe;AACpB;AAAA,MACF,KAAK;AACH,aAAK,qBAAqB;AAC1B,aAAK,eAAe;AACpB;AAAA,IACJ;AAEA,SAAK,gBAAgB,cAAc,KAAK,CAAC,OAAO,UAAU,CAAC,CAAC;AAC5D,SAAK,WAAW,IAAI,aAAa,KAAK,YAAY;AAClD,SAAK,YAAY,IAAI,aAAa,IAAI,IAAI,GAAG;AAC7C,SAAK,eAAe,IAAI,aAAa,KAAK,eAAe,KAAK,kBAAkB;AAAA,EAClF;AAAA,EAEA,IAAI,aAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA,
EAEA,IAAI,oBAA4B;AAC9B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAsB;AACxB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,IAAI,GAAkC;AAC1C,SAAK,aAAa,IAAI,KAAK,UAAU,CAAC;AACtC,SAAK,aAAa,IAAI,GAAG,KAAK,YAAY;AAE1C,WAAO,MAAM,KAAK,SACf,IAAI;AAAA,MACH,OAAO,IAAI,+BAAO,WAAW,KAAK,cAAc;AAAA,QAC9C;AAAA,QACA,KAAK,eAAe,KAAK;AAAA,MAC3B,CAAC;AAAA,MACD,OAAO,IAAI,+BAAO,WAAW,KAAK,WAAW,CAAC,GAAG,GAAG,GAAG,CAAC;AAAA,MACxD,IAAI,IAAI,+BAAO,SAAS,KAAK,aAAa;AAAA,IAC5C,CAAC,EACA,KAAK,CAAC,WAAW;AAEhB,WAAK,WAAW,KAAK,aAAa,SAAS,GAAG,KAAK,YAAY;AAC/D,aAAQ,OAAO,OAAQ,KAAsB,GAAG,CAAC;AAAA,IACnD,CAAC;AAAA,EACL;AACF;","names":[]}
@@ -1 +1 @@
1
- {"version":3,"file":"onnx_model.d.ts","sourceRoot":"","sources":["../src/onnx_model.ts"],"names":[],"mappings":";AAGA,OAAO,EAAE,gBAAgB,EAAU,MAAM,kBAAkB,CAAC;AAE5D,MAAM,MAAM,UAAU,GAAG,IAAI,GAAG,KAAK,CAAC;AAEtC,eAAO,MAAM,mBAAmB,aAAc,OAAO,8BAOpD,CAAC;AAEF,qBAAa,SAAS;;gBAWR,OAAO,EAAE,gBAAgB,EAAE,UAAU,EAAE,UAAU;IAqB7D,IAAI,UAAU,IAAI,MAAM,CAEvB;IAED,IAAI,iBAAiB,IAAI,MAAM,CAE9B;IAED,IAAI,WAAW,IAAI,MAAM,CAExB;IAEK,GAAG,CAAC,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC;CAmB5C"}
1
+ {"version":3,"file":"onnx_model.d.ts","sourceRoot":"","sources":["../src/onnx_model.ts"],"names":[],"mappings":";AAIA,OAAO,EAAE,gBAAgB,EAAU,MAAM,kBAAkB,CAAC;AAE5D,MAAM,MAAM,UAAU,GAAG,IAAI,GAAG,KAAK,CAAC;AAEtC,eAAO,MAAM,mBAAmB,aAAc,OAAO,8BAOpD,CAAC;AAEF,qBAAa,SAAS;;gBAWR,OAAO,EAAE,gBAAgB,EAAE,UAAU,EAAE,UAAU;IAqB7D,IAAI,UAAU,IAAI,MAAM,CAEvB;IAED,IAAI,iBAAiB,IAAI,MAAM,CAE9B;IAED,IAAI,WAAW,IAAI,MAAM,CAExB;IAEK,GAAG,CAAC,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC;CAmB5C"}
@@ -1,69 +1,68 @@
1
- // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
- //
3
- // SPDX-License-Identifier: Apache-2.0
4
- import { InferenceSession, Tensor } from 'onnxruntime-node';
5
- export const newInferenceSession = (forceCPU) => {
6
- return InferenceSession.create(new URL('silero_vad.onnx', import.meta.url).pathname, {
7
- interOpNumThreads: 1,
8
- intraOpNumThreads: 1,
9
- executionMode: 'sequential',
10
- executionProviders: forceCPU ? [{ name: 'cpu' }] : undefined,
11
- });
1
+ import { fileURLToPath } from "node:url";
2
+ import { InferenceSession, Tensor } from "onnxruntime-node";
3
/**
 * Create an ONNX Runtime inference session for the `silero_vad.onnx` file
 * that sits next to this module.
 *
 * The session is configured for single-threaded, sequential execution.
 *
 * @param forceCPU - When truthy, restrict execution to the `cpu` provider;
 *   otherwise the runtime's default provider selection is used.
 * @returns The value returned by `InferenceSession.create`.
 */
const newInferenceSession = (forceCPU) => {
  // Resolve the model relative to this module and convert the file URL to a
  // filesystem path.
  const modelUrl = new URL("silero_vad.onnx", import.meta.url);
  const modelPath = fileURLToPath(modelUrl.href);
  const sessionOptions = {
    interOpNumThreads: 1,
    intraOpNumThreads: 1,
    executionMode: "sequential",
    executionProviders: forceCPU ? [{ name: "cpu" }] : void 0
  };
  return InferenceSession.create(modelPath, sessionOptions);
};
13
- export class OnnxModel {
14
- #session;
15
- #sampleRate;
16
- #windowSizeSamples;
17
- #contextSize;
18
- #sampleRateNd;
19
- #context;
20
- // #state: Float32Array;
21
- #rnnState;
22
- #inputBuffer;
23
- constructor(session, sampleRate) {
24
- this.#session = session;
25
- this.#sampleRate = sampleRate;
26
- switch (sampleRate) {
27
- case 8000:
28
- this.#windowSizeSamples = 256;
29
- this.#contextSize = 32;
30
- break;
31
- case 16000:
32
- this.#windowSizeSamples = 512;
33
- this.#contextSize = 64;
34
- break;
35
- }
36
- this.#sampleRateNd = BigInt64Array.from([BigInt(sampleRate)]);
37
- this.#context = new Float32Array(this.#contextSize);
38
- this.#rnnState = new Float32Array(2 * 1 * 128);
39
- this.#inputBuffer = new Float32Array(this.#contextSize + this.#windowSizeSamples);
40
- }
41
- get sampleRate() {
42
- return this.#sampleRate;
43
- }
44
- get windowSizeSamples() {
45
- return this.#windowSizeSamples;
46
- }
47
- get contextSize() {
48
- return this.#contextSize;
49
- }
50
- async run(x) {
51
- this.#inputBuffer.set(this.#context, 0);
52
- this.#inputBuffer.set(x, this.#contextSize);
53
- return await this.#session
54
- .run({
55
- input: new Tensor('float32', this.#inputBuffer, [
56
- 1,
57
- this.#contextSize + this.#windowSizeSamples,
58
- ]),
59
- state: new Tensor('float32', this.#rnnState, [2, 1, 128]),
60
- sr: new Tensor('int64', this.#sampleRateNd),
61
- })
62
- .then((result) => {
63
- // this.#state = result.output.data as Float32Array,
64
- this.#context = this.#inputBuffer.subarray(0, this.#contextSize);
65
- return result.output.data.at(0);
66
- });
11
/**
 * Stateful wrapper that feeds fixed-size audio windows to an ONNX inference
 * session.
 *
 * Every call to `run` sends the model one window of samples prefixed with the
 * trailing `contextSize` samples of the previous window.
 */
class OnnxModel {
  #session;
  #sampleRate;
  #windowSizeSamples;
  #contextSize;
  #sampleRateNd;
  #context;
  #rnnState;
  #inputBuffer;

  /**
   * @param session - Object exposing an async `run(feeds)` method.
   * @param sampleRate - Sample rate in Hz; only 8000 and 16000 are supported.
   * @throws {Error} If `sampleRate` is neither 8000 nor 16000.
   */
  constructor(session, sampleRate) {
    this.#session = session;
    this.#sampleRate = sampleRate;
    switch (sampleRate) {
      case 8e3:
        this.#windowSizeSamples = 256;
        this.#contextSize = 32;
        break;
      case 16e3:
        this.#windowSizeSamples = 512;
        this.#contextSize = 64;
        break;
      default:
        // Without this guard the sizes stay undefined and every buffer below
        // is silently created with length 0.
        throw new Error(
          `unsupported sample rate ${String(sampleRate)}, expected 8000 or 16000`
        );
    }
    this.#sampleRateNd = BigInt64Array.from([BigInt(sampleRate)]);
    this.#context = new Float32Array(this.#contextSize);
    // NOTE(review): this state tensor is sent on every call but no model
    // output is ever written back into it, so it stays all zeros. Confirm
    // whether the model's state output is meant to be carried between calls.
    this.#rnnState = new Float32Array(2 * 1 * 128);
    this.#inputBuffer = new Float32Array(this.#contextSize + this.#windowSizeSamples);
  }

  /** Sample rate passed to the constructor, in Hz. */
  get sampleRate() {
    return this.#sampleRate;
  }

  /** Number of new samples consumed by each `run` call. */
  get windowSizeSamples() {
    return this.#windowSizeSamples;
  }

  /** Number of samples carried over from the previous window. */
  get contextSize() {
    return this.#contextSize;
  }

  /**
   * Run one inference step.
   *
   * @param x - Window of samples; it is written into the input buffer right
   *   after the context, so it must not be longer than `windowSizeSamples`.
   * @returns The first element of the session's `output` tensor.
   */
  async run(x) {
    this.#inputBuffer.set(this.#context, 0);
    this.#inputBuffer.set(x, this.#contextSize);
    const result = await this.#session.run({
      input: new Tensor("float32", this.#inputBuffer, [
        1,
        this.#contextSize + this.#windowSizeSamples
      ]),
      state: new Tensor("float32", this.#rnnState, [2, 1, 128]),
      sr: new Tensor("int64", this.#sampleRateNd)
    });
    // Keep a copy of the LAST `contextSize` samples for the next call. Taking
    // a view of the first `contextSize` samples would alias the region the
    // context itself is copied into, so the context would never advance.
    this.#context.set(this.#inputBuffer.subarray(this.#windowSizeSamples));
    return result.output.data.at(0);
  }
}
64
+ export {
65
+ OnnxModel,
66
+ newInferenceSession
67
+ };
69
68
  //# sourceMappingURL=onnx_model.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"onnx_model.js","sourceRoot":"","sources":["../src/onnx_model.ts"],"names":[],"mappings":"AAAA,6CAA6C;AAC7C,EAAE;AACF,sCAAsC;AACtC,OAAO,EAAE,gBAAgB,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAI5D,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,QAAiB,EAAE,EAAE;IACvD,OAAO,gBAAgB,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,iBAAiB,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE;QACnF,iBAAiB,EAAE,CAAC;QACpB,iBAAiB,EAAE,CAAC;QACpB,aAAa,EAAE,YAAY;QAC3B,kBAAkB,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;KAC7D,CAAC,CAAC;AACL,CAAC,CAAC;AAEF,MAAM,OAAO,SAAS;IACpB,QAAQ,CAAmB;IAC3B,WAAW,CAAS;IACpB,kBAAkB,CAAS;IAC3B,YAAY,CAAS;IACrB,aAAa,CAAgB;IAC7B,QAAQ,CAAe;IACvB,wBAAwB;IACxB,SAAS,CAAe;IACxB,YAAY,CAAe;IAE3B,YAAY,OAAyB,EAAE,UAAsB;QAC3D,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;QACxB,IAAI,CAAC,WAAW,GAAG,UAAU,CAAC;QAE9B,QAAQ,UAAU,EAAE,CAAC;YACnB,KAAK,IAAI;gBACP,IAAI,CAAC,kBAAkB,GAAG,GAAG,CAAC;gBAC9B,IAAI,CAAC,YAAY,GAAG,EAAE,CAAC;gBACvB,MAAM;YACR,KAAK,KAAK;gBACR,IAAI,CAAC,kBAAkB,GAAG,GAAG,CAAC;gBAC9B,IAAI,CAAC,YAAY,GAAG,EAAE,CAAC;gBACvB,MAAM;QACV,CAAC;QAED,IAAI,CAAC,aAAa,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC9D,IAAI,CAAC,QAAQ,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QACpD,IAAI,CAAC,SAAS,GAAG,IAAI,YAAY,CAAC,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC;QAC/C,IAAI,CAAC,YAAY,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACpF,CAAC;IAED,IAAI,UAAU;QACZ,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED,IAAI,iBAAiB;QACnB,OAAO,IAAI,CAAC,kBAAkB,CAAC;IACjC,CAAC;IAED,IAAI,WAAW;QACb,OAAO,IAAI,CAAC,YAAY,CAAC;IAC3B,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,CAAe;QACvB,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;QACxC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,YAAY,CAAC,CAAC;QAE5C,OAAO,MAAM,IAAI,CAAC,QAAQ;aACvB,GAAG,CAAC;YACH,KAAK,EAAE,IAAI,MAAM,CAAC,SAAS,EAAE,IAAI,CAAC,YAAY,EAAE;gBAC9C,CAAC;gBACD,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,kBAAkB;aAC5C,CAAC;YACF,KAAK,EAAE,IAAI,MAAM,CAAC,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,GAAG,CAAC,CAA
C;YACzD,EAAE,EAAE,IAAI,MAAM,CAAC,OAAO,EAAE,IAAI,CAAC,aAAa,CAAC;SAC5C,CAAC;aACD,IAAI,CAAC,CAAC,MAAM,EAAE,EAAE;YACf,oDAAoD;YACpD,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC,EAAE,IAAI,CAAC,YAAY,CAAC,CAAC;YACjE,OAAQ,MAAM,CAAC,MAAO,CAAC,IAAqB,CAAC,EAAE,CAAC,CAAC,CAAE,CAAC;QACtD,CAAC,CAAC,CAAC;IACP,CAAC;CACF"}
1
+ {"version":3,"sources":["../src/onnx_model.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { fileURLToPath } from 'node:url';\nimport { InferenceSession, Tensor } from 'onnxruntime-node';\n\nexport type SampleRate = 8000 | 16000;\n\nexport const newInferenceSession = (forceCPU: boolean) => {\n return InferenceSession.create(fileURLToPath(new URL('silero_vad.onnx', import.meta.url).href), {\n interOpNumThreads: 1,\n intraOpNumThreads: 1,\n executionMode: 'sequential',\n executionProviders: forceCPU ? [{ name: 'cpu' }] : undefined,\n });\n};\n\nexport class OnnxModel {\n #session: InferenceSession;\n #sampleRate: number;\n #windowSizeSamples: number;\n #contextSize: number;\n #sampleRateNd: BigInt64Array;\n #context: Float32Array;\n // #state: Float32Array;\n #rnnState: Float32Array;\n #inputBuffer: Float32Array;\n\n constructor(session: InferenceSession, sampleRate: SampleRate) {\n this.#session = session;\n this.#sampleRate = sampleRate;\n\n switch (sampleRate) {\n case 8000:\n this.#windowSizeSamples = 256;\n this.#contextSize = 32;\n break;\n case 16000:\n this.#windowSizeSamples = 512;\n this.#contextSize = 64;\n break;\n }\n\n this.#sampleRateNd = BigInt64Array.from([BigInt(sampleRate)]);\n this.#context = new Float32Array(this.#contextSize);\n this.#rnnState = new Float32Array(2 * 1 * 128);\n this.#inputBuffer = new Float32Array(this.#contextSize + this.#windowSizeSamples);\n }\n\n get sampleRate(): number {\n return this.#sampleRate;\n }\n\n get windowSizeSamples(): number {\n return this.#windowSizeSamples;\n }\n\n get contextSize(): number {\n return this.#contextSize;\n }\n\n async run(x: Float32Array): Promise<number> {\n this.#inputBuffer.set(this.#context, 0);\n this.#inputBuffer.set(x, this.#contextSize);\n\n return await this.#session\n .run({\n input: new Tensor('float32', this.#inputBuffer, [\n 1,\n this.#contextSize + this.#windowSizeSamples,\n ]),\n state: new 
Tensor('float32', this.#rnnState, [2, 1, 128]),\n sr: new Tensor('int64', this.#sampleRateNd),\n })\n .then((result) => {\n // this.#state = result.output.data as Float32Array,\n this.#context = this.#inputBuffer.subarray(0, this.#contextSize);\n return (result.output!.data as Float32Array).at(0)!;\n });\n }\n}\n"],"mappings":"AAGA,SAAS,qBAAqB;AAC9B,SAAS,kBAAkB,cAAc;AAIlC,MAAM,sBAAsB,CAAC,aAAsB;AACxD,SAAO,iBAAiB,OAAO,cAAc,IAAI,IAAI,mBAAmB,YAAY,GAAG,EAAE,IAAI,GAAG;AAAA,IAC9F,mBAAmB;AAAA,IACnB,mBAAmB;AAAA,IACnB,eAAe;AAAA,IACf,oBAAoB,WAAW,CAAC,EAAE,MAAM,MAAM,CAAC,IAAI;AAAA,EACrD,CAAC;AACH;AAEO,MAAM,UAAU;AAAA,EACrB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAEA;AAAA,EACA;AAAA,EAEA,YAAY,SAA2B,YAAwB;AAC7D,SAAK,WAAW;AAChB,SAAK,cAAc;AAEnB,YAAQ,YAAY;AAAA,MAClB,KAAK;AACH,aAAK,qBAAqB;AAC1B,aAAK,eAAe;AACpB;AAAA,MACF,KAAK;AACH,aAAK,qBAAqB;AAC1B,aAAK,eAAe;AACpB;AAAA,IACJ;AAEA,SAAK,gBAAgB,cAAc,KAAK,CAAC,OAAO,UAAU,CAAC,CAAC;AAC5D,SAAK,WAAW,IAAI,aAAa,KAAK,YAAY;AAClD,SAAK,YAAY,IAAI,aAAa,IAAI,IAAI,GAAG;AAC7C,SAAK,eAAe,IAAI,aAAa,KAAK,eAAe,KAAK,kBAAkB;AAAA,EAClF;AAAA,EAEA,IAAI,aAAqB;AACvB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,oBAA4B;AAC9B,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,IAAI,cAAsB;AACxB,WAAO,KAAK;AAAA,EACd;AAAA,EAEA,MAAM,IAAI,GAAkC;AAC1C,SAAK,aAAa,IAAI,KAAK,UAAU,CAAC;AACtC,SAAK,aAAa,IAAI,GAAG,KAAK,YAAY;AAE1C,WAAO,MAAM,KAAK,SACf,IAAI;AAAA,MACH,OAAO,IAAI,OAAO,WAAW,KAAK,cAAc;AAAA,QAC9C;AAAA,QACA,KAAK,eAAe,KAAK;AAAA,MAC3B,CAAC;AAAA,MACD,OAAO,IAAI,OAAO,WAAW,KAAK,WAAW,CAAC,GAAG,GAAG,GAAG,CAAC;AAAA,MACxD,IAAI,IAAI,OAAO,SAAS,KAAK,aAAa;AAAA,IAC5C,CAAC,EACA,KAAK,CAAC,WAAW;AAEhB,WAAK,WAAW,KAAK,aAAa,SAAS,GAAG,KAAK,YAAY;AAC/D,aAAQ,OAAO,OAAQ,KAAsB,GAAG,CAAC;AAAA,IACnD,CAAC;AAAA,EACL;AACF;","names":[]}
@@ -0,0 +1,2 @@
1
+ "use strict";
2
+ //# sourceMappingURL=onnxruntime.d.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
@@ -0,0 +1 @@
1
+ //# sourceMappingURL=onnxruntime.d.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
package/dist/vad.cjs ADDED
@@ -0,0 +1,292 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+ var vad_exports = {};
20
+ __export(vad_exports, {
21
+ VAD: () => VAD,
22
+ VADStream: () => VADStream
23
+ });
24
+ module.exports = __toCommonJS(vad_exports);
25
+ var import_agents = require("@livekit/agents");
26
+ var import_rtc_node = require("@livekit/rtc-node");
27
+ var import_onnx_model = require("./onnx_model.cjs");
28
// Accumulated inference lag, in milliseconds, above which the stream logs
// that inference is slower than realtime.
const SLOW_INFERENCE_THRESHOLD = 200;

// Defaults merged under the caller's options in `VAD.load`.
// The duration fields are compared against, or scaled from, millisecond
// values in the stream loop.
const defaultVADOptions = {
  // Speech must persist this long (ms) before START_OF_SPEECH is emitted.
  minSpeechDuration: 50,
  // Silence must exceed this long (ms) before END_OF_SPEECH is emitted.
  minSilenceDuration: 250,
  // Audio (ms) kept in front of the detected speech start.
  prefixPaddingDuration: 500,
  // Used together with the input sample rate to size the speech buffer.
  maxBufferedSpeech: 60000,
  // Probability above which a window counts as speech.
  activationThreshold: 0.5,
  // Sample rate (Hz) the model runs at.
  sampleRate: 16000,
  // Passed to `newInferenceSession` to restrict execution to the CPU provider.
  forceCPU: true
};
38
/**
 * Silero-backed voice activity detector.
 *
 * Holds one shared inference session plus the resolved options; each call to
 * `stream()` wraps that session in a fresh `OnnxModel`.
 */
class VAD extends import_agents.VAD {
  #session;
  #opts;
  label = "silero.VAD";

  /**
   * @param session - Inference session shared by every stream of this VAD.
   * @param opts - Fully resolved VAD options.
   */
  constructor(session, opts) {
    super({ updateInterval: 32 });
    this.#session = session;
    this.#opts = opts;
  }

  /**
   * Load the Silero model and build a ready-to-use VAD.
   *
   * Any option that is not supplied falls back to `defaultVADOptions`.
   *
   * @remarks
   * Creating the inference session is asynchronous and may take time, so it
   * is recommended to call this from your prewarm hook.
   *
   * @example
   * ```ts
   * export default defineAgent({
   *   prewarm: async (proc: JobProcess) => {
   *     proc.userData.vad = await VAD.load();
   *   },
   *   entry: async (ctx: JobContext) => {
   *     const vad = ctx.proc.userData.vad! as VAD;
   *     // the rest of your agent logic
   *   },
   * });
   * ```
   *
   * @param opts - Partial options overriding the defaults.
   * @returns Promise\<{@link VAD}\>: An instance of the VAD class ready for streaming.
   */
  static async load(opts = {}) {
    const resolved = { ...defaultVADOptions, ...opts };
    const createSession = import_onnx_model.newInferenceSession;
    const session = await createSession(resolved.forceCPU);
    return new VAD(session, resolved);
  }

  /**
   * Open a new detection stream backed by its own `OnnxModel`.
   *
   * @returns A new {@link VADStream}.
   */
  stream() {
    const options = this.#opts;
    const model = new import_onnx_model.OnnxModel(this.#session, options.sampleRate);
    return new VADStream(this, options, model);
  }
}
82
/**
 * Streaming voice activity detection over incoming audio frames.
 *
 * Frames read from `this.input` are (optionally) resampled to the model rate,
 * cut into model-sized windows, scored, and turned into INFERENCE_DONE,
 * START_OF_SPEECH and END_OF_SPEECH events on `this.queue`.
 */
class VADStream extends import_agents.VADStream {
  #opts;
  #model;
  #task;
  #expFilter = new import_agents.ExpFilter(0.35);
  #extraInferenceTime = 0;
  #logger = (0, import_agents.log)();

  /**
   * @param vad - Owning VAD instance, forwarded to the base class.
   * @param opts - Resolved VAD options.
   * @param model - Model wrapper used to score each window.
   */
  constructor(vad, opts, model) {
    super(vad);
    this.#opts = opts;
    this.#model = model;
    // Start the processing loop directly instead of wrapping it in
    // `new Promise(async () => …)`, which never settles. A failure rejects
    // `#task`; nothing awaits it here.
    this.#task = this.#run();
  }

  /**
   * Main processing loop: consumes `this.input` until it ends.
   */
  async #run() {
    const windowSizeSamples = this.#model.windowSizeSamples;
    // Audio time covered by one model window, in milliseconds.
    const windowDuration = windowSizeSamples / this.#opts.sampleRate * 1e3;

    let speechBuffer = null;
    let speechBufferMaxReached = false;
    let speechBufferIndex = 0;
    let pubSpeaking = false;
    let pubSpeechDuration = 0;
    let pubSilenceDuration = 0;
    let pubCurrentSample = 0;
    let pubTimestamp = 0;
    let pubSampleRate = 0;
    let pubPrefixPaddingSamples = 0;
    let speechThresholdDuration = 0;
    let silenceThresholdDuration = 0;
    let inputFrames = [];
    let inferenceFrames = [];
    let resampler = null;
    let inputCopyRemainingFrac = 0;

    // Move the last `pubPrefixPaddingSamples` samples to the front of the
    // speech buffer and continue writing right after them.
    const resetWriteCursor = () => {
      if (!speechBuffer) throw new Error("speechBuffer is empty");
      if (speechBufferIndex <= pubPrefixPaddingSamples) {
        return;
      }
      const paddingData = speechBuffer.subarray(
        speechBufferIndex - pubPrefixPaddingSamples,
        speechBufferIndex
      );
      speechBuffer.set(paddingData, 0);
      speechBufferIndex = pubPrefixPaddingSamples;
      speechBufferMaxReached = false;
    };

    // Snapshot the buffered speech. `slice` copies: the buffer is rewritten
    // by `resetWriteCursor` and by later windows, so a `subarray` view handed
    // to a consumer would change underneath it.
    const copySpeechBuffer = () => {
      if (!speechBuffer) throw new Error("speechBuffer is empty");
      return new import_rtc_node.AudioFrame(
        speechBuffer.slice(0, speechBufferIndex),
        pubSampleRate,
        1,
        speechBufferIndex
      );
    };

    for await (const frame of this.input) {
      if (typeof frame === "symbol") {
        // Non-frame sentinel values carry no audio.
        continue;
      }

      if (!pubSampleRate || !speechBuffer) {
        // First frame: lock the input sample rate and size the buffers.
        pubSampleRate = frame.sampleRate;
        pubPrefixPaddingSamples = Math.trunc(
          this.#opts.prefixPaddingDuration * pubSampleRate / 1e3
        );
        // `maxBufferedSpeech` is in milliseconds like the other durations, so
        // convert to seconds before multiplying by the sample rate.
        const maxBufferedSamples = Math.trunc(
          this.#opts.maxBufferedSpeech * pubSampleRate / 1e3
        );
        speechBuffer = new Int16Array(maxBufferedSamples + pubPrefixPaddingSamples);
        if (this.#opts.sampleRate !== pubSampleRate) {
          resampler = new import_rtc_node.AudioResampler(
            pubSampleRate,
            this.#opts.sampleRate,
            1,
            import_rtc_node.AudioResamplerQuality.QUICK
            // VAD doesn't need high quality
          );
        }
      } else if (frame.sampleRate !== pubSampleRate) {
        this.#logger.error("a frame with a different sample rate was already published");
        continue;
      }

      inputFrames.push(frame);
      if (resampler) {
        inferenceFrames.push(...resampler.push(frame));
      } else {
        inferenceFrames.push(frame);
      }

      while (true) {
        const startTime = process.hrtime.bigint();
        const availableInferenceSamples = inferenceFrames.reduce(
          (total, pending) => total + pending.samplesPerChannel,
          0
        );
        if (availableInferenceSamples < windowSizeSamples) {
          break;
        }

        const inputFrame = (0, import_agents.mergeFrames)(inputFrames);
        const inferenceFrame = (0, import_agents.mergeFrames)(inferenceFrames);
        // Scale the 16-bit samples to floats for the model.
        const inferenceData = Float32Array.from(
          inferenceFrame.data.subarray(0, windowSizeSamples),
          (x) => x / 32767
        );
        const rawProbability = await this.#model.run(inferenceData);
        const p = this.#expFilter.apply(1, rawProbability);

        pubCurrentSample += windowSizeSamples;
        pubTimestamp += windowDuration;

        // Number of INPUT-rate samples that correspond to this window; the
        // fractional remainder is carried into the next window.
        const resamplingRatio = pubSampleRate / this.#model.sampleRate;
        const toCopy = windowSizeSamples * resamplingRatio + inputCopyRemainingFrac;
        const toCopyInt = Math.trunc(toCopy);
        inputCopyRemainingFrac = toCopy - toCopyInt;

        // The speech buffer holds input-rate audio, so the amount copied is
        // bounded by `toCopyInt`, not by the model window size.
        const availableSpace = speechBuffer.length - speechBufferIndex;
        const toCopyBuffer = Math.min(toCopyInt, availableSpace);
        if (toCopyBuffer > 0) {
          speechBuffer.set(inputFrame.data.subarray(0, toCopyBuffer), speechBufferIndex);
          speechBufferIndex += toCopyBuffer;
        } else if (!speechBufferMaxReached) {
          speechBufferMaxReached = true;
          this.#logger.warn(
            "maxBufferedSpeech reached, ignoring further data for the current speech input"
          );
        }

        // Wall-clock cost of this window in whole milliseconds, and how far
        // processing has fallen behind the audio it covers.
        const inferenceDuration = Number((process.hrtime.bigint() - startTime) / BigInt(1e6));
        this.#extraInferenceTime = Math.max(
          0,
          this.#extraInferenceTime + inferenceDuration - windowDuration
        );
        if (this.#extraInferenceTime > SLOW_INFERENCE_THRESHOLD) {
          this.#logger.child({ delay: this.#extraInferenceTime }).warn("inference is slower than realtime");
        }

        // Published durations measure audio time, so they advance by the
        // window length rather than by how long inference took.
        if (pubSpeaking) {
          pubSpeechDuration += windowDuration;
        } else {
          pubSilenceDuration += windowDuration;
        }

        this.queue.put({
          type: import_agents.VADEventType.INFERENCE_DONE,
          samplesIndex: pubCurrentSample,
          timestamp: pubTimestamp,
          silenceDuration: pubSilenceDuration,
          speechDuration: pubSpeechDuration,
          probability: p,
          inferenceDuration,
          frames: [
            new import_rtc_node.AudioFrame(inputFrame.data.subarray(0, toCopyInt), pubSampleRate, 1, toCopyInt)
          ],
          speaking: pubSpeaking,
          rawAccumulatedSilence: silenceThresholdDuration,
          rawAccumulatedSpeech: speechThresholdDuration
        });

        if (p > this.#opts.activationThreshold) {
          speechThresholdDuration += windowDuration;
          silenceThresholdDuration = 0;
          if (!pubSpeaking && speechThresholdDuration >= this.#opts.minSpeechDuration) {
            pubSpeaking = true;
            pubSilenceDuration = 0;
            pubSpeechDuration = speechThresholdDuration;
            this.queue.put({
              type: import_agents.VADEventType.START_OF_SPEECH,
              samplesIndex: pubCurrentSample,
              timestamp: pubTimestamp,
              silenceDuration: pubSilenceDuration,
              speechDuration: pubSpeechDuration,
              probability: p,
              inferenceDuration,
              frames: [copySpeechBuffer()],
              speaking: pubSpeaking,
              rawAccumulatedSilence: 0,
              rawAccumulatedSpeech: 0
            });
          }
        } else {
          silenceThresholdDuration += windowDuration;
          speechThresholdDuration = 0;
          if (!pubSpeaking) {
            // Not in speech: keep only the prefix padding.
            resetWriteCursor();
          }
          if (pubSpeaking && silenceThresholdDuration > this.#opts.minSilenceDuration) {
            pubSpeaking = false;
            pubSpeechDuration = 0;
            pubSilenceDuration = silenceThresholdDuration;
            this.queue.put({
              type: import_agents.VADEventType.END_OF_SPEECH,
              samplesIndex: pubCurrentSample,
              timestamp: pubTimestamp,
              silenceDuration: pubSilenceDuration,
              speechDuration: pubSpeechDuration,
              probability: p,
              inferenceDuration,
              frames: [copySpeechBuffer()],
              speaking: pubSpeaking,
              rawAccumulatedSilence: 0,
              rawAccumulatedSpeech: 0
            });
            resetWriteCursor();
          }
        }

        // Carry unconsumed samples into the next iteration. The data is mono
        // 16-bit, one array element per sample, so the sample count is the
        // array length.
        inputFrames = [];
        inferenceFrames = [];
        if (inputFrame.data.length > toCopyInt) {
          const data = inputFrame.data.subarray(toCopyInt);
          inputFrames.push(new import_rtc_node.AudioFrame(data, pubSampleRate, 1, data.length));
        }
        if (inferenceFrame.data.length > windowSizeSamples) {
          const data = inferenceFrame.data.subarray(windowSizeSamples);
          inferenceFrames.push(
            new import_rtc_node.AudioFrame(data, this.#opts.sampleRate, 1, data.length)
          );
        }
      }
    }
  }
}
287
+ // Annotate the CommonJS export names for ESM import in node:
288
+ 0 && (module.exports = {
289
+ VAD,
290
+ VADStream
291
+ });
292
+ //# sourceMappingURL=vad.cjs.map