@s-h-a-d-o-w/speech-recorder 2.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.clang-format ADDED
@@ -0,0 +1,2 @@
1
+ BasedOnStyle: Google
2
+ IncludeBlocks: Preserve
@@ -0,0 +1,62 @@
1
+ name: Prebuilds
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "*"
7
+
8
+ permissions:
9
+ contents: write
10
+
11
+ jobs:
12
+ prebuild:
13
+ name: ${{ matrix.platform }} ${{ matrix.arch }}
14
+ runs-on: ${{ matrix.os }}
15
+
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ include:
20
+ - platform: linux
21
+ os: ubuntu-latest
22
+ arch: x64
23
+ - platform: macos
24
+ os: macos-latest
25
+ arch: x64
26
+ - platform: macos
27
+ os: macos-latest
28
+ arch: arm64
29
+ - platform: windows
30
+ os: windows-latest
31
+ arch: x64
32
+
33
+ steps:
34
+ - name: Check out repository
35
+ uses: actions/checkout@v6
36
+
37
+ - name: Set up pnpm
38
+ uses: pnpm/action-setup@v6
39
+
40
+ - name: Set up Node.js
41
+ uses: actions/setup-node@v6
42
+ with:
43
+ node-version: lts/*
44
+
45
+ # - name: Install Linux build dependencies
46
+ # if: runner.os == 'Linux'
47
+ # run: |
48
+ # sudo apt-get update
49
+ # sudo apt-get install -y libasound2-dev
50
+
51
+ - name: Install npm dependencies
52
+ run: pnpm install --frozen-lockfile --ignore-scripts
53
+
54
+ - name: Build native dependencies
55
+ shell: bash
56
+ run: bash setup.sh "${{ matrix.arch }}"
57
+
58
+ - name: Build and upload prebuild
59
+ shell: bash
60
+ env:
61
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
62
+ run: bash build.sh "${{ matrix.arch }}" "$GITHUB_TOKEN"
package/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright 2021 Serenade Labs, Inc.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,122 @@
1
+ # Fork notes
2
+
3
+ ## Prerequisites
4
+
5
+ - cmake
6
+
7
+ ## Notes
8
+
9
+ `pnpm install` reports an error when no build exists yet, but the dependencies it installs are needed in order to build. The error can be ignored.
10
+
11
+ Prebuilds MUST BE PUBLISHED TO GITHUB using `./build.sh <arch> <GITHUB_TOKEN>`! Otherwise, users won't be able to install the package. (The build chain is \*nix-only, which would be a problem for Windows users.)
12
+
13
+ The latest PortAudio release dates from 2021, which is why it already requires the `-DCMAKE_POLICY_VERSION_MINIMUM=3.5` override and may become incompatible sooner or later.
14
+
15
+
16
+ # Speech Recorder
17
+
18
+ speech-recorder is a cross-platform, native [node.js](https://nodejs.org) [addon](http://nodejs.org/api/addons.html) for getting a stream of audio from a device's microphone. Using speech-recorder, you can also get only the audio that corresponds to someone speaking.
19
+
20
+ This module is used for speech recognition in [Serenade](https://serenade.ai). Serenade enables you to write code through natural speech, rather than typing.
21
+
22
+ ## Installation
23
+
24
+ speech-recorder has been tested on Windows 10, macOS 10.14+, and Ubuntu 18.04+ (and may work on other platforms as well).
25
+
26
+ To install speech-recorder, run:
27
+
28
+ yarn add speech-recorder
29
+
30
+ If you're using this library with Electron, you should probably use [electron-rebuild](https://github.com/electron/electron-rebuild).
31
+
32
+ ## Usage
33
+
34
+ This library uses two voice activity detection mechanisms: a fast first pass (the WebRTC VAD), and a slightly slower, but much more accurate, second pass (the Silero VAD). See below for the various options you can supply to each.
35
+
36
+ ### Streaming
37
+
38
+ When you start recording, you can register various callbacks. `onAudio` is called when any audio comes in from the microphone. `onChunkStart` is called when a chunk of speech begins, and `onChunkEnd` is called when speech ends.
39
+
40
+ const { SpeechRecorder } = require("speech-recorder");
41
+
42
+ const recorder = new SpeechRecorder({
43
+ onChunkStart: ({ audio }) => {
44
+ console.log(Date.now(), "Chunk start");
45
+ },
46
+ onAudio: ({ speaking, probability, volume }) => {
47
+ console.log(Date.now(), speaking, probability, volume);
48
+ },
49
+ onChunkEnd: () => {
50
+ console.log(Date.now(), "Chunk end");
51
+ },
52
+ });
53
+
54
+ console.log("Recording for 5 seconds...");
55
+ recorder.start();
56
+ setTimeout(() => {
57
+ console.log("Done!");
58
+ recorder.stop();
59
+ }, 5000);
60
+
61
+ You can write all audio from the microphone to a file with:
62
+
63
+ const { SpeechRecorder } = require("speech-recorder");
64
+
65
+ const writeStream = fs.createWriteStream("audio.raw");
66
+ const recorder = new SpeechRecorder({
67
+ onAudio: ({ audio }) => {
68
+ writeStream.write(audio);
69
+ }
70
+ });
71
+
72
+ Or, just the speech with:
73
+
74
+ const { SpeechRecorder } = require("speech-recorder");
75
+
76
+ const writeStream = fs.createWriteStream("audio.raw");
77
+ const recorder = new SpeechRecorder({
78
+ onAudio: ({ audio, speech }) => {
79
+ if (speech) {
80
+ writeStream.write(audio);
81
+ }
82
+ }
83
+ });
84
+
85
+ ### Devices
86
+
87
+ You can get a list of supported devices with:
88
+
89
+ const { devices } = require("speech-recorder");
90
+
91
+ console.log(devices());
92
+
93
+ ### Options
94
+
95
+ * `consecutiveFramesForSilence`: How many frames of audio must be silent before `onChunkEnd` is fired. Default `10`.
96
+ * `consecutiveFramesForSpeaking`: How many frames of audio must be speech before `onChunkStart` is fired. Default `1`.
97
+ * `device`: ID of the device to use for input (i.e., from the example above). Specify `-1` to use the system default. Default `-1`.
98
+ * `leadingBufferFrames`: How many frames of audio to keep in a buffer that's included in `onChunkStart`. Default `10`.
99
+ * `onChunkStart`: Callback to be executed when speech starts.
100
+ * `onAudio`: Callback to be executed when any audio comes in.
101
+ * `onChunkEnd`: Callback to be executed when speech ends.
102
+ * `samplesPerFrame`: How many audio samples to be included in each frame from the microphone. Default `480`.
103
+ * `sampleRate`: Audio sample rate. Default `16000`.
104
+ * `sileroVadBufferSize`: How many audio samples to pass to the VAD. Default `2000`.
105
+ * `sileroVadRateLimit`: Rate limit, in frames, for how frequently to call the VAD. Default `3`.
106
+ * `sileroVadSilenceThreshold`: Probability threshold for speech to transition to silence. Default `0.1`.
107
+ * `sileroVadSpeakingThreshold`: Probability threshold for silence to transition to speech. Default `0.3`.
108
+ * `webrtcVadLevel`: Aggressiveness for the first-pass VAD filter. `0` is least aggressive, and `3` is most aggressive. Default `3`.
109
+ * `webrtcVadBufferSize`: How many audio samples to pass to the first-pass VAD filter. Default `480`. Can only be `160`, `320`, or `480`.
110
+ * `webrtcVadResultsSize`: How many first-pass VAD filter results to keep in history. Default `10`.
111
+
112
+ ## Building SpeechRecorder
113
+
114
+ If you want to build speech-recorder from source, first install the necessary dependencies by running:
115
+
116
+ ./setup.sh <arch>
117
+
118
+ Where `<arch>` specifies the architecture you'd like to build for and is one of `x64` or `arm64`. If you're not sure, you probably want `x64`.
119
+
120
+ Then, you can build speech-recorder with:
121
+
122
+ ./build.sh <arch>
package/binding.gyp ADDED
@@ -0,0 +1,126 @@
1
+ {
2
+ "targets": [
3
+ {
4
+ "target_name": "speechrecorder",
5
+ "sources": ["src/speech_recorder.cpp"],
6
+ "cflags!": [
7
+ "-fno-exceptions",
8
+ "-fno-rtti",
9
+ ],
10
+ "cflags_cc!": [
11
+ "-fno-exceptions",
12
+ "-fno-rtti",
13
+ ],
14
+ "include_dirs": [
15
+ "<!@(node -p \"require('node-addon-api').include\")",
16
+ "<(module_root_dir)/include",
17
+ "<(module_root_dir)/lib/include",
18
+ "<(module_root_dir)/lib/build/_deps/drwav-src",
19
+ "<(module_root_dir)/lib/build/_deps/readerwriterqueue-src",
20
+ "<(module_root_dir)/lib/3rd_party/webrtcvad",
21
+ "<(module_root_dir)/lib/3rd_party/portaudio/include",
22
+ "<(module_root_dir)/lib/3rd_party/onnxruntime/include",
23
+ ],
24
+ "defines": [
25
+ "NAPI_VERSION=<(napi_build_version)",
26
+ "NAPI_CPP_EXCEPTIONS",
27
+ ],
28
+ "conditions": [
29
+ [
30
+ 'OS=="mac"',
31
+ {
32
+ "xcode_settings": {
33
+ "GCC_ENABLE_CPP_EXCEPTIONS": "YES",
34
+ "GCC_ENABLE_CPP_RTTI": "YES",
35
+ "MACOSX_DEPLOYMENT_TARGET": "10.14",
36
+ "OTHER_LDFLAGS": ["-Wl,-rpath,@loader_path/"],
37
+ },
38
+ "copies": [
39
+ {
40
+ "destination": "<(module_root_dir)/build/Release",
41
+ "files": [
42
+ "<(module_root_dir)/lib/install/lib/libspeechrecorder.dylib",
43
+ "<(module_root_dir)/lib/install/lib/libportaudio.dylib",
44
+ "<(module_root_dir)/lib/install/lib/libonnxruntime.1.10.0.dylib",
45
+ ],
46
+ }
47
+ ],
48
+ "libraries": [
49
+ "<(module_root_dir)/build/Release/libspeechrecorder.dylib",
50
+ "<(module_root_dir)/build/Release/libportaudio.dylib",
51
+ "<(module_root_dir)/build/Release/libonnxruntime.1.10.0.dylib",
52
+ ],
53
+ },
54
+ ],
55
+ [
56
+ 'OS=="win"',
57
+ {
58
+ "msvs_settings": {
59
+ "VCCLCompilerTool": {
60
+ "ExceptionHandling": 1,
61
+ },
62
+ },
63
+ "copies": [
64
+ {
65
+ "destination": "<(module_root_dir)/build/Release",
66
+ "files": [
67
+ "<(module_root_dir)/lib/install/lib/speechrecorder.dll",
68
+ "<(module_root_dir)/lib/install/lib/onnxruntime.dll",
69
+ "<(module_root_dir)/lib/install/lib/onnxruntime_providers_shared.dll",
70
+ ],
71
+ }
72
+ ],
73
+ "libraries": [
74
+ "<(module_root_dir)/lib/install/lib/speechrecorder.lib",
75
+ "<(module_root_dir)/lib/install/lib/onnxruntime.lib",
76
+ "<(module_root_dir)/lib/install/lib/onnxruntime_providers_shared.lib",
77
+ ],
78
+ },
79
+ ],
80
+ [
81
+ 'OS=="win" and target_arch=="x64"',
82
+ {
83
+ "copies": [
84
+ {
85
+ "destination": "<(module_root_dir)/build/Release",
86
+ "files": [
87
+ "<(module_root_dir)/lib/install/lib/portaudio_x64.dll",
88
+ "<(module_root_dir)/lib/3rd_party/vcruntime/x64/vcruntime140.dll",
89
+ "<(module_root_dir)/lib/3rd_party/vcruntime/x64/vcruntime140_1.dll",
90
+ ],
91
+ }
92
+ ],
93
+ "libraries": [
94
+ "<(module_root_dir)/lib/install/lib/portaudio_x64.lib",
95
+ ],
96
+ },
97
+ ],
98
+ [
99
+ 'OS=="linux"',
100
+ {
101
+ "link_settings": {
102
+ "libraries": [
103
+ "-Wl,-rpath,'$$ORIGIN'",
104
+ ]
105
+ },
106
+ "copies": [
107
+ {
108
+ "destination": "<(module_root_dir)/build/Release",
109
+ "files": [
110
+ "<(module_root_dir)/lib/install/lib/libspeechrecorder.so",
111
+ "<(module_root_dir)/lib/install/lib/libportaudio.so",
112
+ "<(module_root_dir)/lib/install/lib/libonnxruntime.so.1.10.0",
113
+ ],
114
+ }
115
+ ],
116
+ "libraries": [
117
+ "<(module_root_dir)/build/Release/libspeechrecorder.so",
118
+ "<(module_root_dir)/build/Release/libportaudio.so",
119
+ "<(module_root_dir)/build/Release/libonnxruntime.so.1.10.0",
120
+ ],
121
+ },
122
+ ],
123
+ ],
124
+ }
125
+ ]
126
+ }
package/build.sh ADDED
@@ -0,0 +1,44 @@
1
+ #!/bin/bash
2
+ # Builds the native library with CMake, then the Node addon and its prebuild.
3
+ set -e
4
+ HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
5
+ pushd "$HERE" &> /dev/null
6
+
7
+ if [[ -z "$1" ]] ; then
8
+   echo "Usage: build.sh x64|arm64 [github-token]"
9
+   exit 1
10
+ fi
11
+ node_arch="$1"
12
+
13
+ rm -rf lib/build lib/install
14
+ mkdir -p lib/build
15
+ cd lib/build
16
+
17
+ # Collect the platform-specific configure flags in an array (no `eval` needed).
18
+ cmake_args=()
19
+ case "$(uname -s)" in
20
+   MINGW*) cmake_args+=(-A x64) ;;
21
+   Darwin)
22
+     case "$node_arch" in
23
+       x64) cmake_args+=(-DCMAKE_OSX_ARCHITECTURES=x86_64) ;;
24
+       arm64) cmake_args+=(-DCMAKE_OSX_ARCHITECTURES=arm64) ;;
25
+       # An unknown value used to skip configuration and fail later in the build.
26
+       *) echo "Unsupported macOS architecture: $node_arch" >&2 ; exit 1 ;;
27
+     esac
28
+     ;;
29
+ esac
30
+ cmake "${cmake_args[@]}" ..
31
+ cmake --build . --config Release
32
+ cmake --install . --prefix ../install
33
+
34
+ cd ../..
35
+ rm -rf prebuilds
36
+ npm_config_arch="$node_arch" ./node_modules/.bin/node-gyp rebuild
37
+
38
+ # An array (not `eval`) keeps the optional upload token a single literal argument.
39
+ prebuild_args=(-r napi --include-regex '.(node|a|dylib|dll|so.*)$' "--arch=$node_arch")
40
+ if [[ -n "$2" ]] ; then
41
+   prebuild_args+=(--upload "$2")
42
+ fi
43
+ ./node_modules/.bin/prebuild "${prebuild_args[@]}"
44
+ popd &> /dev/null
@@ -0,0 +1,124 @@
1
+ const fs = require("fs");
2
+ const path = require("path");
3
+ const { SpeechRecorder } = require("../src/index");
4
+
5
+ const quantile = (elements, q) => {
6
+ const sorted = elements.sort((a, b) => a - b);
7
+ const p = (sorted.length - 1) * q;
8
+ const base = Math.floor(p);
9
+ const rest = p - base;
10
+ if (sorted[base + 1] !== undefined) {
11
+ return sorted[base] + rest * (sorted[base + 1] - sorted[base]);
12
+ } else {
13
+ return sorted[base];
14
+ }
15
+ };
16
+
17
+ if (process.argv.length < 4) {
18
+ console.log("Usage: node analyze-files.js /path/to/wav/files /path/to/labels");
19
+ process.exit(1);
20
+ }
21
+
22
+ let currentFile;
23
+ let samples = 0;
24
+ const leadingBufferFrames = 10;
25
+ const sampleRate = 16000;
26
+ const samplesPerFrame = 480;
27
+ let results = {};
28
+ let labels = JSON.parse(fs.readFileSync(process.argv[3], "utf8"));
29
+
30
+ const recorder = new SpeechRecorder({
31
+ leadingBufferFrames,
32
+ samplesPerFrame,
33
+ sampleRate,
34
+ onAudio: ({ audio, probability, volume }) => {
35
+ samples += audio.length;
36
+ },
37
+
38
+ onChunkStart: ({ audio }) => {
39
+ results[currentFile].speech.push([]);
40
+ results[currentFile].speech[results[currentFile].speech.length - 1].push(samples / sampleRate);
41
+ },
42
+
43
+ onChunkEnd: () => {
44
+ results[currentFile].speech[results[currentFile].speech.length - 1].push(samples / sampleRate);
45
+ },
46
+ });
47
+
48
+ fs.readdir(process.argv[2], async (error, files) => {
49
+ for (const file of files) {
50
+ if (!file.endsWith(".wav")) {
51
+ continue;
52
+ }
53
+
54
+ currentFile = file;
55
+ samples = 0;
56
+ results[file] = { speech: [] };
57
+ console.log(`Processing ${file}...`);
58
+ recorder.processFile(path.join(process.argv[2], file));
59
+ }
60
+
61
+ let speechWindowTooSmall = [];
62
+ let noiseWasSpeech = [];
63
+ let noise = 0;
64
+ let speech = 0;
65
+ let extra = [];
66
+ for (const i of Object.keys(results)) {
67
+ const label = labels[i].speech;
68
+ const result = results[i].speech;
69
+
70
+ if (label.length == 0) {
71
+ noise++;
72
+ } else {
73
+ speech++;
74
+ }
75
+
76
+ if (label.length == 0 && result.length > 0) {
77
+ console.log("Noise was speech:", i);
78
+ console.log("VAD:", result);
79
+ noiseWasSpeech.push(i);
80
+ }
81
+
82
+ if (label.length > 0 && result.length > 0) {
83
+ const start = Math.min(...result.map((e) => e[0]));
84
+ const stop = Math.max(...result.map((e) => e[1]));
85
+ if (isNaN(start) || isNaN(stop)) {
86
+ continue;
87
+ }
88
+
89
+ const tolerance = 0.05;
90
+ if (
91
+ start - (leadingBufferFrames * samplesPerFrame) / sampleRate > label[0] + tolerance ||
92
+ stop < label[1] - tolerance
93
+ ) {
94
+ console.log("Speech window too small:", i);
95
+ console.log("Label:", label);
96
+ console.log("VAD:", result, start, stop);
97
+ speechWindowTooSmall.push(i);
98
+ } else if (stop > label[1]) {
99
+ extra.push(stop - label[1]);
100
+ }
101
+ }
102
+ }
103
+
104
+ console.log(
105
+ `\nSpeech window too small: ${(speechWindowTooSmall.length / speech).toFixed(2)} (${
106
+ speechWindowTooSmall.length
107
+ } / ${speech})`
108
+ );
109
+
110
+ console.log(
111
+ `Noise was speech: ${noise > 0 ? (noiseWasSpeech.length / noise).toFixed(2) : 0} (${
112
+ noiseWasSpeech.length
113
+ } / ${noise})`
114
+ );
115
+
116
+ if (extra.length > 0) {
117
+ console.log(
118
+ `Average extra speech: ${(extra.reduce((a, b) => a + b) / extra.length).toFixed(2)}`
119
+ );
120
+ console.log(`p50 extra speech: ${quantile(extra, 0.5).toFixed(2)}`);
121
+ console.log(`p90 extra speech: ${quantile(extra, 0.75).toFixed(2)}`);
122
+ console.log(`Max extra speech: ${Math.max(...extra).toFixed(2)}`);
123
+ }
124
+ });
@@ -0,0 +1,3 @@
1
+ const { devices } = require("../src/index");
2
+
3
+ console.log(devices());
@@ -0,0 +1,20 @@
1
+ const { SpeechRecorder } = require("../src/index.js");
2
+
3
+ const recorder = new SpeechRecorder({
4
+ onChunkStart: () => {
5
+ console.log(Date.now(), "Chunk start");
6
+ },
7
+ onAudio: ({ speaking, probability, volume }) => {
8
+ console.log(Date.now(), speaking, probability, volume);
9
+ },
10
+ onChunkEnd: () => {
11
+ console.log(Date.now(), "Chunk end");
12
+ },
13
+ });
14
+
15
+ console.log("Recording...");
16
+ recorder.start();
17
+ setTimeout(() => {
18
+ console.log("Done!");
19
+ recorder.stop();
20
+ }, 60000);
@@ -0,0 +1,31 @@
1
+ const fs = require("fs");
2
+ const { SpeechRecorder } = require("../src/index");
3
+ const { WaveFile } = require("wavefile");
4
+
5
+ if (process.argv.length < 3) {
6
+ console.log("Usage: node record.js /path/to/output.wav");
7
+ process.exit(1);
8
+ }
9
+
10
+ let buffer = [];
11
+ const sampleRate = 16000;
12
+ const recorder = new SpeechRecorder({
13
+ onAudio: ({ audio }) => {
14
+ for (let i = 0; i < audio.length; i++) {
15
+ buffer.push(audio[i]);
16
+ }
17
+
18
+ if (buffer.length >= sampleRate * 5) {
19
+ let wav = new WaveFile();
20
+ wav.fromScratch(1, sampleRate, "16", buffer);
21
+ fs.writeFileSync(process.argv[2], wav.toBuffer());
22
+ process.exit(1);
23
+ }
24
+ },
25
+ });
26
+
27
+ console.log("Ready...");
28
+ setTimeout(() => {
29
+ console.log("Go!");
30
+ recorder.start();
31
+ }, 1000);
@@ -0,0 +1,48 @@
1
+ #pragma once
2
+
3
+ #include <napi.h>
4
+
5
+ #include <atomic>
6
+ #include <functional>
7
+ #include <thread>
8
+
9
+ #include "aligned.h"
10
+ #include "chunk_processor.h"
11
+
12
+ struct SpeechRecorderCallbackData {
13
+ std::string event = "";
14
+ std::vector<short> audio;
15
+ bool speaking = false;
16
+ double volume = 0.0;
17
+ bool speech = false;
18
+ double probability = 0.0;
19
+ int consecutiveSilence = 0;
20
+ };
21
+
22
+ class SpeechRecorder : public Napi::ObjectWrap<SpeechRecorder> {
23
+ private:
24
+ std::thread thread_;
25
+ Napi::ThreadSafeFunction threadSafeFunction_;
26
+ std::atomic<bool> stopped_;
27
+ BlockingReaderWriterQueue<SpeechRecorderCallbackData*> queue_;
28
+ Napi::FunctionReference callback_;
29
+ std::function<void(Napi::Env, Napi::Function, SpeechRecorderCallbackData*)>
30
+ threadSafeFunctionCallback_;
31
+ std::string modelPath_;
32
+ speechrecorder::ChunkProcessorOptions options_;
33
+ speechrecorder::ChunkProcessor processor_;
34
+ std::unique_ptr<speechrecorder::ChunkProcessor> processFileProcessor_;
35
+
36
+ void ProcessFile(const Napi::CallbackInfo& info);
37
+ void Start(const Napi::CallbackInfo& info);
38
+ void Stop(const Napi::CallbackInfo& info);
39
+
40
+ public:
41
+ SpeechRecorder(const Napi::CallbackInfo& info);
42
+ static Napi::Object Init(Napi::Env env, Napi::Object exports);
43
+
44
+ ALIGNED
45
+ };
46
+
47
+ Napi::Value GetDevices(const Napi::CallbackInfo& info);
48
+ Napi::Object Init(Napi::Env env, Napi::Object exports);
Binary file
package/package.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "name": "@s-h-a-d-o-w/speech-recorder",
3
+ "version": "2.1.10",
4
+ "description": "A node.js library for streaming audio and speech from the microphone.",
5
+ "main": "src/index.js",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "git+https://github.com/s-h-a-d-o-w/speech-recorder.git"
9
+ },
10
+ "author": "Andreas Opferkuch",
11
+ "license": "MIT",
12
+ "gypfile": true,
13
+ "binary": {
14
+ "napi_versions": [
15
+ 10
16
+ ]
17
+ },
18
+ "publishConfig": {
19
+ "access": "public"
20
+ },
21
+ "scripts": {
22
+ "build": "bash build.sh",
23
+ "clean": "rm -rf build prebuilds lib/build lib/install",
24
+ "install": "prebuild-install -r napi || node-gyp rebuild"
25
+ },
26
+ "dependencies": {
27
+ "bindings": "^1.5.0",
28
+ "node-addon-api": "^8.8.0",
29
+ "prebuild-install": "^7.1.3"
30
+ },
31
+ "devDependencies": {
32
+ "node-gyp": "^13.0.0",
33
+ "prebuild": "^13.0.1",
34
+ "wavefile": "^11.0.0"
35
+ },
36
+ "packageManager": "pnpm@11.7.0+sha512.19cc852c120c7125760f2443ee6be0ca5b40f9f50598de1a09a1f177503e010e57c23c77646e01e761de59bf874fb22a3398c33ab9691fc13eb946b6f0f4d620"
37
+ }
@@ -0,0 +1,5 @@
1
+ allowBuilds:
2
+ es5-ext: true
3
+
4
+ overrides:
5
+ node-gyp: "^13.0.0"
package/setup.sh ADDED
@@ -0,0 +1,72 @@
1
+ #!/bin/bash
2
+ # Fetches and builds PortAudio and ONNX Runtime into lib/3rd_party.
3
+ set -e
4
+ HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
5
+ pushd "$HERE" &> /dev/null
6
+
7
+ if [[ -z "$1" ]] ; then
8
+   echo "Usage: setup.sh x64|arm64"
9
+   exit 1
10
+ fi
11
+ os="$(uname -s)"
12
+
13
+ # On macOS the argument picks the binaries; an unknown value used to fall back to the Linux archive.
14
+ if [[ "$os" == "Darwin" && "$1" != "x64" && "$1" != "arm64" ]] ; then
15
+   echo "Unsupported macOS architecture: $1" >&2
16
+   exit 1
17
+ fi
18
+ rm -rf tmp lib/3rd_party/portaudio lib/3rd_party/onnxruntime
19
+
20
+ mkdir -p tmp/portaudio
21
+ cd tmp/portaudio
22
+ # -f makes curl fail on an HTTP error instead of saving the error page.
23
+ curl -fLo portaudio.tgz http://files.portaudio.com/archives/pa_stable_v190700_20210406.tgz
24
+ tar xvf portaudio.tgz
25
+
26
+ cd portaudio
27
+ mkdir dist install
28
+ cd dist
29
+
30
+ # Build the configure arguments as an array so no `eval` is needed.
31
+ portaudio_args=(-DCMAKE_POLICY_VERSION_MINIMUM=3.5)
32
+ if [[ "$os" == "MINGW"* ]] ; then
33
+   portaudio_args+=(-A x64)
34
+ elif [[ "$os" == "Darwin" ]] ; then
35
+   portaudio_args+=(-DCMAKE_OSX_DEPLOYMENT_TARGET=10.14)
36
+   if [[ "$1" == "x64" ]] ; then
37
+     portaudio_args+=(-DCMAKE_OSX_ARCHITECTURES=x86_64)
38
+   else
39
+     portaudio_args+=(-DCMAKE_OSX_ARCHITECTURES=arm64)
40
+   fi
41
+ fi
42
+ cmake "${portaudio_args[@]}" ..
43
+ cmake --build . --config Release
44
+ cmake --install . --prefix ../install
45
+ cp -r ../install ../../../../lib/3rd_party/portaudio
46
+
47
+ cd ../../..
48
+ mkdir onnxruntime
49
+ cd onnxruntime
50
+
51
+ if [[ "$os" == "MINGW"* ]] ; then
52
+   mkdir -p ../../lib/3rd_party/onnxruntime/lib
53
+   curl -fLo onnxruntime.zip https://www.nuget.org/api/v2/package/Microsoft.ML.OnnxRuntime/1.10.0
54
+   unzip onnxruntime.zip
55
+   cp -r build/native/include ../../lib/3rd_party/onnxruntime
56
+   cp runtimes/win-x64/native/*.dll ../../lib/3rd_party/onnxruntime/lib
57
+   cp runtimes/win-x64/native/*.lib ../../lib/3rd_party/onnxruntime/lib
58
+ else
59
+   path="onnxruntime-linux-x64-1.10.0"
60
+   if [[ "$os" == "Darwin" && "$1" == "x64" ]] ; then
61
+     path="onnxruntime-osx-x86_64-1.10.0"
62
+   elif [[ "$os" == "Darwin" ]] ; then
63
+     path="onnxruntime-osx-arm64-1.10.0"
64
+   fi
65
+   curl -fLo onnxruntime.tgz "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/$path.tgz"
66
+   tar xvf onnxruntime.tgz
67
+   cp -r "$path" ../../lib/3rd_party/onnxruntime
68
+ fi
69
+
70
+ cd ../..
71
+ rm -rf tmp
72
+ popd &> /dev/null
package/src/index.js ADDED
@@ -0,0 +1,72 @@
1
+ const path = require("path");
2
+ const { SpeechRecorder, devices } = require("bindings")("speechrecorder.node");
3
+
4
+ class Wrapper {
5
+ constructor(options, model) {
6
+ options = options ? options : {};
7
+ options.consecutiveFramesForSilence =
8
+ options.consecutiveFramesForSilence !== undefined ? options.consecutiveFramesForSilence : 10;
9
+ options.consecutiveFramesForSpeaking =
10
+ options.consecutiveFramesForSpeaking !== undefined ? options.consecutiveFramesForSpeaking : 1;
11
+ options.device = options.device !== undefined ? options.device : -1;
12
+ options.leadingBufferFrames =
13
+ options.leadingBufferFrames !== undefined ? options.leadingBufferFrames : 10;
14
+ options.onChunkStart = options.onChunkStart !== undefined ? options.onChunkStart : (data) => {};
15
+ options.onAudio =
16
+ options.onAudio !== undefined
17
+ ? options.onAudio
18
+ : (audio, speaking, volume, speech, probability) => {};
19
+ options.onChunkEnd = options.onChunkEnd !== undefined ? options.onChunkEnd : (data) => {};
20
+ options.samplesPerFrame = options.samplesPerFrame !== undefined ? options.samplesPerFrame : 480;
21
+ options.sampleRate = options.sampleRate !== undefined ? options.sampleRate : 16000;
22
+ options.sileroVadBufferSize =
23
+ options.sileroVadBufferSize !== undefined ? options.sileroVadBufferSize : 2000;
24
+ options.sileroVadRateLimit =
25
+ options.sileroVadRateLimit !== undefined ? options.sileroVadRateLimit : 3;
26
+ options.sileroVadSilenceThreshold =
27
+ options.sileroVadSilenceThreshold !== undefined ? options.sileroVadSilenceThreshold : 0.1;
28
+ options.sileroVadSpeakingThreshold =
29
+ options.sileroVadSpeakingThreshold !== undefined ? options.sileroVadSpeakingThreshold : 0.3;
30
+ options.webrtcVadLevel = options.webrtcVadLevel !== undefined ? options.webrtcVadLevel : 3;
31
+ options.webrtcVadBufferSize =
32
+ options.webrtcVadBufferSize !== undefined ? options.webrtcVadBufferSize : 480;
33
+ options.webrtcVadResultsSize =
34
+ options.webrtcVadResultsSize !== undefined ? options.webrtcVadResultsSize : 10;
35
+
36
+ this.inner = new SpeechRecorder(
37
+ model !== undefined ? model : path.join(__dirname, "..", "lib", "resources", "vad.onnx"),
38
+ (event, data) => {
39
+ if (event == "chunkStart") {
40
+ options.onChunkStart({ audio: data.audio });
41
+ } else if (event == "audio") {
42
+ options.onAudio({
43
+ audio: data.audio,
44
+ speaking: data.speaking,
45
+ probability: data.probability,
46
+ volume: data.volume,
47
+ speech: data.speech,
48
+ consecutiveSilence: data.consecutiveSilence,
49
+ });
50
+ } else if (event == "chunkEnd") {
51
+ options.onChunkEnd();
52
+ }
53
+ },
54
+ options
55
+ );
56
+ }
57
+
58
+ processFile(file) {
59
+ this.inner.processFile(path.resolve(file));
60
+ }
61
+
62
+ start() {
63
+ this.inner.start();
64
+ }
65
+
66
+ stop() {
67
+ this.inner.stop();
68
+ }
69
+ }
70
+
71
+ exports.SpeechRecorder = Wrapper;
72
+ exports.devices = devices;
@@ -0,0 +1,298 @@
1
+ #include <napi.h>
2
+
3
+ #include <atomic>
4
+ #include <chrono>
5
+ #include <memory>
6
+ #include <string>
7
+ #include <vector>
8
+
9
+ #include "chunk_processor.h"
10
+ #include "devices.h"
11
+ #include "portaudio.h"
12
+ #include "speech_recorder.h"
13
+
14
+ #define DR_WAV_IMPLEMENTATION
15
+ #include "dr_wav.h"
16
+
17
+ Napi::Object SpeechRecorder::Init(Napi::Env env, Napi::Object exports) {
18
+ Napi::Function f = DefineClass(
19
+ env, "SpeechRecorder",
20
+ {
21
+ InstanceMethod<&SpeechRecorder::ProcessFile>(
22
+ "processFile", static_cast<napi_property_attributes>(
23
+ napi_writable | napi_configurable)),
24
+ InstanceMethod<&SpeechRecorder::Start>(
25
+ "start", static_cast<napi_property_attributes>(
26
+ napi_writable | napi_configurable)),
27
+ InstanceMethod<&SpeechRecorder::Stop>(
28
+ "stop", static_cast<napi_property_attributes>(napi_writable |
29
+ napi_configurable)),
30
+ });
31
+
32
+ Napi::FunctionReference* constructor = new Napi::FunctionReference();
33
+ *constructor = Napi::Persistent(f);
34
+
35
+ exports.Set("SpeechRecorder", f);
36
+ env.SetInstanceData<Napi::FunctionReference>(constructor);
37
+
38
+ exports.Set(Napi::String::New(env, "devices"),
39
+ Napi::Function::New(env, GetDevices));
40
+ return exports;
41
+ }
42
+
43
+ SpeechRecorder::SpeechRecorder(const Napi::CallbackInfo& info)
44
+ : Napi::ObjectWrap<SpeechRecorder>(info),
45
+ stopped_(true),
46
+ queue_(),
47
+ callback_(Napi::Persistent(info[1].As<Napi::Function>())),
48
+ threadSafeFunctionCallback_([&](Napi::Env env, Napi::Function jsCallback,
49
+ SpeechRecorderCallbackData* data) {
50
+ Napi::Object object = Napi::Object::New(env);
51
+ object.Set("speaking", Napi::Boolean::New(env, data->speaking));
52
+ object.Set("volume", Napi::Number::New(env, data->volume));
53
+ object.Set("speech", Napi::Boolean::New(env, data->speech));
54
+ object.Set("probability", Napi::Number::New(env, data->probability));
55
+ object.Set("consecutiveSilence",
56
+ Napi::Number::New(env, (double)data->consecutiveSilence));
57
+
58
+ if (data->audio.size() > 0) {
59
+ Napi::Int16Array buffer =
60
+ Napi::Int16Array::New(env, data->audio.size());
61
+ for (size_t i = 0; i < data->audio.size(); i++) {
62
+ buffer[i] = data->audio[i];
63
+ }
64
+
65
+ object.Set("audio", buffer);
66
+ }
67
+
68
+ jsCallback.Call({Napi::String::New(env, data->event), object});
69
+ delete data;
70
+ }),
71
+ modelPath_(info[0].As<Napi::String>().Utf8Value()),
72
+ options_({
73
+ info[2]
74
+ .As<Napi::Object>()
75
+ .Get("consecutiveFramesForSilence")
76
+ .As<Napi::Number>()
77
+ .Int32Value(),
78
+ info[2]
79
+ .As<Napi::Object>()
80
+ .Get("consecutiveFramesForSpeaking")
81
+ .As<Napi::Number>()
82
+ .Int32Value(),
83
+ info[2]
84
+ .As<Napi::Object>()
85
+ .Get("device")
86
+ .As<Napi::Number>()
87
+ .Int32Value(),
88
+ info[2]
89
+ .As<Napi::Object>()
90
+ .Get("leadingBufferFrames")
91
+ .As<Napi::Number>()
92
+ .Int32Value(),
93
+ [&](std::vector<short> audio) {
94
+ SpeechRecorderCallbackData* data = new SpeechRecorderCallbackData();
95
+ data->event = "chunkStart";
96
+ data->audio = audio;
97
+ queue_.enqueue(data);
98
+ },
99
+ [&](std::vector<short> audio, bool speaking, double volume,
100
+ bool speech, double probability, int consecutiveSilence) {
101
+ SpeechRecorderCallbackData* data = new SpeechRecorderCallbackData();
102
+ data->event = "audio";
103
+ data->audio = audio;
104
+ data->speaking = speaking;
105
+ data->volume = volume;
106
+ data->speech = speech;
107
+ data->probability = probability;
108
+ data->consecutiveSilence = consecutiveSilence;
109
+ queue_.enqueue(data);
110
+ },
111
+ [&]() {
112
+ SpeechRecorderCallbackData* data = new SpeechRecorderCallbackData();
113
+ data->event = "chunkEnd";
114
+ queue_.enqueue(data);
115
+ },
116
+ info[2]
117
+ .As<Napi::Object>()
118
+ .Get("samplesPerFrame")
119
+ .As<Napi::Number>()
120
+ .Int32Value(),
121
+ info[2]
122
+ .As<Napi::Object>()
123
+ .Get("sampleRate")
124
+ .As<Napi::Number>()
125
+ .Int32Value(),
126
+ info[2]
127
+ .As<Napi::Object>()
128
+ .Get("sileroVadBufferSize")
129
+ .As<Napi::Number>()
130
+ .Int32Value(),
131
+ info[2]
132
+ .As<Napi::Object>()
133
+ .Get("sileroVadRateLimit")
134
+ .As<Napi::Number>()
135
+ .Int32Value(),
136
+ info[2]
137
+ .As<Napi::Object>()
138
+ .Get("sileroVadSilenceThreshold")
139
+ .As<Napi::Number>()
140
+ .DoubleValue(),
141
+ info[2]
142
+ .As<Napi::Object>()
143
+ .Get("sileroVadSpeakingThreshold")
144
+ .As<Napi::Number>()
145
+ .DoubleValue(),
146
+ info[2]
147
+ .As<Napi::Object>()
148
+ .Get("webrtcVadLevel")
149
+ .As<Napi::Number>()
150
+ .Int32Value(),
151
+ info[2]
152
+ .As<Napi::Object>()
153
+ .Get("webrtcVadBufferSize")
154
+ .As<Napi::Number>()
155
+ .Int32Value(),
156
+ info[2]
157
+ .As<Napi::Object>()
158
+ .Get("webrtcVadResultsSize")
159
+ .As<Napi::Number>()
160
+ .Int32Value(),
161
+ }),
162
+ processor_(modelPath_, options_) {}
163
+
164
+ void SpeechRecorder::ProcessFile(const Napi::CallbackInfo& info) {
165
+ Napi::Env env = info.Env();
166
+ std::string path = info[0].As<Napi::String>().Utf8Value();
167
+
168
+ // we don't want to create two processors on startup, because loading the
169
+ // silero model is expensive, so lazily create this instance only if this
170
+ // method is actually called (which is probably not common)
171
+ if (!processFileProcessor_) {
172
+ speechrecorder::ChunkProcessorOptions options = options_;
173
+
174
+ options.onChunkStart = [&](std::vector<short> audio) {
175
+ Napi::Object object = Napi::Object::New(env);
176
+ if (audio.size() > 0) {
177
+ Napi::Int16Array buffer = Napi::Int16Array::New(env, audio.size());
178
+ for (size_t i = 0; i < audio.size(); i++) {
179
+ buffer[i] = audio[i];
180
+ }
181
+
182
+ object.Set("audio", buffer);
183
+ }
184
+
185
+ callback_.Value().Call({Napi::String::New(env, "chunkStart"), object});
186
+ };
187
+
188
+ options.onAudio = [&](std::vector<short> audio, bool speaking,
189
+ double volume, bool speech, double probability,
190
+ int consecutiveSilence) {
191
+ Napi::Object object = Napi::Object::New(env);
192
+ object.Set("speaking", Napi::Boolean::New(env, speaking));
193
+ object.Set("volume", Napi::Number::New(env, volume));
194
+ object.Set("speech", Napi::Boolean::New(env, speech));
195
+ object.Set("probability", Napi::Number::New(env, probability));
196
+ object.Set("consecutiveSilence",
197
+ Napi::Number::New(env, (double)consecutiveSilence));
198
+
199
+ if (audio.size() > 0) {
200
+ Napi::Int16Array buffer = Napi::Int16Array::New(env, audio.size());
201
+ for (size_t i = 0; i < audio.size(); i++) {
202
+ buffer[i] = audio[i];
203
+ }
204
+
205
+ object.Set("audio", buffer);
206
+ callback_.Value().Call({Napi::String::New(env, "audio"), object});
207
+ }
208
+ };
209
+
210
+ options.onChunkEnd = [&] {
211
+ callback_.Value().Call({Napi::String::New(env, "chunkEnd")});
212
+ };
213
+
214
+ processFileProcessor_ =
215
+ std::make_unique<speechrecorder::ChunkProcessor>(modelPath_, options);
216
+ }
217
+
218
+ unsigned int channels;
219
+ unsigned int sampleRate;
220
+ drwav_uint64 frames;
221
+ short* data = drwav_open_file_and_read_pcm_frames_s16(
222
+ path.c_str(), &channels, &sampleRate, &frames, nullptr);
223
+
224
+ processFileProcessor_->Reset();
225
+ int size = (int)frames;
226
+ for (int i = 0; i < size; i += options_.samplesPerFrame) {
227
+ std::vector<short> buffer;
228
+ for (int j = 0; j < options_.samplesPerFrame; j++) {
229
+ if (i + j < size) {
230
+ buffer.push_back(data[i + j]);
231
+ }
232
+ }
233
+
234
+ if (buffer.size() == (size_t)options_.samplesPerFrame) {
235
+ processFileProcessor_->Process(buffer.data());
236
+ }
237
+ }
238
+
239
+ drwav_free(data, nullptr);
240
+ }
241
+
242
+ void SpeechRecorder::Start(const Napi::CallbackInfo& info) {
243
+ stopped_ = false;
244
+ threadSafeFunction_ = Napi::ThreadSafeFunction::New(
245
+ info.Env(), callback_.Value(), "Speech Recorder Start", 0, 1,
246
+ [&](Napi::Env env) {
247
+ thread_.join();
248
+ });
249
+
250
+ thread_ = std::thread([&] {
251
+ while (!stopped_) {
252
+ SpeechRecorderCallbackData* data;
253
+ bool element = queue_.try_dequeue(data);
254
+ if (element) {
255
+ threadSafeFunction_.BlockingCall(data, threadSafeFunctionCallback_);
256
+ }
257
+
258
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
259
+ }
260
+
261
+ threadSafeFunction_.Release();
262
+ });
263
+
264
+ processor_.Start();
265
+ }
266
+
267
+ void SpeechRecorder::Stop(const Napi::CallbackInfo& info) {
268
+ stopped_ = true;
269
+ processor_.Stop();
270
+ }
271
+
272
+ Napi::Value GetDevices(const Napi::CallbackInfo& info) {
273
+ Napi::Env env = info.Env();
274
+
275
+ std::vector<speechrecorder::Device> devices = speechrecorder::GetDevices();
276
+ Napi::Array result = Napi::Array::New(env, devices.size());
277
+ for (size_t i = 0; i < devices.size(); i++) {
278
+ Napi::Object e = Napi::Object::New(env);
279
+ e.Set("id", devices[i].id);
280
+ e.Set("name", devices[i].name);
281
+ e.Set("apiName", devices[i].apiName);
282
+ e.Set("maxInputChannels", devices[i].maxInputChannels);
283
+ e.Set("maxOutputChannels", devices[i].maxOutputChannels);
284
+ e.Set("defaultSampleRate", devices[i].defaultSampleRate);
285
+ e.Set("isDefaultInput", devices[i].isDefaultInput);
286
+ e.Set("isDefaultOutput", devices[i].isDefaultOutput);
287
+ result[i] = e;
288
+ }
289
+
290
+ return result;
291
+ }
292
+
293
+ Napi::Object Init(Napi::Env env, Napi::Object exports) {
294
+ SpeechRecorder::Init(env, exports);
295
+ return exports;
296
+ }
297
+
298
+ NODE_API_MODULE(addon, Init);