@s-h-a-d-o-w/speech-recorder 2.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.clang-format +2 -0
- package/.github/workflows/prebuilds.yml +62 -0
- package/LICENSE +7 -0
- package/README.md +122 -0
- package/binding.gyp +126 -0
- package/build.sh +44 -0
- package/examples/analyze-files.js +124 -0
- package/examples/devices.js +3 -0
- package/examples/live.js +20 -0
- package/examples/record.js +31 -0
- package/include/speech_recorder.h +48 -0
- package/lib/resources/vad.onnx +0 -0
- package/package.json +37 -0
- package/pnpm-workspace.yaml +5 -0
- package/setup.sh +72 -0
- package/src/index.js +72 -0
- package/src/speech_recorder.cpp +298 -0
package/.clang-format
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
name: Prebuilds
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "*"
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
prebuild:
|
|
13
|
+
name: ${{ matrix.platform }} ${{ matrix.arch }}
|
|
14
|
+
runs-on: ${{ matrix.os }}
|
|
15
|
+
|
|
16
|
+
strategy:
|
|
17
|
+
fail-fast: false
|
|
18
|
+
matrix:
|
|
19
|
+
include:
|
|
20
|
+
- platform: linux
|
|
21
|
+
os: ubuntu-latest
|
|
22
|
+
arch: x64
|
|
23
|
+
- platform: macos
|
|
24
|
+
os: macos-latest
|
|
25
|
+
arch: x64
|
|
26
|
+
- platform: macos
|
|
27
|
+
os: macos-latest
|
|
28
|
+
arch: arm64
|
|
29
|
+
- platform: windows
|
|
30
|
+
os: windows-latest
|
|
31
|
+
arch: x64
|
|
32
|
+
|
|
33
|
+
steps:
|
|
34
|
+
- name: Check out repository
|
|
35
|
+
uses: actions/checkout@v6
|
|
36
|
+
|
|
37
|
+
- name: Set up pnpm
|
|
38
|
+
uses: pnpm/action-setup@v6
|
|
39
|
+
|
|
40
|
+
- name: Set up Node.js
|
|
41
|
+
uses: actions/setup-node@v6
|
|
42
|
+
with:
|
|
43
|
+
node-version: lts/*
|
|
44
|
+
|
|
45
|
+
# - name: Install Linux build dependencies
|
|
46
|
+
# if: runner.os == 'Linux'
|
|
47
|
+
# run: |
|
|
48
|
+
# sudo apt-get update
|
|
49
|
+
# sudo apt-get install -y libasound2-dev
|
|
50
|
+
|
|
51
|
+
- name: Install npm dependencies
|
|
52
|
+
run: pnpm install --frozen-lockfile --ignore-scripts
|
|
53
|
+
|
|
54
|
+
- name: Build native dependencies
|
|
55
|
+
shell: bash
|
|
56
|
+
run: bash setup.sh "${{ matrix.arch }}"
|
|
57
|
+
|
|
58
|
+
- name: Build and upload prebuild
|
|
59
|
+
shell: bash
|
|
60
|
+
env:
|
|
61
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
62
|
+
run: bash build.sh "${{ matrix.arch }}" "$GITHUB_TOKEN"
|
package/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2021 Serenade Labs, Inc.
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# Fork notes
|
|
2
|
+
|
|
3
|
+
## Prerequisites
|
|
4
|
+
|
|
5
|
+
- cmake
|
|
6
|
+
|
|
7
|
+
## Notes
|
|
8
|
+
|
|
9
|
+
`pnpm install` fails when neither a prebuilt binary nor a local build exists yet, but the dependencies it installs are needed in order to build. That error can be ignored.
|
|
10
|
+
|
|
11
|
+
Prebuilds MUST be published to GitHub using `./build.sh <arch> <GITHUB_TOKEN>`! Otherwise, users won't be able to install the package. (The build chain is *nix-only, i.e. it relies on bash scripts, which would be a problem for Windows users.)
|
|
12
|
+
|
|
13
|
+
The latest portaudio release dates from 2021, which is why it already requires the `-DCMAKE_POLICY_VERSION_MINIMUM=3.5` override and may become incompatible sooner or later.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Speech Recorder
|
|
17
|
+
|
|
18
|
+
speech-recorder is a cross-platform, native [node.js](https://nodejs.org) [addon](http://nodejs.org/api/addons.html) for getting a stream of audio from a device's microphone. Using speech-recorder, you can also get only the audio that corresponds to someone speaking.
|
|
19
|
+
|
|
20
|
+
This module is used for speech recognition in [Serenade](https://serenade.ai). Serenade enables you to write code through natural speech, rather than typing.
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
speech-recorder has been tested on Windows 10, macOS 10.14+, and Ubuntu 18.04+ (and may work on other platforms as well).
|
|
25
|
+
|
|
26
|
+
To install speech-recorder, run:
|
|
27
|
+
|
|
28
|
+
yarn add speech-recorder
|
|
29
|
+
|
|
30
|
+
If you're using this library with Electron, you should probably use [electron-rebuild](https://github.com/electron/electron-rebuild).
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
This library uses two voice activity detection mechanisms: a fast first pass (the WebRTC VAD), and a slightly slower, but much more accurate, second pass (the Silero VAD). See below for the various options you can supply to each.
|
|
35
|
+
|
|
36
|
+
### Streaming
|
|
37
|
+
|
|
38
|
+
When you start recording, you can register various callbacks. `onAudio` is called when any audio comes in from the microphone. `onChunkStart` is called when a chunk of speech begins, and `onChunkEnd` is called when speech ends.
|
|
39
|
+
|
|
40
|
+
const { SpeechRecorder } = require("speech-recorder");
|
|
41
|
+
|
|
42
|
+
const recorder = new SpeechRecorder({
|
|
43
|
+
onChunkStart: ({ audio }) => {
|
|
44
|
+
console.log(Date.now(), "Chunk start");
|
|
45
|
+
},
|
|
46
|
+
onAudio: ({ speaking, probability, volume }) => {
|
|
47
|
+
console.log(Date.now(), speaking, probability, volume);
|
|
48
|
+
},
|
|
49
|
+
onChunkEnd: () => {
|
|
50
|
+
console.log(Date.now(), "Chunk end");
|
|
51
|
+
},
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
console.log("Recording for 5 seconds...");
|
|
55
|
+
recorder.start();
|
|
56
|
+
setTimeout(() => {
|
|
57
|
+
console.log("Done!");
|
|
58
|
+
recorder.stop();
|
|
59
|
+
}, 5000);
|
|
60
|
+
|
|
61
|
+
You can write all audio from the microphone to a file with:
|
|
62
|
+
|
|
63
|
+
const { SpeechRecorder } = require("speech-recorder");
|
|
64
|
+
|
|
65
|
+
const writeStream = fs.createWriteStream("audio.raw");
|
|
66
|
+
const recorder = new SpeechRecorder({
|
|
67
|
+
onAudio: ({ audio }) => {
|
|
68
|
+
writeStream.write(audio);
|
|
69
|
+
}
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
Or, just the speech with:
|
|
73
|
+
|
|
74
|
+
const { SpeechRecorder } = require("speech-recorder");
|
|
75
|
+
|
|
76
|
+
const writeStream = fs.createWriteStream("audio.raw");
|
|
77
|
+
const recorder = new SpeechRecorder({
|
|
78
|
+
onAudio: ({ audio, speech }) => {
|
|
79
|
+
if (speech) {
|
|
80
|
+
writeStream.write(audio);
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
### Devices
|
|
86
|
+
|
|
87
|
+
You can get a list of supported devices with:
|
|
88
|
+
|
|
89
|
+
const { devices } = require("speech-recorder");
|
|
90
|
+
|
|
91
|
+
console.log(devices());
|
|
92
|
+
|
|
93
|
+
### Options
|
|
94
|
+
|
|
95
|
+
* `consecutiveFramesForSilence`: How many frames of audio must be silent before `onChunkEnd` is fired. Default `10`.
|
|
96
|
+
* `consecutiveFramesForSpeaking`: How many frames of audio must be speech before `onChunkStart` is fired. Default `1`.
|
|
97
|
+
* `device`: ID of the device to use for input (i.e., from the example above). Specify `-1` to use the system default. Default `-1`.
|
|
98
|
+
* `leadingBufferFrames`: How many frames of audio to keep in a buffer that's included in `onChunkStart`. Default `10`.
|
|
99
|
+
* `onChunkStart`: Callback to be executed when speech starts.
|
|
100
|
+
* `onAudio`: Callback to be executed when any audio comes in.
|
|
101
|
+
* `onChunkEnd`: Callback to be executed when speech ends.
|
|
102
|
+
* `samplesPerFrame`: How many audio samples to be included in each frame from the microphone. Default `480`.
|
|
103
|
+
* `sampleRate`: Audio sample rate. Default `16000`.
|
|
104
|
+
* `sileroVadBufferSize`: How many audio samples to pass to the VAD. Default `2000`.
|
|
105
|
+
* `sileroVadRateLimit`: Rate limit, in frames, for how frequently to call the VAD. Default `3`.
|
|
106
|
+
* `sileroVadSilenceThreshold`: Probability threshold for speech to transition to silence. Default `0.1`.
|
|
107
|
+
* `sileroVadSpeakingThreshold`: Probability threshold for silence to transition to speech. Default `0.3`.
|
|
108
|
+
* `webrtcVadLevel`: Aggressiveness for the first-pass VAD filter. `0` is least aggressive, and `3` is most aggressive. Default `3`.
|
|
109
|
+
* `webrtcVadBufferSize`: How many audio samples to pass to the first-pass VAD filter. Default `480`. Can only be `160`, `320`, or `480`.
|
|
110
|
+
* `webrtcVadResultsSize`: How many first-pass VAD filter results to keep in history. Default `10`.
|
|
111
|
+
|
|
112
|
+
## Building SpeechRecorder
|
|
113
|
+
|
|
114
|
+
If you want to build speech-recorder from source, first install the necessary dependencies by running:
|
|
115
|
+
|
|
116
|
+
./setup.sh <arch>
|
|
117
|
+
|
|
118
|
+
Where `<arch>` specifies the architecture you'd like to build for and is one of `x64` or `arm64`. If you're not sure, you probably want `x64`.
|
|
119
|
+
|
|
120
|
+
Then, you can build speech-recorder with:
|
|
121
|
+
|
|
122
|
+
./build.sh <arch>
|
package/binding.gyp
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
{
|
|
2
|
+
"targets": [
|
|
3
|
+
{
|
|
4
|
+
"target_name": "speechrecorder",
|
|
5
|
+
"sources": ["src/speech_recorder.cpp"],
|
|
6
|
+
"cflags!": [
|
|
7
|
+
"-fno-exceptions",
|
|
8
|
+
"-fno-rtti",
|
|
9
|
+
],
|
|
10
|
+
"cflags_cc!": [
|
|
11
|
+
"-fno-exceptions",
|
|
12
|
+
"-fno-rtti",
|
|
13
|
+
],
|
|
14
|
+
"include_dirs": [
|
|
15
|
+
"<!@(node -p \"require('node-addon-api').include\")",
|
|
16
|
+
"<(module_root_dir)/include",
|
|
17
|
+
"<(module_root_dir)/lib/include",
|
|
18
|
+
"<(module_root_dir)/lib/build/_deps/drwav-src",
|
|
19
|
+
"<(module_root_dir)/lib/build/_deps/readerwriterqueue-src",
|
|
20
|
+
"<(module_root_dir)/lib/3rd_party/webrtcvad",
|
|
21
|
+
"<(module_root_dir)/lib/3rd_party/portaudio/include",
|
|
22
|
+
"<(module_root_dir)/lib/3rd_party/onnxruntime/include",
|
|
23
|
+
],
|
|
24
|
+
"defines": [
|
|
25
|
+
"NAPI_VERSION=<(napi_build_version)",
|
|
26
|
+
"NAPI_CPP_EXCEPTIONS",
|
|
27
|
+
],
|
|
28
|
+
"conditions": [
|
|
29
|
+
[
|
|
30
|
+
'OS=="mac"',
|
|
31
|
+
{
|
|
32
|
+
"xcode_settings": {
|
|
33
|
+
"GCC_ENABLE_CPP_EXCEPTIONS": "YES",
|
|
34
|
+
"GCC_ENABLE_CPP_RTTI": "YES",
|
|
35
|
+
"MACOSX_DEPLOYMENT_TARGET": "10.14",
|
|
36
|
+
"OTHER_LDFLAGS": ["-Wl,-rpath,@loader_path/"],
|
|
37
|
+
},
|
|
38
|
+
"copies": [
|
|
39
|
+
{
|
|
40
|
+
"destination": "<(module_root_dir)/build/Release",
|
|
41
|
+
"files": [
|
|
42
|
+
"<(module_root_dir)/lib/install/lib/libspeechrecorder.dylib",
|
|
43
|
+
"<(module_root_dir)/lib/install/lib/libportaudio.dylib",
|
|
44
|
+
"<(module_root_dir)/lib/install/lib/libonnxruntime.1.10.0.dylib",
|
|
45
|
+
],
|
|
46
|
+
}
|
|
47
|
+
],
|
|
48
|
+
"libraries": [
|
|
49
|
+
"<(module_root_dir)/build/Release/libspeechrecorder.dylib",
|
|
50
|
+
"<(module_root_dir)/build/Release/libportaudio.dylib",
|
|
51
|
+
"<(module_root_dir)/build/Release/libonnxruntime.1.10.0.dylib",
|
|
52
|
+
],
|
|
53
|
+
},
|
|
54
|
+
],
|
|
55
|
+
[
|
|
56
|
+
'OS=="win"',
|
|
57
|
+
{
|
|
58
|
+
"msvs_settings": {
|
|
59
|
+
"VCCLCompilerTool": {
|
|
60
|
+
"ExceptionHandling": 1,
|
|
61
|
+
},
|
|
62
|
+
},
|
|
63
|
+
"copies": [
|
|
64
|
+
{
|
|
65
|
+
"destination": "<(module_root_dir)/build/Release",
|
|
66
|
+
"files": [
|
|
67
|
+
"<(module_root_dir)/lib/install/lib/speechrecorder.dll",
|
|
68
|
+
"<(module_root_dir)/lib/install/lib/onnxruntime.dll",
|
|
69
|
+
"<(module_root_dir)/lib/install/lib/onnxruntime_providers_shared.dll",
|
|
70
|
+
],
|
|
71
|
+
}
|
|
72
|
+
],
|
|
73
|
+
"libraries": [
|
|
74
|
+
"<(module_root_dir)/lib/install/lib/speechrecorder.lib",
|
|
75
|
+
"<(module_root_dir)/lib/install/lib/onnxruntime.lib",
|
|
76
|
+
"<(module_root_dir)/lib/install/lib/onnxruntime_providers_shared.lib",
|
|
77
|
+
],
|
|
78
|
+
},
|
|
79
|
+
],
|
|
80
|
+
[
|
|
81
|
+
'OS=="win" and target_arch=="x64"',
|
|
82
|
+
{
|
|
83
|
+
"copies": [
|
|
84
|
+
{
|
|
85
|
+
"destination": "<(module_root_dir)/build/Release",
|
|
86
|
+
"files": [
|
|
87
|
+
"<(module_root_dir)/lib/install/lib/portaudio_x64.dll",
|
|
88
|
+
"<(module_root_dir)/lib/3rd_party/vcruntime/x64/vcruntime140.dll",
|
|
89
|
+
"<(module_root_dir)/lib/3rd_party/vcruntime/x64/vcruntime140_1.dll",
|
|
90
|
+
],
|
|
91
|
+
}
|
|
92
|
+
],
|
|
93
|
+
"libraries": [
|
|
94
|
+
"<(module_root_dir)/lib/install/lib/portaudio_x64.lib",
|
|
95
|
+
],
|
|
96
|
+
},
|
|
97
|
+
],
|
|
98
|
+
[
|
|
99
|
+
'OS=="linux"',
|
|
100
|
+
{
|
|
101
|
+
"link_settings": {
|
|
102
|
+
"libraries": [
|
|
103
|
+
"-Wl,-rpath,'$$ORIGIN'",
|
|
104
|
+
]
|
|
105
|
+
},
|
|
106
|
+
"copies": [
|
|
107
|
+
{
|
|
108
|
+
"destination": "<(module_root_dir)/build/Release",
|
|
109
|
+
"files": [
|
|
110
|
+
"<(module_root_dir)/lib/install/lib/libspeechrecorder.so",
|
|
111
|
+
"<(module_root_dir)/lib/install/lib/libportaudio.so",
|
|
112
|
+
"<(module_root_dir)/lib/install/lib/libonnxruntime.so.1.10.0",
|
|
113
|
+
],
|
|
114
|
+
}
|
|
115
|
+
],
|
|
116
|
+
"libraries": [
|
|
117
|
+
"<(module_root_dir)/build/Release/libspeechrecorder.so",
|
|
118
|
+
"<(module_root_dir)/build/Release/libportaudio.so",
|
|
119
|
+
"<(module_root_dir)/build/Release/libonnxruntime.so.1.10.0",
|
|
120
|
+
],
|
|
121
|
+
},
|
|
122
|
+
],
|
|
123
|
+
],
|
|
124
|
+
}
|
|
125
|
+
]
|
|
126
|
+
}
|
package/build.sh
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#!/bin/bash
#
# Builds the native library with CMake, compiles the Node.js addon with
# node-gyp, and packages a prebuild (uploading it when a token is supplied).
#
# Usage: build.sh x64|arm64 [github-token]

set -e
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
pushd "$HERE" &> /dev/null

# Reject a missing or unsupported architecture up front. Without this check an
# unrecognized value on macOS skipped the CMake configure step entirely and the
# build only failed later with a confusing error.
case "$1" in
  x64|arm64)
    ;;
  *)
    echo "Usage: build.sh x64|arm64 [github-token]"
    exit 1
    ;;
esac

node_arch="$1"
platform="$(uname -s)"

rm -rf lib/build lib/install
mkdir -p lib/build
cd lib/build

if [[ "$platform" == "MINGW"* ]] ; then
  # NOTE(review): Windows always configures for x64, whatever architecture
  # was requested on the command line.
  cmake -A x64 ..
elif [[ "$platform" == "Darwin" ]] ; then
  if [[ "$node_arch" == "x64" ]] ; then
    cmake -DCMAKE_OSX_ARCHITECTURES=x86_64 ..
  else
    cmake -DCMAKE_OSX_ARCHITECTURES=arm64 ..
  fi
else
  cmake ..
fi

cmake --build . --config Release
cmake --install . --prefix ../install

cd ../..
rm -rf prebuilds

# Run the tools directly instead of through `eval`, so that the arguments
# (in particular the upload token) are never re-parsed by the shell.
npm_config_arch="$node_arch" ./node_modules/.bin/node-gyp rebuild

prebuild_args=(-r napi --include-regex '.(node|a|dylib|dll|so.*)$' "--arch=$node_arch")
if [[ -n "$2" ]] ; then
  prebuild_args+=(--upload "$2")
fi
./node_modules/.bin/prebuild "${prebuild_args[@]}"

popd &> /dev/null
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
const fs = require("fs");
|
|
2
|
+
const path = require("path");
|
|
3
|
+
const { SpeechRecorder } = require("../src/index");
|
|
4
|
+
|
|
5
|
+
/**
 * Returns the q-quantile of a list of numbers, linearly interpolating between
 * the two nearest ranks.
 *
 * The input array is left untouched: a copy is sorted, because
 * Array.prototype.sort works in place and would otherwise reorder the
 * caller's data as a side effect.
 *
 * @param {number[]} elements Values to summarise.
 * @param {number} q Quantile in the range [0, 1], e.g. 0.5 for the median.
 * @returns {number|undefined} The interpolated quantile, or undefined when
 *   `elements` is empty.
 */
const quantile = (elements, q) => {
  const sorted = [...elements].sort((a, b) => a - b);
  const position = (sorted.length - 1) * q;
  const base = Math.floor(position);
  const rest = position - base;

  // At the last rank there is no upper neighbour to interpolate towards.
  if (sorted[base + 1] === undefined) {
    return sorted[base];
  }
  return sorted[base] + rest * (sorted[base + 1] - sorted[base]);
};
|
|
16
|
+
|
|
17
|
+
// Both the directory of .wav files and the labels file are required.
const haveAllArguments = process.argv.length >= 4;
if (!haveAllArguments) {
  console.log("Usage: node analyze-files.js /path/to/wav/files /path/to/labels");
  process.exit(1);
}

// State shared between the recorder callbacks and the directory scan below.
let currentFile;
let samples = 0;
const leadingBufferFrames = 10;
const sampleRate = 16000;
const samplesPerFrame = 480;
const results = {};
const labels = JSON.parse(fs.readFileSync(process.argv[3], "utf8"));

// Position, in seconds, within the file currently being processed.
const elapsedSeconds = () => samples / sampleRate;

// The most recently opened speech segment of the current file.
const latestSegment = () => {
  const segments = results[currentFile].speech;
  return segments[segments.length - 1];
};

const recorder = new SpeechRecorder({
  leadingBufferFrames,
  samplesPerFrame,
  sampleRate,

  // Count every sample so segment boundaries can be expressed in seconds.
  onAudio: ({ audio }) => {
    samples += audio.length;
  },

  // Open a new [start, end] segment, recording the start time.
  onChunkStart: () => {
    results[currentFile].speech.push([elapsedSeconds()]);
  },

  // Close the segment opened by the preceding chunk start.
  onChunkEnd: () => {
    latestSegment().push(elapsedSeconds());
  },
});
|
|
47
|
+
|
|
48
|
+
// Runs every .wav file in the given directory through the recorder, then
// compares the detected speech segments with the labels and prints summary
// statistics.
fs.readdir(process.argv[2], (error, files) => {
  // Without this check a failed directory read surfaced as a TypeError when
  // iterating over the undefined `files`.
  if (error) {
    console.error(`Unable to read ${process.argv[2]}: ${error.message}`);
    process.exit(1);
  }

  for (const file of files) {
    if (!file.endsWith(".wav")) {
      continue;
    }

    currentFile = file;
    samples = 0;
    results[file] = { speech: [] };
    console.log(`Processing ${file}...`);
    recorder.processFile(path.join(process.argv[2], file));
  }

  let speechWindowTooSmall = [];
  let noiseWasSpeech = [];
  let noise = 0;
  let speech = 0;
  let extra = [];
  for (const i of Object.keys(results)) {
    const label = labels[i] ? labels[i].speech : undefined;
    const result = results[i].speech;

    // A file that has no entry in the labels file cannot be scored.
    if (!Array.isArray(label)) {
      console.log("No label found, skipping:", i);
      continue;
    }

    if (label.length == 0) {
      noise++;
    } else {
      speech++;
    }

    if (label.length == 0 && result.length > 0) {
      console.log("Noise was speech:", i);
      console.log("VAD:", result);
      noiseWasSpeech.push(i);
    }

    if (label.length > 0 && result.length > 0) {
      const start = Math.min(...result.map((e) => e[0]));
      const stop = Math.max(...result.map((e) => e[1]));
      // A segment that never received its end time yields NaN here.
      if (isNaN(start) || isNaN(stop)) {
        continue;
      }

      const tolerance = 0.05;
      // The reported start excludes the leading buffer, so credit it back
      // before comparing against the labelled start.
      const leadingBufferSeconds = (leadingBufferFrames * samplesPerFrame) / sampleRate;
      if (start - leadingBufferSeconds > label[0] + tolerance || stop < label[1] - tolerance) {
        console.log("Speech window too small:", i);
        console.log("Label:", label);
        console.log("VAD:", result, start, stop);
        speechWindowTooSmall.push(i);
      } else if (stop > label[1]) {
        extra.push(stop - label[1]);
      }
    }
  }

  // Guard the ratio the same way as the noise ratio below, so that a run
  // without any speech files prints 0 instead of NaN.
  console.log(
    `\nSpeech window too small: ${
      speech > 0 ? (speechWindowTooSmall.length / speech).toFixed(2) : 0
    } (${speechWindowTooSmall.length} / ${speech})`
  );

  console.log(
    `Noise was speech: ${noise > 0 ? (noiseWasSpeech.length / noise).toFixed(2) : 0} (${
      noiseWasSpeech.length
    } / ${noise})`
  );

  if (extra.length > 0) {
    console.log(
      `Average extra speech: ${(extra.reduce((a, b) => a + b) / extra.length).toFixed(2)}`
    );
    console.log(`p50 extra speech: ${quantile(extra, 0.5).toFixed(2)}`);
    // The line labelled p90 previously computed the 0.75 quantile.
    console.log(`p90 extra speech: ${quantile(extra, 0.9).toFixed(2)}`);
    console.log(`Max extra speech: ${Math.max(...extra).toFixed(2)}`);
  }
});
|
package/examples/live.js
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
const { SpeechRecorder } = require("../src/index.js");
|
|
2
|
+
|
|
3
|
+
// How long to keep the microphone open before stopping, in milliseconds.
const recordingDurationMs = 60000;

// Prints the given fields prefixed with the current timestamp.
const logWithTimestamp = (...fields) => {
  console.log(Date.now(), ...fields);
};

const recorder = new SpeechRecorder({
  onChunkStart: () => {
    logWithTimestamp("Chunk start");
  },
  onAudio: ({ speaking, probability, volume }) => {
    logWithTimestamp(speaking, probability, volume);
  },
  onChunkEnd: () => {
    logWithTimestamp("Chunk end");
  },
});

console.log("Recording...");
recorder.start();

// Stop recording once the configured duration has elapsed.
const finish = () => {
  console.log("Done!");
  recorder.stop();
};
setTimeout(finish, recordingDurationMs);
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
const fs = require("fs");
|
|
2
|
+
const { SpeechRecorder } = require("../src/index");
|
|
3
|
+
const { WaveFile } = require("wavefile");
|
|
4
|
+
|
|
5
|
+
// The output path is the only, mandatory, argument.
if (process.argv.length < 3) {
  console.log("Usage: node record.js /path/to/output.wav");
  process.exit(1);
}

// Samples collected so far; written out once five seconds have accumulated.
let buffer = [];
const sampleRate = 16000;
const recorder = new SpeechRecorder({
  onAudio: ({ audio }) => {
    for (let i = 0; i < audio.length; i++) {
      buffer.push(audio[i]);
    }

    if (buffer.length >= sampleRate * 5) {
      let wav = new WaveFile();
      // One channel, 16-bit samples at `sampleRate`.
      wav.fromScratch(1, sampleRate, "16", buffer);
      fs.writeFileSync(process.argv[2], wav.toBuffer());
      // The recording was written successfully, so report success; the
      // script previously exited with status 1 here.
      process.exit(0);
    }
  },
});

console.log("Ready...");
// Give the user a second to get ready before the microphone opens.
setTimeout(() => {
  console.log("Go!");
  recorder.start();
}, 1000);
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <napi.h>
|
|
4
|
+
|
|
5
|
+
#include <atomic>
|
|
6
|
+
#include <functional>
|
|
7
|
+
#include <thread>
|
|
8
|
+
|
|
9
|
+
#include "aligned.h"
|
|
10
|
+
#include "chunk_processor.h"
|
|
11
|
+
|
|
12
|
+
// Data carried for a single recorder event on its way to the JavaScript
// callback. Every field starts out empty, false or zero.
struct SpeechRecorderCallbackData {
  std::string event;  // default-constructed, i.e. the empty string
  std::vector<short> audio;
  bool speaking{false};
  double volume{0.0};
  bool speech{false};
  double probability{0.0};
  int consecutiveSilence{0};
};
|
|
21
|
+
|
|
22
|
+
// Node.js object wrapper (Napi::ObjectWrap) for the speech recorder.
//
// NOTE: members are initialized in declaration order, so keep that order
// in sync with the constructor's initializer list.
class SpeechRecorder : public Napi::ObjectWrap<SpeechRecorder> {
 private:
  // Presumably the worker thread that pumps queue_ — TODO confirm in the .cpp.
  std::thread thread_;
  Napi::ThreadSafeFunction threadSafeFunction_;
  // Initialized to true by the constructor.
  std::atomic<bool> stopped_;
  // Queue of heap-allocated event payloads (raw pointers).
  // NOTE(review): who deletes them is not visible from this header.
  BlockingReaderWriterQueue<SpeechRecorderCallbackData*> queue_;
  // Persistent reference to the JavaScript callback passed as the second
  // constructor argument.
  Napi::FunctionReference callback_;
  // Converts one event payload into a JavaScript object for the callback.
  std::function<void(Napi::Env, Napi::Function, SpeechRecorderCallbackData*)>
      threadSafeFunctionCallback_;
  // Presumably the VAD model path given as the first constructor argument —
  // TODO confirm.
  std::string modelPath_;
  speechrecorder::ChunkProcessorOptions options_;
  speechrecorder::ChunkProcessor processor_;
  // Presumably a separate processor used by ProcessFile — TODO confirm.
  std::unique_ptr<speechrecorder::ChunkProcessor> processFileProcessor_;

  // Instance methods exposed to JavaScript as "processFile", "start" and
  // "stop".
  void ProcessFile(const Napi::CallbackInfo& info);
  void Start(const Napi::CallbackInfo& info);
  void Stop(const Napi::CallbackInfo& info);

 public:
  SpeechRecorder(const Napi::CallbackInfo& info);
  // Defines the JavaScript class and registers it on `exports`.
  static Napi::Object Init(Napi::Env env, Napi::Object exports);

  // NOTE(review): macro presumably supplied by "aligned.h"; its expansion is
  // not visible here.
  ALIGNED
};
|
|
46
|
+
|
|
47
|
+
Napi::Value GetDevices(const Napi::CallbackInfo& info);
|
|
48
|
+
Napi::Object Init(Napi::Env env, Napi::Object exports);
|
|
Binary file
|
package/package.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@s-h-a-d-o-w/speech-recorder",
|
|
3
|
+
"version": "2.1.10",
|
|
4
|
+
"description": "A node.js library for streaming audio and speech from the microphone.",
|
|
5
|
+
"main": "src/index.js",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "git+https://github.com/s-h-a-d-o-w/speech-recorder.git"
|
|
9
|
+
},
|
|
10
|
+
"author": "Andreas Opferkuch",
|
|
11
|
+
"license": "MIT",
|
|
12
|
+
"gypfile": true,
|
|
13
|
+
"binary": {
|
|
14
|
+
"napi_versions": [
|
|
15
|
+
10
|
|
16
|
+
]
|
|
17
|
+
},
|
|
18
|
+
"publishConfig": {
|
|
19
|
+
"access": "public"
|
|
20
|
+
},
|
|
21
|
+
"scripts": {
|
|
22
|
+
"build": "bash build.sh",
|
|
23
|
+
"clean": "rm -rf build prebuilds lib/build lib/install",
|
|
24
|
+
"install": "prebuild-install -r napi || node-gyp rebuild"
|
|
25
|
+
},
|
|
26
|
+
"dependencies": {
|
|
27
|
+
"bindings": "^1.5.0",
|
|
28
|
+
"node-addon-api": "^8.8.0",
|
|
29
|
+
"prebuild-install": "^7.1.3"
|
|
30
|
+
},
|
|
31
|
+
"devDependencies": {
|
|
32
|
+
"node-gyp": "^13.0.0",
|
|
33
|
+
"prebuild": "^13.0.1",
|
|
34
|
+
"wavefile": "^11.0.0"
|
|
35
|
+
},
|
|
36
|
+
"packageManager": "pnpm@11.7.0+sha512.19cc852c120c7125760f2443ee6be0ca5b40f9f50598de1a09a1f177503e010e57c23c77646e01e761de59bf874fb22a3398c33ab9691fc13eb946b6f0f4d620"
|
|
37
|
+
}
|
package/setup.sh
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
#!/bin/bash
#
# Downloads and installs the third-party native dependencies (portaudio and
# onnxruntime 1.10.0) into lib/3rd_party.
#
# Usage: setup.sh x64|arm64

set -e
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
pushd "$HERE" &> /dev/null

# Reject a missing or unsupported architecture up front. Without this check an
# unrecognized value on macOS silently fell back to the Linux x64 onnxruntime
# package.
case "$1" in
  x64|arm64)
    ;;
  *)
    echo "Usage: setup.sh x64|arm64"
    exit 1
    ;;
esac

arch="$1"
platform="$(uname -s)"

rm -rf tmp lib/3rd_party/portaudio lib/3rd_party/onnxruntime

mkdir -p tmp/portaudio
cd tmp/portaudio
# NOTE(review): fetched over plain HTTP and not checksum-verified.
curl -Lo portaudio.tgz http://files.portaudio.com/archives/pa_stable_v190700_20210406.tgz
tar xvf portaudio.tgz

cd portaudio
mkdir dist install
cd dist

# Collect the CMake arguments in an array rather than a string passed to
# `eval`, so that no argument is re-parsed by the shell.
portaudio_cmake_args=(-DCMAKE_POLICY_VERSION_MINIMUM=3.5)
if [[ "$platform" == "MINGW"* ]] ; then
  portaudio_cmake_args+=(-A x64)
elif [[ "$platform" == "Darwin" ]] ; then
  portaudio_cmake_args+=(-DCMAKE_OSX_DEPLOYMENT_TARGET=10.14)
  if [[ "$arch" == "x64" ]] ; then
    portaudio_cmake_args+=(-DCMAKE_OSX_ARCHITECTURES=x86_64)
  else
    portaudio_cmake_args+=(-DCMAKE_OSX_ARCHITECTURES=arm64)
  fi
fi

cmake "${portaudio_cmake_args[@]}" ..
cmake --build . --config Release
cmake --install . --prefix ../install
cp -r ../install ../../../../lib/3rd_party/portaudio

# Back to tmp/ for the onnxruntime download.
cd ../../..
mkdir onnxruntime
cd onnxruntime

if [[ "$platform" == "MINGW"* ]] ; then
  mkdir -p ../../lib/3rd_party/onnxruntime/lib
  curl -Lo onnxruntime.zip https://www.nuget.org/api/v2/package/Microsoft.ML.OnnxRuntime/1.10.0
  unzip onnxruntime.zip
  cp -r build/native/include ../../lib/3rd_party/onnxruntime

  runtime_dir="win-x64"

  cp runtimes/"$runtime_dir"/native/*.dll ../../lib/3rd_party/onnxruntime/lib
  cp runtimes/"$runtime_dir"/native/*.lib ../../lib/3rd_party/onnxruntime/lib
else
  # NOTE(review): Linux always uses the x64 package, whatever architecture
  # was requested.
  onnx_package="onnxruntime-linux-x64-1.10.0"
  if [[ "$platform" == "Darwin" ]] ; then
    if [[ "$arch" == "x64" ]] ; then
      onnx_package="onnxruntime-osx-x86_64-1.10.0"
    else
      onnx_package="onnxruntime-osx-arm64-1.10.0"
    fi
  fi

  curl -Lo onnxruntime.tgz "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/$onnx_package.tgz"
  tar xvf onnxruntime.tgz
  cp -r "$onnx_package" ../../lib/3rd_party/onnxruntime
fi

cd ../..
rm -rf tmp
popd &> /dev/null
|
package/src/index.js
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
const path = require("path");
|
|
2
|
+
const { SpeechRecorder, devices } = require("bindings")("speechrecorder.node");
|
|
3
|
+
|
|
4
|
+
/**
 * JavaScript-facing recorder. Fills in default options and forwards the
 * native addon's events to the matching option callback.
 */
class Wrapper {
  /**
   * @param {object} [options] Recorder options and callbacks. Missing
   *   entries are filled in with defaults on this same object.
   * @param {string} [model] Path to the VAD model; defaults to the bundled
   *   lib/resources/vad.onnx.
   */
  constructor(options, model) {
    if (!options) {
      options = {};
    }

    // Defaults, in the order in which they are written onto `options`.
    const defaults = [
      ["consecutiveFramesForSilence", 10],
      ["consecutiveFramesForSpeaking", 1],
      ["device", -1],
      ["leadingBufferFrames", 10],
      ["onChunkStart", (data) => {}],
      ["onAudio", (audio, speaking, volume, speech, probability) => {}],
      ["onChunkEnd", (data) => {}],
      ["samplesPerFrame", 480],
      ["sampleRate", 16000],
      ["sileroVadBufferSize", 2000],
      ["sileroVadRateLimit", 3],
      ["sileroVadSilenceThreshold", 0.1],
      ["sileroVadSpeakingThreshold", 0.3],
      ["webrtcVadLevel", 3],
      ["webrtcVadBufferSize", 480],
      ["webrtcVadResultsSize", 10],
    ];
    for (const [name, fallback] of defaults) {
      const supplied = options[name];
      options[name] = supplied !== undefined ? supplied : fallback;
    }

    let modelPath = model;
    if (modelPath === undefined) {
      modelPath = path.join(__dirname, "..", "lib", "resources", "vad.onnx");
    }

    // Routes each native event to the callback registered for it.
    const dispatch = (event, data) => {
      if (event == "chunkStart") {
        options.onChunkStart({ audio: data.audio });
        return;
      }

      if (event == "audio") {
        const { audio, speaking, probability, volume, speech, consecutiveSilence } = data;
        options.onAudio({ audio, speaking, probability, volume, speech, consecutiveSilence });
        return;
      }

      if (event == "chunkEnd") {
        options.onChunkEnd();
      }
    };

    this.inner = new SpeechRecorder(modelPath, dispatch, options);
  }

  /** Passes the absolute form of `file` to the native recorder's processFile. */
  processFile(file) {
    const absolutePath = path.resolve(file);
    this.inner.processFile(absolutePath);
  }

  /** Starts the native recorder. */
  start() {
    this.inner.start();
  }

  /** Stops the native recorder. */
  stop() {
    this.inner.stop();
  }
}
|
|
70
|
+
|
|
71
|
+
exports.SpeechRecorder = Wrapper;
|
|
72
|
+
exports.devices = devices;
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
#include <napi.h>
|
|
2
|
+
|
|
3
|
+
#include <atomic>
|
|
4
|
+
#include <chrono>
|
|
5
|
+
#include <memory>
|
|
6
|
+
#include <string>
|
|
7
|
+
#include <vector>
|
|
8
|
+
|
|
9
|
+
#include "chunk_processor.h"
|
|
10
|
+
#include "devices.h"
|
|
11
|
+
#include "portaudio.h"
|
|
12
|
+
#include "speech_recorder.h"
|
|
13
|
+
|
|
14
|
+
#define DR_WAV_IMPLEMENTATION
|
|
15
|
+
#include "dr_wav.h"
|
|
16
|
+
|
|
17
|
+
// Defines the JavaScript `SpeechRecorder` class (methods `processFile`,
// `start`, `stop`), attaches it and the `devices` function to `exports`, and
// returns `exports`.
Napi::Object SpeechRecorder::Init(Napi::Env env, Napi::Object exports) {
  // Every instance method is registered with the same property attributes.
  const napi_property_attributes methodAttributes =
      static_cast<napi_property_attributes>(napi_writable |
                                            napi_configurable);

  Napi::Function classConstructor = DefineClass(
      env, "SpeechRecorder",
      {
          InstanceMethod<&SpeechRecorder::ProcessFile>("processFile",
                                                       methodAttributes),
          InstanceMethod<&SpeechRecorder::Start>("start", methodAttributes),
          InstanceMethod<&SpeechRecorder::Stop>("stop", methodAttributes),
      });

  // A persistent reference to the constructor is stored as the environment's
  // instance data.
  Napi::FunctionReference* constructorRef = new Napi::FunctionReference();
  *constructorRef = Napi::Persistent(classConstructor);
  env.SetInstanceData<Napi::FunctionReference>(constructorRef);

  exports.Set("SpeechRecorder", classConstructor);
  exports.Set("devices", Napi::Function::New(env, GetDevices));
  return exports;
}
|
|
42
|
+
|
|
43
|
+
SpeechRecorder::SpeechRecorder(const Napi::CallbackInfo& info)
|
|
44
|
+
: Napi::ObjectWrap<SpeechRecorder>(info),
|
|
45
|
+
stopped_(true),
|
|
46
|
+
queue_(),
|
|
47
|
+
callback_(Napi::Persistent(info[1].As<Napi::Function>())),
|
|
48
|
+
threadSafeFunctionCallback_([&](Napi::Env env, Napi::Function jsCallback,
|
|
49
|
+
SpeechRecorderCallbackData* data) {
|
|
50
|
+
Napi::Object object = Napi::Object::New(env);
|
|
51
|
+
object.Set("speaking", Napi::Boolean::New(env, data->speaking));
|
|
52
|
+
object.Set("volume", Napi::Number::New(env, data->volume));
|
|
53
|
+
object.Set("speech", Napi::Boolean::New(env, data->speech));
|
|
54
|
+
object.Set("probability", Napi::Number::New(env, data->probability));
|
|
55
|
+
object.Set("consecutiveSilence",
|
|
56
|
+
Napi::Number::New(env, (double)data->consecutiveSilence));
|
|
57
|
+
|
|
58
|
+
if (data->audio.size() > 0) {
|
|
59
|
+
Napi::Int16Array buffer =
|
|
60
|
+
Napi::Int16Array::New(env, data->audio.size());
|
|
61
|
+
for (size_t i = 0; i < data->audio.size(); i++) {
|
|
62
|
+
buffer[i] = data->audio[i];
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
object.Set("audio", buffer);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
jsCallback.Call({Napi::String::New(env, data->event), object});
|
|
69
|
+
delete data;
|
|
70
|
+
}),
|
|
71
|
+
modelPath_(info[0].As<Napi::String>().Utf8Value()),
|
|
72
|
+
options_({
|
|
73
|
+
info[2]
|
|
74
|
+
.As<Napi::Object>()
|
|
75
|
+
.Get("consecutiveFramesForSilence")
|
|
76
|
+
.As<Napi::Number>()
|
|
77
|
+
.Int32Value(),
|
|
78
|
+
info[2]
|
|
79
|
+
.As<Napi::Object>()
|
|
80
|
+
.Get("consecutiveFramesForSpeaking")
|
|
81
|
+
.As<Napi::Number>()
|
|
82
|
+
.Int32Value(),
|
|
83
|
+
info[2]
|
|
84
|
+
.As<Napi::Object>()
|
|
85
|
+
.Get("device")
|
|
86
|
+
.As<Napi::Number>()
|
|
87
|
+
.Int32Value(),
|
|
88
|
+
info[2]
|
|
89
|
+
.As<Napi::Object>()
|
|
90
|
+
.Get("leadingBufferFrames")
|
|
91
|
+
.As<Napi::Number>()
|
|
92
|
+
.Int32Value(),
|
|
93
|
+
[&](std::vector<short> audio) {
|
|
94
|
+
SpeechRecorderCallbackData* data = new SpeechRecorderCallbackData();
|
|
95
|
+
data->event = "chunkStart";
|
|
96
|
+
data->audio = audio;
|
|
97
|
+
queue_.enqueue(data);
|
|
98
|
+
},
|
|
99
|
+
[&](std::vector<short> audio, bool speaking, double volume,
|
|
100
|
+
bool speech, double probability, int consecutiveSilence) {
|
|
101
|
+
SpeechRecorderCallbackData* data = new SpeechRecorderCallbackData();
|
|
102
|
+
data->event = "audio";
|
|
103
|
+
data->audio = audio;
|
|
104
|
+
data->speaking = speaking;
|
|
105
|
+
data->volume = volume;
|
|
106
|
+
data->speech = speech;
|
|
107
|
+
data->probability = probability;
|
|
108
|
+
data->consecutiveSilence = consecutiveSilence;
|
|
109
|
+
queue_.enqueue(data);
|
|
110
|
+
},
|
|
111
|
+
[&]() {
|
|
112
|
+
SpeechRecorderCallbackData* data = new SpeechRecorderCallbackData();
|
|
113
|
+
data->event = "chunkEnd";
|
|
114
|
+
queue_.enqueue(data);
|
|
115
|
+
},
|
|
116
|
+
info[2]
|
|
117
|
+
.As<Napi::Object>()
|
|
118
|
+
.Get("samplesPerFrame")
|
|
119
|
+
.As<Napi::Number>()
|
|
120
|
+
.Int32Value(),
|
|
121
|
+
info[2]
|
|
122
|
+
.As<Napi::Object>()
|
|
123
|
+
.Get("sampleRate")
|
|
124
|
+
.As<Napi::Number>()
|
|
125
|
+
.Int32Value(),
|
|
126
|
+
info[2]
|
|
127
|
+
.As<Napi::Object>()
|
|
128
|
+
.Get("sileroVadBufferSize")
|
|
129
|
+
.As<Napi::Number>()
|
|
130
|
+
.Int32Value(),
|
|
131
|
+
info[2]
|
|
132
|
+
.As<Napi::Object>()
|
|
133
|
+
.Get("sileroVadRateLimit")
|
|
134
|
+
.As<Napi::Number>()
|
|
135
|
+
.Int32Value(),
|
|
136
|
+
info[2]
|
|
137
|
+
.As<Napi::Object>()
|
|
138
|
+
.Get("sileroVadSilenceThreshold")
|
|
139
|
+
.As<Napi::Number>()
|
|
140
|
+
.DoubleValue(),
|
|
141
|
+
info[2]
|
|
142
|
+
.As<Napi::Object>()
|
|
143
|
+
.Get("sileroVadSpeakingThreshold")
|
|
144
|
+
.As<Napi::Number>()
|
|
145
|
+
.DoubleValue(),
|
|
146
|
+
info[2]
|
|
147
|
+
.As<Napi::Object>()
|
|
148
|
+
.Get("webrtcVadLevel")
|
|
149
|
+
.As<Napi::Number>()
|
|
150
|
+
.Int32Value(),
|
|
151
|
+
info[2]
|
|
152
|
+
.As<Napi::Object>()
|
|
153
|
+
.Get("webrtcVadBufferSize")
|
|
154
|
+
.As<Napi::Number>()
|
|
155
|
+
.Int32Value(),
|
|
156
|
+
info[2]
|
|
157
|
+
.As<Napi::Object>()
|
|
158
|
+
.Get("webrtcVadResultsSize")
|
|
159
|
+
.As<Napi::Number>()
|
|
160
|
+
.Int32Value(),
|
|
161
|
+
}),
|
|
162
|
+
processor_(modelPath_, options_) {}
|
|
163
|
+
|
|
164
|
+
// Decodes the WAV file named by info[0] to 16-bit PCM and feeds it, one
// frame of options_.samplesPerFrame samples at a time, to a dedicated chunk
// processor whose callbacks invoke the JavaScript callback directly.
//
// A JavaScript exception is raised when samplesPerFrame is not positive or
// when the file cannot be opened/decoded. A trailing partial frame is not
// processed.
void SpeechRecorder::ProcessFile(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();
  std::string path = info[0].As<Napi::String>().Utf8Value();

  const int samplesPerFrame = options_.samplesPerFrame;
  if (samplesPerFrame <= 0) {
    // A non-positive frame size would make the frame loop below never end.
    Napi::RangeError::New(env, "samplesPerFrame must be greater than 0")
        .ThrowAsJavaScriptException();
    return;
  }

  // we don't want to create two processors on startup, because loading the
  // silero model is expensive, so lazily create this instance only if this
  // method is actually called (which is probably not common)
  if (!processFileProcessor_) {
    speechrecorder::ChunkProcessorOptions options = options_;

    // These callbacks are stored inside processFileProcessor_ and are reused
    // by every later call of this method, so they must not refer to locals of
    // this call: `env` is captured by value and members are reached via
    // `this`.
    options.onChunkStart = [this, env](std::vector<short> audio) {
      Napi::Object object = Napi::Object::New(env);
      if (audio.size() > 0) {
        Napi::Int16Array buffer = Napi::Int16Array::New(env, audio.size());
        for (size_t i = 0; i < audio.size(); i++) {
          buffer[i] = audio[i];
        }

        object.Set("audio", buffer);
      }

      callback_.Value().Call({Napi::String::New(env, "chunkStart"), object});
    };

    options.onAudio = [this, env](std::vector<short> audio, bool speaking,
                                  double volume, bool speech,
                                  double probability, int consecutiveSilence) {
      Napi::Object object = Napi::Object::New(env);
      object.Set("speaking", Napi::Boolean::New(env, speaking));
      object.Set("volume", Napi::Number::New(env, volume));
      object.Set("speech", Napi::Boolean::New(env, speech));
      object.Set("probability", Napi::Number::New(env, probability));
      object.Set("consecutiveSilence",
                 Napi::Number::New(env,
                                   static_cast<double>(consecutiveSilence)));

      // The "audio" event is only delivered when samples are attached.
      if (audio.size() > 0) {
        Napi::Int16Array buffer = Napi::Int16Array::New(env, audio.size());
        for (size_t i = 0; i < audio.size(); i++) {
          buffer[i] = audio[i];
        }

        object.Set("audio", buffer);
        callback_.Value().Call({Napi::String::New(env, "audio"), object});
      }
    };

    options.onChunkEnd = [this, env] {
      callback_.Value().Call({Napi::String::New(env, "chunkEnd")});
    };

    processFileProcessor_ =
        std::make_unique<speechrecorder::ChunkProcessor>(modelPath_, options);
  }

  unsigned int channels = 0;
  unsigned int sampleRate = 0;
  drwav_uint64 frames = 0;

  // Owning handle: the decoded buffer is released on every exit path,
  // including an exception raised while processing.
  std::unique_ptr<short, void (*)(short*)> samples(
      drwav_open_file_and_read_pcm_frames_s16(path.c_str(), &channels,
                                              &sampleRate, &frames, nullptr),
      [](short* buffer) { drwav_free(buffer, nullptr); });
  if (!samples) {
    Napi::Error::New(env, "Could not read WAV file: " + path)
        .ThrowAsJavaScriptException();
    return;
  }

  // NOTE(review): `channels` and `sampleRate` are not checked. The loop below
  // walks the first `frames` samples of the buffer as one contiguous stream,
  // which looks like it assumes mono input recorded at the configured sample
  // rate - confirm against callers.
  processFileProcessor_->Reset();

  const int totalSamples = static_cast<int>(frames);
  const short* const begin = samples.get();
  for (int offset = 0; offset <= totalSamples - samplesPerFrame;
       offset += samplesPerFrame) {
    // Each frame is copied into its own buffer before being handed over.
    std::vector<short> frame(begin + offset, begin + offset + samplesPerFrame);
    processFileProcessor_->Process(frame.data());
  }
}
|
|
241
|
+
|
|
242
|
+
// Starts live processing: creates the thread-safe function wrapping the
// JavaScript callback, launches the thread that forwards queued events to it,
// and starts the chunk processor.
//
// Calling Start() while already running is a no-op. Calling it while the
// thread of a previous run has not been joined yet raises a JavaScript error.
void SpeechRecorder::Start(const Napi::CallbackInfo& info) {
  if (!stopped_) {
    // Already running; starting again would assign over a joinable thread_,
    // which terminates the process.
    return;
  }

  if (thread_.joinable()) {
    // The previous dispatch thread is joined by the finalizer of its
    // thread-safe function, which has not run yet.
    Napi::Error::New(info.Env(),
                     "Speech recorder is still stopping; try again later")
        .ThrowAsJavaScriptException();
    return;
  }

  stopped_ = false;
  threadSafeFunction_ = Napi::ThreadSafeFunction::New(
      info.Env(), callback_.Value(), "Speech Recorder Start", 0, 1,
      [this](Napi::Env) {
        // Finalizer: runs after the dispatch thread released the function.
        if (thread_.joinable()) {
          thread_.join();
        }
      });

  thread_ = std::thread([this] {
    while (!stopped_) {
      SpeechRecorderCallbackData* data = nullptr;
      if (queue_.try_dequeue(data)) {
        // On success threadSafeFunctionCallback_ receives the event; if the
        // call is rejected nobody else will, so release it here.
        const napi_status status =
            threadSafeFunction_.BlockingCall(data, threadSafeFunctionCallback_);
        if (status != napi_ok) {
          delete data;
        }
      }

      std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }

    threadSafeFunction_.Release();
  });

  processor_.Start();
}
|
|
266
|
+
|
|
267
|
+
void SpeechRecorder::Stop(const Napi::CallbackInfo& info) {
|
|
268
|
+
stopped_ = true;
|
|
269
|
+
processor_.Stop();
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
// Returns a JavaScript array with one object per entry reported by
// speechrecorder::GetDevices(). Each object carries the properties id, name,
// apiName, maxInputChannels, maxOutputChannels, defaultSampleRate,
// isDefaultInput and isDefaultOutput.
Napi::Value GetDevices(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();

  std::vector<speechrecorder::Device> found = speechrecorder::GetDevices();
  Napi::Array list = Napi::Array::New(env, found.size());

  size_t index = 0;
  for (const speechrecorder::Device& device : found) {
    Napi::Object entry = Napi::Object::New(env);
    entry.Set("id", device.id);
    entry.Set("name", device.name);
    entry.Set("apiName", device.apiName);
    entry.Set("maxInputChannels", device.maxInputChannels);
    entry.Set("maxOutputChannels", device.maxOutputChannels);
    entry.Set("defaultSampleRate", device.defaultSampleRate);
    entry.Set("isDefaultInput", device.isDefaultInput);
    entry.Set("isDefaultOutput", device.isDefaultOutput);
    list[index] = entry;
    ++index;
  }

  return list;
}
|
|
292
|
+
|
|
293
|
+
Napi::Object Init(Napi::Env env, Napi::Object exports) {
|
|
294
|
+
SpeechRecorder::Init(env, exports);
|
|
295
|
+
return exports;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
NODE_API_MODULE(addon, Init);
|