@streaming-sortformer-node/darwin-arm64 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/Release/sortformer-addon.node +0 -0
- package/index.js +1 -0
- package/package.json +30 -0
- package/src/DiarizeWorker.cpp +194 -0
- package/src/DiarizeWorker.h +90 -0
- package/src/SortformerModel.cpp +138 -0
- package/src/SortformerModel.h +41 -0
- package/src/StreamingSession.cpp +210 -0
- package/src/StreamingSession.h +56 -0
- package/src/addon.cpp +11 -0
|
Binary file
|
package/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Load the prebuilt native addon and re-export it unchanged as this package's API.
const nativeAddon = require('./build/Release/sortformer-addon.node');

module.exports = nativeAddon;
|
package/package.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@streaming-sortformer-node/darwin-arm64",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Native bindings for SortFormer on macOS ARM64",
|
|
5
|
+
"os": [
|
|
6
|
+
"darwin"
|
|
7
|
+
],
|
|
8
|
+
"cpu": [
|
|
9
|
+
"arm64"
|
|
10
|
+
],
|
|
11
|
+
"main": "./index.js",
|
|
12
|
+
"files": [
|
|
13
|
+
"index.js",
|
|
14
|
+
"build/Release/sortformer-addon.node",
|
|
15
|
+
"src"
|
|
16
|
+
],
|
|
17
|
+
"dependencies": {
|
|
18
|
+
"node-addon-api": "^7.0.0"
|
|
19
|
+
},
|
|
20
|
+
"devDependencies": {
|
|
21
|
+
"cmake-js": "^7.0.0"
|
|
22
|
+
},
|
|
23
|
+
"engines": {
|
|
24
|
+
"node": ">=18.0.0"
|
|
25
|
+
},
|
|
26
|
+
"scripts": {
|
|
27
|
+
"build": "cmake-js build",
|
|
28
|
+
"clean": "cmake-js clean"
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
#include "DiarizeWorker.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
|
|
4
|
+
|
|
5
|
+
namespace {

// Number of speakers output by the model; prediction buffers in this file
// hold this many values per frame.
constexpr int NUM_SPEAKERS = 4;

// RTTM text buffer budget per minute of audio.
// For 16kHz audio with a 160-sample hop, 1 minute = 6000 frames.
// Working estimate: ~100 bytes per RTTM line and ~60 lines per minute per
// speaker, i.e. about 24KB per minute across all speakers. 32KB per minute
// is used to leave headroom above that estimate.
constexpr int RTTM_BYTES_PER_MINUTE = 32 * 1024;

}  // namespace
|
|
13
|
+
|
|
14
|
+
// Captures everything the background job needs.
//   env      - N-API environment handed to the AsyncWorker base
//   ctx      - model context; stored as a raw pointer, not owned by the worker
//   audio    - samples, taken by value and moved into the worker
//   options  - diarization options, taken by value and moved into the worker
//   deferred - promise handle settled later by OnOK() / OnError()
DiarizeWorker::DiarizeWorker(Napi::Env env,
                             sortformer_context* ctx,
                             std::vector<float> audio,
                             DiarizeOptions options,
                             Napi::Promise::Deferred deferred)
    : Napi::AsyncWorker(env),
      ctx_(ctx),
      audio_(std::move(audio)),
      options_(std::move(options)),
      deferred_(deferred) {}
|
|
27
|
+
|
|
28
|
+
void DiarizeWorker::Execute() {
|
|
29
|
+
// IMPORTANT: This runs on a worker thread - NO Napi calls allowed!
|
|
30
|
+
|
|
31
|
+
if (ctx_ == nullptr) {
|
|
32
|
+
SetError("Model context is null or has been closed");
|
|
33
|
+
return;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
if (audio_.empty()) {
|
|
37
|
+
SetError("Audio data is empty");
|
|
38
|
+
return;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// Calculate expected number of frames
|
|
42
|
+
// Formula: n_frames = (n_samples - 400) / 160 + 1, then subsampled by 8
|
|
43
|
+
// Simplified: n_frames ≈ n_samples / (160 * 8) = n_samples / 1280
|
|
44
|
+
// Add safety margin for padding
|
|
45
|
+
int n_samples = static_cast<int>(audio_.size());
|
|
46
|
+
int n_frames_max = (n_samples / 1280) + 100; // Add margin for padding
|
|
47
|
+
|
|
48
|
+
// Allocate output buffer for predictions (n_frames * 4 speakers)
|
|
49
|
+
predictions_.resize(n_frames_max * NUM_SPEAKERS);
|
|
50
|
+
|
|
51
|
+
// Run diarization
|
|
52
|
+
n_frames_ = sortformer_diarize(
|
|
53
|
+
ctx_,
|
|
54
|
+
audio_.data(),
|
|
55
|
+
n_samples,
|
|
56
|
+
predictions_.data(),
|
|
57
|
+
n_frames_max);
|
|
58
|
+
|
|
59
|
+
if (n_frames_ < 0) {
|
|
60
|
+
SetError("Diarization failed");
|
|
61
|
+
return;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
// Trim predictions to actual frame count
|
|
65
|
+
predictions_.resize(n_frames_ * NUM_SPEAKERS);
|
|
66
|
+
|
|
67
|
+
// Calculate RTTM buffer size based on audio duration
|
|
68
|
+
// Audio duration in minutes = n_samples / (16000 * 60)
|
|
69
|
+
float duration_minutes = static_cast<float>(n_samples) / (16000.0f * 60.0f);
|
|
70
|
+
int rttm_size = static_cast<int>(std::ceil(duration_minutes + 1.0f) * RTTM_BYTES_PER_MINUTE);
|
|
71
|
+
rttm_size = std::max(rttm_size, 4096); // Minimum 4KB
|
|
72
|
+
|
|
73
|
+
// Allocate RTTM buffer
|
|
74
|
+
std::vector<char> rttm_buffer(rttm_size);
|
|
75
|
+
|
|
76
|
+
// Convert predictions to RTTM format
|
|
77
|
+
int rttm_bytes = sortformer_to_rttm(
|
|
78
|
+
predictions_.data(),
|
|
79
|
+
n_frames_,
|
|
80
|
+
options_.threshold,
|
|
81
|
+
options_.median_filter,
|
|
82
|
+
options_.filename.c_str(),
|
|
83
|
+
rttm_buffer.data(),
|
|
84
|
+
rttm_size);
|
|
85
|
+
|
|
86
|
+
if (rttm_bytes < 0) {
|
|
87
|
+
SetError("Failed to convert predictions to RTTM format");
|
|
88
|
+
return;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Store RTTM string (null-terminated by sortformer_to_rttm)
|
|
92
|
+
rttm_ = std::string(rttm_buffer.data(), rttm_bytes);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/**
 * Builds the JavaScript result and resolves the promise.
 *
 * The resolved object has four properties:
 *   rttm         - RTTM text produced in Execute()
 *   predictions  - Float32Array copy of predictions_
 *   frameCount   - n_frames_
 *   speakerCount - NUM_SPEAKERS
 */
void DiarizeWorker::OnOK() {
    Napi::Env env = Env();
    Napi::HandleScope scope(env);

    const size_t value_count = predictions_.size();
    const size_t byte_count = value_count * sizeof(float);

    // Copy the predictions into a fresh ArrayBuffer owned by JavaScript so the
    // result does not depend on this worker's storage.
    Napi::ArrayBuffer buffer = Napi::ArrayBuffer::New(env, byte_count);

    // Skip the copy for an empty result: either pointer may be null then, and
    // memcpy must not be given null pointers even for a zero length.
    if (byte_count != 0) {
        std::memcpy(buffer.Data(), predictions_.data(), byte_count);
    }

    Napi::Float32Array predictions = Napi::Float32Array::New(
        env,
        value_count,
        buffer,
        0);

    Napi::Object result = Napi::Object::New(env);
    result.Set("rttm", Napi::String::New(env, rttm_));
    result.Set("predictions", predictions);
    result.Set("frameCount", Napi::Number::New(env, n_frames_));
    result.Set("speakerCount", Napi::Number::New(env, NUM_SPEAKERS));

    // Resolve the promise with the result
    deferred_.Resolve(result);
}
|
|
130
|
+
|
|
131
|
+
// Failure path: reject the promise with the JavaScript value carried by `e`.
void DiarizeWorker::OnError(const Napi::Error& e) {
    const auto reason = e.Value();
    deferred_.Reject(reason);
}
|
|
135
|
+
|
|
136
|
+
// Maps a preset name to its LatencyPreset value.
// Recognised names: "offline" and "default" (both Offline), "low", "2s",
// "3s" and "5s". Any other string also yields Offline.
LatencyPreset DiarizeWorker::ParsePreset(const std::string& preset) {
    struct NamedPreset {
        const char* name;
        LatencyPreset value;
    };

    static const NamedPreset kKnownPresets[] = {
        {"offline", LatencyPreset::Offline},
        {"default", LatencyPreset::Offline},
        {"low", LatencyPreset::Low},
        {"2s", LatencyPreset::TwoSecond},
        {"3s", LatencyPreset::ThreeSecond},
        {"5s", LatencyPreset::FiveSecond},
    };

    for (const NamedPreset& candidate : kKnownPresets) {
        if (preset == candidate.name) {
            return candidate.value;
        }
    }

    // Unknown names fall back to the offline configuration.
    return LatencyPreset::Offline;
}
|
|
151
|
+
|
|
152
|
+
// Writes the four streaming parameters associated with `preset` into `params`.
// A value outside the five known presets leaves `params` untouched.
void DiarizeWorker::ApplyPreset(sortformer_params& params, LatencyPreset preset) {
    struct Tuning {
        int chunk_len;
        int right_context;
        int fifo_len;
        int spkcache_update_period;
    };

    //                                 chunk  right  fifo  spkcache period
    static const Tuning kOffline     = {188,   1,     0,   188};
    static const Tuning kLow         = {  6,   7,   188,   144};
    static const Tuning kTwoSecond   = { 15,  10,   100,   144};
    static const Tuning kThreeSecond = { 30,   7,   100,   100};
    static const Tuning kFiveSecond  = { 55,   7,   100,   100};

    const Tuning* chosen = nullptr;
    switch (preset) {
        case LatencyPreset::Offline:
            chosen = &kOffline;
            break;
        case LatencyPreset::Low:
            chosen = &kLow;
            break;
        case LatencyPreset::TwoSecond:
            chosen = &kTwoSecond;
            break;
        case LatencyPreset::ThreeSecond:
            chosen = &kThreeSecond;
            break;
        case LatencyPreset::FiveSecond:
            chosen = &kFiveSecond;
            break;
    }

    if (chosen == nullptr) {
        return;
    }

    params.chunk_len = chosen->chunk_len;
    params.right_context = chosen->right_context;
    params.fifo_len = chosen->fifo_len;
    params.spkcache_update_period = chosen->spkcache_update_period;
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
#ifndef DIARIZE_WORKER_H
|
|
2
|
+
#define DIARIZE_WORKER_H
|
|
3
|
+
|
|
4
|
+
#include <napi.h>
|
|
5
|
+
#include <vector>
|
|
6
|
+
#include <string>
|
|
7
|
+
|
|
8
|
+
#include "sortformer.h"
|
|
9
|
+
|
|
10
|
+
// Named latency configurations. Each comment lists the parameter values that
// DiarizeWorker::ApplyPreset assigns for that enumerator.
enum class LatencyPreset {
    // Default: chunk_len=188, right_context=1, fifo_len=0, spkcache_update_period=188
    Offline,

    // Low latency: chunk_len=6, right_context=7, fifo_len=188, spkcache_update_period=144
    Low,

    // 2s latency: chunk_len=15, right_context=10, fifo_len=100, spkcache_update_period=144
    TwoSecond,

    // 3s latency: chunk_len=30, right_context=7, fifo_len=100, spkcache_update_period=100
    ThreeSecond,

    // 5s latency: chunk_len=55, right_context=7, fifo_len=100, spkcache_update_period=100
    FiveSecond
};
|
|
18
|
+
|
|
19
|
+
// Diarization options passed from JavaScript
|
|
20
|
+
struct DiarizeOptions {
|
|
21
|
+
LatencyPreset preset = LatencyPreset::Offline;
|
|
22
|
+
float threshold = 0.5f;
|
|
23
|
+
int median_filter = 11;
|
|
24
|
+
std::string filename = "audio"; // For RTTM output
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* AsyncWorker for non-blocking speaker diarization inference.
|
|
29
|
+
*
|
|
30
|
+
* Runs sortformer_diarize() on a worker thread to avoid blocking the Node.js event loop.
|
|
31
|
+
* Returns a Promise that resolves with { rttm: string, predictions: Float32Array }.
|
|
32
|
+
*/
|
|
33
|
+
class DiarizeWorker : public Napi::AsyncWorker {
|
|
34
|
+
public:
|
|
35
|
+
/**
|
|
36
|
+
* Create a new DiarizeWorker.
|
|
37
|
+
*
|
|
38
|
+
* @param env The N-API environment
|
|
39
|
+
* @param ctx The sortformer context (must remain valid during execution)
|
|
40
|
+
* @param audio Audio samples (16kHz mono float32) - copied internally
|
|
41
|
+
* @param options Diarization options including latency preset
|
|
42
|
+
* @param deferred Promise deferred for async/await support
|
|
43
|
+
*/
|
|
44
|
+
DiarizeWorker(
|
|
45
|
+
Napi::Env env,
|
|
46
|
+
sortformer_context* ctx,
|
|
47
|
+
std::vector<float> audio,
|
|
48
|
+
DiarizeOptions options,
|
|
49
|
+
Napi::Promise::Deferred deferred);
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Execute diarization on worker thread.
|
|
53
|
+
* IMPORTANT: Cannot use any Napi objects here - runs off main thread.
|
|
54
|
+
*/
|
|
55
|
+
void Execute() override;
|
|
56
|
+
|
|
57
|
+
/**
|
|
58
|
+
* Called on main thread when Execute() completes successfully.
|
|
59
|
+
* Creates the result object and resolves the promise.
|
|
60
|
+
*/
|
|
61
|
+
void OnOK() override;
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Called on main thread when Execute() throws or SetError() is called.
|
|
65
|
+
* Rejects the promise with the error message.
|
|
66
|
+
*/
|
|
67
|
+
void OnError(const Napi::Error& e) override;
|
|
68
|
+
|
|
69
|
+
// Helper to parse latency preset from JavaScript string
|
|
70
|
+
static LatencyPreset ParsePreset(const std::string& preset);
|
|
71
|
+
|
|
72
|
+
// Helper to apply preset to sortformer_params
|
|
73
|
+
static void ApplyPreset(sortformer_params& params, LatencyPreset preset);
|
|
74
|
+
|
|
75
|
+
private:
|
|
76
|
+
// Input data (copied from JavaScript)
|
|
77
|
+
sortformer_context* ctx_;
|
|
78
|
+
std::vector<float> audio_;
|
|
79
|
+
DiarizeOptions options_;
|
|
80
|
+
|
|
81
|
+
// Promise for async/await support
|
|
82
|
+
Napi::Promise::Deferred deferred_;
|
|
83
|
+
|
|
84
|
+
// Results (populated in Execute, used in OnOK)
|
|
85
|
+
std::vector<float> predictions_;
|
|
86
|
+
std::string rttm_;
|
|
87
|
+
int n_frames_ = 0;
|
|
88
|
+
};
|
|
89
|
+
|
|
90
|
+
#endif // DIARIZE_WORKER_H
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
#include "SortformerModel.h"
|
|
2
|
+
#include "DiarizeWorker.h"
|
|
3
|
+
|
|
4
|
+
// Static constructor reference
|
|
5
|
+
Napi::FunctionReference SortformerModel::constructor;
|
|
6
|
+
|
|
7
|
+
Napi::Object SortformerModel::Init(Napi::Env env, Napi::Object exports) {
|
|
8
|
+
Napi::HandleScope scope(env);
|
|
9
|
+
|
|
10
|
+
Napi::Function func = DefineClass(env, "SortformerModel", {
|
|
11
|
+
InstanceMethod("close", &SortformerModel::Close),
|
|
12
|
+
InstanceMethod("diarize", &SortformerModel::Diarize),
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
constructor = Napi::Persistent(func);
|
|
16
|
+
constructor.SuppressDestruct();
|
|
17
|
+
|
|
18
|
+
exports.Set("SortformerModel", func);
|
|
19
|
+
return exports;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
// JavaScript constructor: new SortformerModel(modelPath).
// Throws TypeError when the path argument is missing or not a string, and
// Error when sortformer_init() cannot produce a context for that path.
SortformerModel::SortformerModel(const Napi::CallbackInfo& info)
    : Napi::ObjectWrap<SortformerModel>(info), ctx_(nullptr) {
    Napi::Env env = info.Env();

    const bool hasPathArg = info.Length() >= 1;
    if (!hasPathArg) {
        Napi::TypeError::New(env, "Model path is required").ThrowAsJavaScriptException();
        return;
    }

    const Napi::Value pathArg = info[0];
    if (!pathArg.IsString()) {
        Napi::TypeError::New(env, "Model path must be a string").ThrowAsJavaScriptException();
        return;
    }
    const std::string modelPath = pathArg.As<Napi::String>().Utf8Value();

    // Load the model with the library's default parameters.
    sortformer_params params = sortformer_default_params();
    ctx_ = sortformer_init(modelPath.c_str(), params);

    if (ctx_ != nullptr) {
        return;
    }

    Napi::Error::New(env, "Failed to load model from path: " + modelPath).ThrowAsJavaScriptException();
}
|
|
51
|
+
|
|
52
|
+
// Frees the context if close() has not already done so.
SortformerModel::~SortformerModel() { Cleanup(); }
|
|
55
|
+
|
|
56
|
+
void SortformerModel::Cleanup() {
|
|
57
|
+
if (ctx_ != nullptr) {
|
|
58
|
+
sortformer_free(ctx_);
|
|
59
|
+
ctx_ = nullptr;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// JavaScript `close()`: frees the model context immediately and returns
// undefined. Calling it again has no further effect.
//
// NOTE(review): the context pointer is also handed to DiarizeWorker and
// StreamingSession; nothing visible here stops it being freed while one of
// them still holds it - confirm callers only close an idle model.
Napi::Value SortformerModel::Close(const Napi::CallbackInfo& info) {
    Cleanup();
    return info.Env().Undefined();
}
|
|
70
|
+
|
|
71
|
+
/**
 * JavaScript `diarize(audio, options?)`.
 *
 * @param info[0] Float32Array of audio samples (must be non-empty)
 * @param info[1] optional object; numeric `threshold`, numeric `medianFilter`
 *                and string `filename` are read when present with the
 *                expected type, other properties are ignored
 * @return a Promise settled by a DiarizeWorker; on invalid arguments an
 *         exception is thrown and undefined is returned instead
 *
 * NOTE(review): the worker receives the raw ctx_ pointer and no reference to
 * this object, so nothing here keeps the context alive until the promise
 * settles - confirm callers do not close() the model while a job is pending.
 */
Napi::Value SortformerModel::Diarize(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();

    if (ctx_ == nullptr) {
        Napi::Error::New(env, "Model is closed or not initialized").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    if (info.Length() < 1) {
        Napi::TypeError::New(env, "Audio data is required").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    // Both "not a typed array" and "wrong element type" report the same error.
    if (!info[0].IsTypedArray() ||
        info[0].As<Napi::TypedArray>().TypedArrayType() != napi_float32_array) {
        Napi::TypeError::New(env, "Audio must be a Float32Array").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    Napi::Float32Array audioArray = info[0].As<Napi::Float32Array>();
    const size_t audioLength = audioArray.ElementLength();

    if (audioLength == 0) {
        Napi::Error::New(env, "Audio data cannot be empty").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    // Take a private copy in one bulk operation: the worker thread must not
    // read memory that JavaScript can still mutate or release.
    const float* samples = audioArray.Data();
    std::vector<float> audio(samples, samples + audioLength);

    DiarizeOptions options;

    if (info.Length() >= 2 && info[1].IsObject()) {
        Napi::Object opts = info[1].As<Napi::Object>();

        // A missing property reads as undefined and fails the type test, so a
        // single Get per option covers both "absent" and "wrong type".
        const Napi::Value threshold = opts.Get("threshold");
        if (threshold.IsNumber()) {
            options.threshold = threshold.As<Napi::Number>().FloatValue();
        }

        const Napi::Value medianFilter = opts.Get("medianFilter");
        if (medianFilter.IsNumber()) {
            options.median_filter = medianFilter.As<Napi::Number>().Int32Value();
        }

        const Napi::Value filename = opts.Get("filename");
        if (filename.IsString()) {
            options.filename = filename.As<Napi::String>().Utf8Value();
        }
    }

    Napi::Promise::Deferred deferred = Napi::Promise::Deferred::New(env);

    // The worker is heap-allocated and handed to the N-API work queue.
    DiarizeWorker* worker = new DiarizeWorker(
        env,
        ctx_,
        std::move(audio),
        std::move(options),
        deferred);

    worker->Queue();

    return deferred.Promise();
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
#ifndef SORTFORMER_MODEL_H
|
|
2
|
+
#define SORTFORMER_MODEL_H
|
|
3
|
+
|
|
4
|
+
#include <napi.h>
|
|
5
|
+
#include "sortformer.h"
|
|
6
|
+
|
|
7
|
+
// JavaScript-visible wrapper that owns one sortformer_context.
// The context is created in the constructor and released by close() or the
// destructor, whichever comes first.
class SortformerModel : public Napi::ObjectWrap<SortformerModel> {
public:
    // Registers the class with N-API and attaches it to `exports`.
    static Napi::Object Init(Napi::Env env, Napi::Object exports);

    // Expects info[0] to be the model path string.
    SortformerModel(const Napi::CallbackInfo& info);

    // Releases the context if it is still held.
    ~SortformerModel();

    // True while a context is held (i.e. loaded and not yet closed).
    bool IsValid() const { return ctx_ != nullptr; }

    // Raw context pointer for other native classes; null once closed.
    sortformer_context* GetContext() { return ctx_; }

private:
    // JS `close()`: explicit release of the context.
    Napi::Value Close(const Napi::CallbackInfo& info);

    // JS `diarize()`: queues asynchronous inference via DiarizeWorker.
    Napi::Value Diarize(const Napi::CallbackInfo& info);

    // Shared release logic used by Close() and the destructor.
    void Cleanup();

    // Persistent reference to the JS constructor function.
    static Napi::FunctionReference constructor;

    // The wrapped native context; null when not loaded or after close.
    sortformer_context* ctx_;
};
|
|
40
|
+
|
|
41
|
+
#endif // SORTFORMER_MODEL_H
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
#include "StreamingSession.h"
|
|
2
|
+
#include "SortformerModel.h"
|
|
3
|
+
|
|
4
|
+
// Storage for the persistent reference to the JavaScript class constructor.
Napi::FunctionReference StreamingSession::constructor;

// Defines the JavaScript class "StreamingSession" with its six instance
// methods, keeps a persistent reference to the constructor, and publishes the
// class on `exports`.
Napi::Object StreamingSession::Init(Napi::Env env, Napi::Object exports) {
    Napi::HandleScope scope(env);

    Napi::Function sessionClass = DefineClass(
        env,
        "StreamingSession",
        {
            InstanceMethod("feed", &StreamingSession::Feed),
            InstanceMethod("flush", &StreamingSession::Flush),
            InstanceMethod("reset", &StreamingSession::Reset),
            InstanceMethod("close", &StreamingSession::Close),
            InstanceMethod("getTotalFrames", &StreamingSession::GetTotalFrames),
            InstanceMethod("isClosed", &StreamingSession::IsClosed),
        });

    constructor = Napi::Persistent(sessionClass);
    constructor.SuppressDestruct();

    exports.Set("StreamingSession", sessionClass);
    return exports;
}
|
|
24
|
+
|
|
25
|
+
/**
 * JavaScript constructor: new StreamingSession(model, preset).
 *
 * @param info[0] SortformerModel instance that is still open
 * @param info[1] preset number (0=low, 1=2s, 2=3s, 3=5s)
 *
 * Throws TypeError for missing or wrongly typed arguments, and Error when the
 * model is closed or the native stream cannot be created.
 *
 * NOTE(review): only the raw context pointer is stored; no reference to the
 * model object is kept, so the model must outlive this session - confirm the
 * JavaScript layer guarantees that.
 * NOTE(review): Unwrap cannot tell a SortformerModel apart from another
 * wrapped native object - presumably the JavaScript layer only passes real
 * models; verify.
 */
StreamingSession::StreamingSession(const Napi::CallbackInfo& info)
    : Napi::ObjectWrap<StreamingSession>(info), stream_(nullptr), ctx_(nullptr), closed_(false), total_frames_(0) {
    Napi::Env env = info.Env();

    if (info.Length() < 2) {
        Napi::TypeError::New(env, "Expected (model, preset) arguments").ThrowAsJavaScriptException();
        return;
    }

    // Get model context from SortformerModel
    if (!info[0].IsObject()) {
        Napi::TypeError::New(env, "First argument must be a SortformerModel").ThrowAsJavaScriptException();
        return;
    }

    Napi::Object modelObj = info[0].As<Napi::Object>();
    SortformerModel* model = Napi::ObjectWrap<SortformerModel>::Unwrap(modelObj);

    // When the argument is not a wrapped native object, Unwrap has already
    // raised an error. In builds without C++ exceptions control returns here
    // with that error pending, and throwing a second one is not allowed, so
    // let the pending error propagate as-is.
    if (env.IsExceptionPending()) {
        return;
    }

    if (!model || !model->IsValid()) {
        Napi::Error::New(env, "Model is closed or invalid").ThrowAsJavaScriptException();
        return;
    }

    ctx_ = model->GetContext();

    // Get preset
    if (!info[1].IsNumber()) {
        Napi::TypeError::New(env, "Second argument must be a preset number").ThrowAsJavaScriptException();
        return;
    }

    // The number is passed through to the library's preset enum unchanged.
    const int preset_num = info[1].As<Napi::Number>().Int32Value();
    const sortformer_stream_preset preset = static_cast<sortformer_stream_preset>(preset_num);

    // Initialize streaming session
    stream_ = sortformer_stream_init(ctx_, preset);

    if (stream_ == nullptr) {
        Napi::Error::New(env, "Failed to create streaming session").ThrowAsJavaScriptException();
        return;
    }
}
|
|
72
|
+
|
|
73
|
+
// Frees the stream state if close() has not already done so.
StreamingSession::~StreamingSession() { Cleanup(); }
|
|
76
|
+
|
|
77
|
+
void StreamingSession::Cleanup() {
|
|
78
|
+
if (stream_ != nullptr) {
|
|
79
|
+
sortformer_stream_free(stream_);
|
|
80
|
+
stream_ = nullptr;
|
|
81
|
+
}
|
|
82
|
+
closed_ = true;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// JavaScript `feed(audio)`: pushes a Float32Array of samples into the stream
// and returns { predictions: Float32Array, frameCount: number } holding the
// frames produced by this call (4 values per frame).
// An empty input returns an empty result without calling the native library.
// Throws Error when the session is closed or the native feed fails, and
// TypeError when the argument is not a Float32Array.
Napi::Value StreamingSession::Feed(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();

    const bool usable = !closed_ && stream_ != nullptr;
    if (!usable) {
        Napi::Error::New(env, "Session is closed").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    if (info.Length() < 1 || !info[0].IsTypedArray()) {
        Napi::TypeError::New(env, "Expected Float32Array argument").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    if (info[0].As<Napi::TypedArray>().TypedArrayType() != napi_float32_array) {
        Napi::TypeError::New(env, "Audio must be a Float32Array").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    Napi::Float32Array audioArray = info[0].As<Napi::Float32Array>();
    const size_t audioLength = audioArray.ElementLength();

    if (audioLength == 0) {
        // Nothing to process: hand back an empty result.
        Napi::Object emptyResult = Napi::Object::New(env);
        emptyResult.Set("predictions", Napi::Float32Array::New(env, 0));
        emptyResult.Set("frameCount", Napi::Number::New(env, 0));
        return emptyResult;
    }

    // Values stored per output frame.
    constexpr int kValuesPerFrame = 4;

    // Generous output capacity: one frame per 160-sample hop, plus margin.
    const int max_frames = static_cast<int>((audioLength / 160) + 100);
    std::vector<float> probs_out(max_frames * kValuesPerFrame);

    // Feed to streaming pipeline
    const int n_frames = sortformer_stream_feed(
        stream_, audioArray.Data(), audioLength, probs_out.data(), max_frames);

    if (n_frames < 0) {
        Napi::Error::New(env, "Stream feed failed").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    total_frames_ += n_frames;

    // Copy the produced values into a JavaScript-owned Float32Array.
    const int value_count = n_frames * kValuesPerFrame;
    Napi::Float32Array predictions = Napi::Float32Array::New(env, value_count);
    float* out = predictions.Data();
    for (int idx = 0; idx < value_count; ++idx) {
        out[idx] = probs_out[idx];
    }

    Napi::Object result = Napi::Object::New(env);
    result.Set("predictions", predictions);
    result.Set("frameCount", Napi::Number::New(env, n_frames));
    return result;
}
|
|
147
|
+
|
|
148
|
+
// JavaScript `flush()`: asks the native stream for whatever frames are still
// buffered and returns { predictions: Float32Array, frameCount: number }.
// Throws Error when the session is closed or the native flush fails.
// The output capacity passed to the library is fixed at 1000 frames.
Napi::Value StreamingSession::Flush(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();

    const bool usable = !closed_ && stream_ != nullptr;
    if (!usable) {
        Napi::Error::New(env, "Session is closed").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    // Values stored per output frame.
    constexpr int kValuesPerFrame = 4;

    const int max_frames = 1000;
    std::vector<float> probs_out(max_frames * kValuesPerFrame);

    const int n_frames = sortformer_stream_flush(stream_, probs_out.data(), max_frames);

    if (n_frames < 0) {
        Napi::Error::New(env, "Stream flush failed").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    total_frames_ += n_frames;

    // Copy the produced values into a JavaScript-owned Float32Array.
    const int value_count = n_frames * kValuesPerFrame;
    Napi::Float32Array predictions = Napi::Float32Array::New(env, value_count);
    float* out = predictions.Data();
    for (int idx = 0; idx < value_count; ++idx) {
        out[idx] = probs_out[idx];
    }

    Napi::Object result = Napi::Object::New(env);
    result.Set("predictions", predictions);
    result.Set("frameCount", Napi::Number::New(env, n_frames));
    return result;
}
|
|
179
|
+
|
|
180
|
+
// JavaScript `reset()`: resets the native stream state and zeroes the running
// frame counter. Throws Error when the session is closed. Returns undefined.
Napi::Value StreamingSession::Reset(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();

    const bool usable = !closed_ && stream_ != nullptr;
    if (!usable) {
        Napi::Error::New(env, "Session is closed").ThrowAsJavaScriptException();
        return env.Undefined();
    }

    total_frames_ = 0;
    sortformer_stream_reset(stream_);

    return env.Undefined();
}
|
|
193
|
+
|
|
194
|
+
// JavaScript `close()`: releases the stream state and marks the session
// closed. Calling it again has no further effect. Returns undefined.
Napi::Value StreamingSession::Close(const Napi::CallbackInfo& info) {
    Cleanup();
    return info.Env().Undefined();
}
|
|
201
|
+
|
|
202
|
+
// JavaScript `getTotalFrames()`: frames returned by feed()/flush() since
// construction or the last reset(), as a JS number.
Napi::Value StreamingSession::GetTotalFrames(const Napi::CallbackInfo& info) {
    const double frames = static_cast<double>(total_frames_);
    return Napi::Number::New(info.Env(), frames);
}
|
|
206
|
+
|
|
207
|
+
// JavaScript `isClosed()`: true once the session has been closed.
Napi::Value StreamingSession::IsClosed(const Napi::CallbackInfo& info) {
    return Napi::Boolean::New(info.Env(), closed_);
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#ifndef STREAMING_SESSION_H
|
|
2
|
+
#define STREAMING_SESSION_H
|
|
3
|
+
|
|
4
|
+
#include <napi.h>
|
|
5
|
+
#include "sortformer.h"
|
|
6
|
+
|
|
7
|
+
// JavaScript-visible wrapper around one native streaming diarization state.
// The stream state is owned by this object; the model context it was created
// from is only borrowed.
class StreamingSession : public Napi::ObjectWrap<StreamingSession> {
public:
    // Registers the class with N-API and attaches it to `exports`.
    static Napi::Object Init(Napi::Env env, Napi::Object exports);

    // Expects (SortformerModel instance, preset number).
    StreamingSession(const Napi::CallbackInfo& info);

    // Releases the stream state if it is still held.
    ~StreamingSession();

private:
    // --- methods exposed to JavaScript ---

    // feed(audio): push samples, receive the predictions they produce.
    Napi::Value Feed(const Napi::CallbackInfo& info);

    // flush(): drain remaining buffered audio.
    Napi::Value Flush(const Napi::CallbackInfo& info);

    // reset(): reset streaming state and the frame counter.
    Napi::Value Reset(const Napi::CallbackInfo& info);

    // close(): free resources and mark the session closed.
    Napi::Value Close(const Napi::CallbackInfo& info);

    // getTotalFrames(): frames output so far.
    Napi::Value GetTotalFrames(const Napi::CallbackInfo& info);

    // isClosed(): whether close() has happened.
    Napi::Value IsClosed(const Napi::CallbackInfo& info);

    // Shared release logic used by Close() and the destructor.
    void Cleanup();

    // Persistent reference to the JS constructor function.
    static Napi::FunctionReference constructor;

    // Native streaming state (owned; freed in Cleanup()).
    sortformer_stream_state* stream_;

    // Model context the stream was created from (not owned - must outlive
    // the session).
    sortformer_context* ctx_;

    // Set once the session has been closed.
    bool closed_;

    // Running count of frames returned to JavaScript.
    int64_t total_frames_;
};
|
|
55
|
+
|
|
56
|
+
#endif // STREAMING_SESSION_H
|
package/src/addon.cpp
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#include <napi.h>
|
|
2
|
+
#include "SortformerModel.h"
|
|
3
|
+
#include "StreamingSession.h"
|
|
4
|
+
|
|
5
|
+
Napi::Object Init(Napi::Env env, Napi::Object exports) {
|
|
6
|
+
SortformerModel::Init(env, exports);
|
|
7
|
+
StreamingSession::Init(env, exports);
|
|
8
|
+
return exports;
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
NODE_API_MODULE(sortformer, Init)
|