uneeq-js 2.46.7 → 2.47.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +1 -1
- package/dist/index.js.LICENSE.txt +6 -0
- package/dist/src/lib/vad/frame-processor.d.ts +89 -0
- package/dist/src/lib/vad/messages.d.ts +6 -0
- package/dist/src/lib/vad/models.d.ts +23 -0
- package/dist/src/lib/vad/real-time-vad.d.ts +61 -0
- package/dist/src/service/speech-handler.d.ts +25 -0
- package/dist/src/types/MessageTypes.d.ts +26 -0
- package/dist/src/types/SpeechHandlerOptions.d.ts +11 -0
- package/dist/src/types/SpeechRecognitionMessageAction.d.ts +4 -0
- package/dist/src/types/SpeechRecognitionStartMessage.d.ts +9 -0
- package/dist/src/types/SpeechRecognitionStopMessage.d.ts +4 -0
- package/dist/src/types/SpeechTranscriptionResult.d.ts +6 -0
- package/dist/src/types/UneeqOptions.d.ts +9 -0
- package/dist/src/uneeq.d.ts +1 -0
- package/dist/umd/index.js +2465 -192
- package/package.json +5 -1
- package/readme.md +6 -0
|
@@ -5,6 +5,12 @@
|
|
|
5
5
|
* @license MIT
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
+
/*!
|
|
9
|
+
* ONNX Runtime Web v1.15.1
|
|
10
|
+
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
11
|
+
* Licensed under the MIT License.
|
|
12
|
+
*/
|
|
13
|
+
|
|
8
14
|
/*! ieee754. BSD-3-Clause License. Feross Aboukhadijeh <https://feross.org/opensource> */
|
|
9
15
|
|
|
10
16
|
/*! queue-microtask. MIT License. Feross Aboukhadijeh <https://feross.org/opensource> */
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import { Message } from './messages';
|
|
2
|
+
import { SpeechProbabilities } from './models';
|
|
3
|
+
export interface FrameProcessorOptions {
|
|
4
|
+
/** Threshold over which values returned by the Silero VAD model will be considered as positively indicating speech.
|
|
5
|
+
* The Silero VAD model is run on each frame. This number should be between 0 and 1.
|
|
6
|
+
*/
|
|
7
|
+
positiveSpeechThreshold: number;
|
|
8
|
+
/** Threshold under which values returned by the Silero VAD model will be considered as indicating an absence of speech.
|
|
9
|
+
* Note that the creators of the Silero VAD have historically set this number at 0.15 less than `positiveSpeechThreshold`.
|
|
10
|
+
*/
|
|
11
|
+
negativeSpeechThreshold: number;
|
|
12
|
+
/** After a VAD value under the `negativeSpeechThreshold` is observed, the algorithm will wait `redemptionFrames` frames
|
|
13
|
+
* before running `onSpeechEnd`. If the model returns a value over `positiveSpeechThreshold` during this grace period, then
|
|
14
|
+
* the algorithm will consider the previously-detected "speech end" as having been a false negative.
|
|
15
|
+
*/
|
|
16
|
+
redemptionFrames: number;
|
|
17
|
+
/** Number of audio samples (under a sample rate of 16000) to comprise one "frame" to feed to the Silero VAD model.
|
|
18
|
+
* The `frame` serves as a unit of measurement of lengths of audio segments and many other parameters are defined in terms of
|
|
19
|
+
* frames. The authors of the Silero VAD model offer the following warning:
|
|
20
|
+
* > WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and
|
|
21
|
+
* > 256, 512, 768 samples for 8000 sample rate.
|
|
22
|
+
* > Values other than these may affect model performance!!
|
|
23
|
+
* In this context, audio fed to the VAD model always has sample rate 16000. It is probably a good idea to leave this at 1536.
|
|
24
|
+
*/
|
|
25
|
+
frameSamples: number;
|
|
26
|
+
/** Number of frames to prepend to the audio segment that will be passed to `onSpeechEnd`. */
|
|
27
|
+
preSpeechPadFrames: number;
|
|
28
|
+
/** If an audio segment is detected as a speech segment according to the initial algorithm but it has fewer than `minSpeechFrames`,
|
|
29
|
+
* it will be discarded and `onVADMisfire` will be run instead of `onSpeechEnd`.
|
|
30
|
+
*/
|
|
31
|
+
minSpeechFrames: number;
|
|
32
|
+
}
|
|
33
|
+
export declare const defaultFrameProcessorOptions: FrameProcessorOptions;
|
|
34
|
+
export declare function validateOptions(options: FrameProcessorOptions): void;
|
|
35
|
+
export interface FrameProcessorInterface {
|
|
36
|
+
resume: () => void;
|
|
37
|
+
process: (arr: Float32Array) => Promise<{
|
|
38
|
+
probs?: SpeechProbabilities;
|
|
39
|
+
msg?: Message;
|
|
40
|
+
audio?: Float32Array;
|
|
41
|
+
}>;
|
|
42
|
+
endSegment: () => {
|
|
43
|
+
msg?: Message;
|
|
44
|
+
audio?: Float32Array;
|
|
45
|
+
};
|
|
46
|
+
}
|
|
47
|
+
export declare class FrameProcessor implements FrameProcessorInterface {
|
|
48
|
+
modelProcessFunc: (frame: Float32Array) => Promise<SpeechProbabilities>;
|
|
49
|
+
modelResetFunc: () => any;
|
|
50
|
+
options: FrameProcessorOptions;
|
|
51
|
+
speaking: boolean;
|
|
52
|
+
audioBuffer: Array<{
|
|
53
|
+
frame: Float32Array;
|
|
54
|
+
isSpeech: boolean;
|
|
55
|
+
}>;
|
|
56
|
+
redemptionCounter: number;
|
|
57
|
+
active: boolean;
|
|
58
|
+
constructor(modelProcessFunc: (frame: Float32Array) => Promise<SpeechProbabilities>, modelResetFunc: () => any, options: FrameProcessorOptions);
|
|
59
|
+
reset: () => void;
|
|
60
|
+
pause: () => void;
|
|
61
|
+
resume: () => void;
|
|
62
|
+
endSegment: () => {
|
|
63
|
+
msg: Message;
|
|
64
|
+
audio: Float32Array;
|
|
65
|
+
} | {
|
|
66
|
+
msg: Message;
|
|
67
|
+
audio?: undefined;
|
|
68
|
+
} | {
|
|
69
|
+
msg?: undefined;
|
|
70
|
+
audio?: undefined;
|
|
71
|
+
};
|
|
72
|
+
process: (frame: Float32Array) => Promise<{
|
|
73
|
+
probs?: undefined;
|
|
74
|
+
msg?: undefined;
|
|
75
|
+
audio?: undefined;
|
|
76
|
+
} | {
|
|
77
|
+
probs: SpeechProbabilities;
|
|
78
|
+
msg: Message;
|
|
79
|
+
audio?: undefined;
|
|
80
|
+
} | {
|
|
81
|
+
probs: SpeechProbabilities;
|
|
82
|
+
msg: Message;
|
|
83
|
+
audio: Float32Array;
|
|
84
|
+
} | {
|
|
85
|
+
probs: SpeechProbabilities;
|
|
86
|
+
msg?: undefined;
|
|
87
|
+
audio?: undefined;
|
|
88
|
+
}>;
|
|
89
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
export type ONNXRuntimeAPI = any;
|
|
2
|
+
export type ModelFetcher = () => Promise<ArrayBuffer>;
|
|
3
|
+
export interface SpeechProbabilities {
|
|
4
|
+
notSpeech: number;
|
|
5
|
+
isSpeech: number;
|
|
6
|
+
}
|
|
7
|
+
export interface Model {
|
|
8
|
+
reset_state: () => void;
|
|
9
|
+
process: (arr: Float32Array) => Promise<SpeechProbabilities>;
|
|
10
|
+
}
|
|
11
|
+
export declare class Silero {
|
|
12
|
+
private ort;
|
|
13
|
+
private modelFetcher;
|
|
14
|
+
static new: (ort: ONNXRuntimeAPI, modelFetcher: ModelFetcher) => Promise<Silero>;
|
|
15
|
+
_session: any;
|
|
16
|
+
_h: any;
|
|
17
|
+
_c: any;
|
|
18
|
+
_sr: any;
|
|
19
|
+
constructor(ort: ONNXRuntimeAPI, modelFetcher: ModelFetcher);
|
|
20
|
+
init: () => Promise<void>;
|
|
21
|
+
reset_state: () => void;
|
|
22
|
+
process: (audioFrame: Float32Array) => Promise<SpeechProbabilities>;
|
|
23
|
+
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* This VAD module is a fork from the following project:
|
|
3
|
+
* https://github.com/ricky0123/vad/tree/master/packages/web
|
|
4
|
+
*
|
|
5
|
+
* It has been modified to only include what is required for our
|
|
6
|
+
* own VAD implementation.
|
|
7
|
+
*
|
|
8
|
+
* This module is a wrapper to bring the following ML Model into
|
|
9
|
+
* JS: https://github.com/snakers4/silero-vad
|
|
10
|
+
*/
|
|
11
|
+
import { FrameProcessor, FrameProcessorOptions } from './frame-processor';
|
|
12
|
+
import { SpeechProbabilities } from './models';
|
|
13
|
+
interface RealTimeVADCallbacks {
|
|
14
|
+
/** Callback to run after each frame. The size (number of samples) of a frame is given by `frameSamples`. */
|
|
15
|
+
onFrameProcessed: (probabilities: SpeechProbabilities) => any;
|
|
16
|
+
/** Callback to run if speech start was detected but `onSpeechEnd` will not be run because the
|
|
17
|
+
* audio segment is smaller than `minSpeechFrames`.
|
|
18
|
+
*/
|
|
19
|
+
onVADMisfire: () => any;
|
|
20
|
+
/** Callback to run when speech start is detected */
|
|
21
|
+
onSpeechStart: () => any;
|
|
22
|
+
/**
|
|
23
|
+
* Callback to run when speech end is detected.
|
|
24
|
+
* Takes as arg a Float32Array of audio samples between -1 and 1, sample rate 16000.
|
|
25
|
+
* This will not run if the audio segment is smaller than `minSpeechFrames`.
|
|
26
|
+
*/
|
|
27
|
+
onSpeechEnd: (audio: Float32Array) => any;
|
|
28
|
+
}
|
|
29
|
+
interface RealTimeVADOptionsWithStream extends FrameProcessorOptions, RealTimeVADCallbacks {
|
|
30
|
+
stream: MediaStream;
|
|
31
|
+
assetBasePath: string;
|
|
32
|
+
}
|
|
33
|
+
export type RealTimeVADOptions = RealTimeVADOptionsWithStream;
|
|
34
|
+
export declare const defaultRealTimeVADOptions: RealTimeVADOptions;
|
|
35
|
+
export declare class MicVAD {
|
|
36
|
+
options: RealTimeVADOptions;
|
|
37
|
+
static new(options?: Partial<RealTimeVADOptions>): Promise<MicVAD>;
|
|
38
|
+
audioContext: AudioContext;
|
|
39
|
+
stream: MediaStream;
|
|
40
|
+
audioNodeVAD: AudioNodeVAD;
|
|
41
|
+
listening: boolean;
|
|
42
|
+
constructor(options: RealTimeVADOptions);
|
|
43
|
+
init: () => Promise<void>;
|
|
44
|
+
pause: () => void;
|
|
45
|
+
start: () => void;
|
|
46
|
+
}
|
|
47
|
+
export declare class AudioNodeVAD {
|
|
48
|
+
ctx: AudioContext;
|
|
49
|
+
options: RealTimeVADOptions;
|
|
50
|
+
static new(ctx: AudioContext, options?: Partial<RealTimeVADOptions>): Promise<AudioNodeVAD>;
|
|
51
|
+
frameProcessor: FrameProcessor;
|
|
52
|
+
entryNode: AudioNode;
|
|
53
|
+
constructor(ctx: AudioContext, options: RealTimeVADOptions);
|
|
54
|
+
pause: () => void;
|
|
55
|
+
start: () => void;
|
|
56
|
+
receive: (node: AudioNode) => void;
|
|
57
|
+
processFrame: (frame: Float32Array) => Promise<void>;
|
|
58
|
+
init: () => Promise<void>;
|
|
59
|
+
modelFetcher: () => Promise<ArrayBuffer>;
|
|
60
|
+
}
|
|
61
|
+
export {};
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { SpeechHandlerOptions } from '../types/SpeechHandlerOptions';
|
|
2
|
+
export declare class SpeechHandler {
|
|
3
|
+
private options;
|
|
4
|
+
private ws;
|
|
5
|
+
private mediaRecorder;
|
|
6
|
+
private logPrefix;
|
|
7
|
+
private speechBuffer;
|
|
8
|
+
private micVad;
|
|
9
|
+
private recordingLive;
|
|
10
|
+
private headerBlob;
|
|
11
|
+
private OpusMediaRecorder;
|
|
12
|
+
constructor(options: SpeechHandlerOptions);
|
|
13
|
+
private initWebsocket;
|
|
14
|
+
private loadSavedAudioHeaders;
|
|
15
|
+
private initVoiceActivityDetection;
|
|
16
|
+
private onVadSpeechStart;
|
|
17
|
+
private onVadSpeechEnd;
|
|
18
|
+
private handleWebsocketMessage;
|
|
19
|
+
private sendStartMessage;
|
|
20
|
+
private sendStopMessage;
|
|
21
|
+
private initMicrophone;
|
|
22
|
+
private initMediaRecorder;
|
|
23
|
+
private mediaRecorderOnData;
|
|
24
|
+
private handleTranscriptionResult;
|
|
25
|
+
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { ClientPerformanceStats } from './ClientPerformanceStats';
|
|
2
2
|
import { PublishSubscribeState } from './PublishSubscribeState';
|
|
3
|
+
import { SpeechTranscriptionResult } from './SpeechTranscriptionResult';
|
|
3
4
|
import { Devices } from './types';
|
|
4
5
|
export interface UneeqMessage {
|
|
5
6
|
uneeqMessageType: UneeqMessageType;
|
|
@@ -32,6 +33,9 @@ export declare enum UneeqMessageType {
|
|
|
32
33
|
FinishedSpeaking = "FinishedSpeaking",
|
|
33
34
|
AvatarTextInputFinished = "AvatarTextInputFinished",
|
|
34
35
|
AvatarQuestionText = "AvatarQuestionText",
|
|
36
|
+
SpeechTranscription = "SpeechTranscription",
|
|
37
|
+
UserStartedSpeaking = "UserStartedSpeaking",
|
|
38
|
+
UserStoppedSpeaking = "UserStoppedSpeaking",
|
|
35
39
|
DevicePermissionAllowed = "DevicePermissionAllowed",
|
|
36
40
|
RecordingStarted = "RecordingStarted",
|
|
37
41
|
RecordingStopped = "RecordingStopped",
|
|
@@ -243,6 +247,28 @@ export declare class AvatarQuestionMessage implements UneeqMessage {
|
|
|
243
247
|
uneeqMessageType: UneeqMessageType;
|
|
244
248
|
constructor(question: any, transcriptId: string);
|
|
245
249
|
}
|
|
250
|
+
/**
|
|
251
|
+
* A speech transcription result (see `SpeechTranscriptionResult`)
|
|
252
|
+
*/
|
|
253
|
+
export declare class SpeechTranscriptionMessage implements UneeqMessage {
|
|
254
|
+
readonly speechTranscription: SpeechTranscriptionResult;
|
|
255
|
+
uneeqMessageType: UneeqMessageType;
|
|
256
|
+
constructor(speechTranscription: SpeechTranscriptionResult);
|
|
257
|
+
}
|
|
258
|
+
/**
|
|
259
|
+
* The user has started speaking
|
|
260
|
+
*/
|
|
261
|
+
export declare class UserStartedSpeakingMessage implements UneeqMessage {
|
|
262
|
+
uneeqMessageType: UneeqMessageType;
|
|
263
|
+
constructor();
|
|
264
|
+
}
|
|
265
|
+
/**
|
|
266
|
+
* The user has stopped speaking
|
|
267
|
+
*/
|
|
268
|
+
export declare class UserStoppedSpeakingMessage implements UneeqMessage {
|
|
269
|
+
uneeqMessageType: UneeqMessageType;
|
|
270
|
+
constructor();
|
|
271
|
+
}
|
|
246
272
|
/**
|
|
247
273
|
* Digital human has started speaking the message
|
|
248
274
|
*/
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { Subject } from 'rxjs';
|
|
2
|
+
import { API } from '../service/api';
|
|
3
|
+
export interface SpeechHandlerOptions {
|
|
4
|
+
apiUrl: string;
|
|
5
|
+
sessionId: string;
|
|
6
|
+
api: API;
|
|
7
|
+
messages: Subject<any>;
|
|
8
|
+
assetBasePath?: string;
|
|
9
|
+
hintPhrases: string;
|
|
10
|
+
locales: string;
|
|
11
|
+
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { SpeechRecognitionMessageAction } from './SpeechRecognitionMessageAction';
|
|
2
|
+
export interface SpeechRecognitionStartMessage {
|
|
3
|
+
action: SpeechRecognitionMessageAction.startTranscription;
|
|
4
|
+
channels: number;
|
|
5
|
+
sampleRate: number;
|
|
6
|
+
interimResults: boolean;
|
|
7
|
+
lang: string;
|
|
8
|
+
phrases: string;
|
|
9
|
+
}
|
|
@@ -72,4 +72,13 @@ export interface UneeqOptions {
|
|
|
72
72
|
* Locales should be separated with a colon e.g "en-US:en-GB:en-AU".
|
|
73
73
|
*/
|
|
74
74
|
speechToTextLocales?: string;
|
|
75
|
+
/**
|
|
76
|
+
* Override the asset base path.
|
|
77
|
+
*/
|
|
78
|
+
assetBasePath?: string;
|
|
79
|
+
/**
|
|
80
|
+
* Speech Recognition Hint Phrases
|
|
81
|
+
* A space-separated list of phrases the speech recognition system should expect.
|
|
82
|
+
*/
|
|
83
|
+
speechRecognitionHintPhrases?: string;
|
|
75
84
|
}
|
package/dist/src/uneeq.d.ts
CHANGED