uneeq-js 2.46.6 → 2.47.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,12 @@
5
5
  * @license MIT
6
6
  */
7
7
 
8
+ /*!
9
+ * ONNX Runtime Web v1.15.1
10
+ * Copyright (c) Microsoft Corporation. All rights reserved.
11
+ * Licensed under the MIT License.
12
+ */
13
+
8
14
  /*! ieee754. BSD-3-Clause License. Feross Aboukhadijeh <https://feross.org/opensource> */
9
15
 
10
16
  /*! queue-microtask. MIT License. Feross Aboukhadijeh <https://feross.org/opensource> */
@@ -0,0 +1,89 @@
1
+ import { Message } from './messages';
2
+ import { SpeechProbabilities } from './models';
3
+ export interface FrameProcessorOptions {
4
+ /** Threshold over which values returned by the Silero VAD model will be considered as positively indicating speech.
5
+ * The Silero VAD model is run on each frame. This number should be between 0 and 1.
6
+ */
7
+ positiveSpeechThreshold: number;
8
+ /** Threshold under which values returned by the Silero VAD model will be considered as indicating an absence of speech.
9
+ * Note that the creators of the Silero VAD have historically set this number at 0.15 less than `positiveSpeechThreshold`.
10
+ */
11
+ negativeSpeechThreshold: number;
12
+ /** After a VAD value under the `negativeSpeechThreshold` is observed, the algorithm will wait `redemptionFrames` frames
13
+ * before running `onSpeechEnd`. If the model returns a value over `positiveSpeechThreshold` during this grace period, then
14
+ * the algorithm will consider the previously-detected "speech end" as having been a false negative.
15
+ */
16
+ redemptionFrames: number;
17
+ /** Number of audio samples (under a sample rate of 16000) to comprise one "frame" to feed to the Silero VAD model.
18
+ * The `frame` serves as a unit of measurement of lengths of audio segments and many other parameters are defined in terms of
19
+ * frames. The authors of the Silero VAD model offer the following warning:
20
+ * > WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and
21
+ * 256, 512, 768 samples for 8000 sample rate.
22
+ * > Values other than these may affect model performance!!
23
+ * In this context, audio fed to the VAD model always has sample rate 16000. It is probably a good idea to leave this at 1536.
24
+ */
25
+ frameSamples: number;
26
+ /** Number of frames to prepend to the audio segment that will be passed to `onSpeechEnd`. */
27
+ preSpeechPadFrames: number;
28
+ /** If an audio segment is detected as a speech segment according to the initial algorithm but has fewer than `minSpeechFrames` frames,
29
+ * it will be discarded and `onVADMisfire` will be run instead of `onSpeechEnd`.
30
+ */
31
+ minSpeechFrames: number;
32
+ }
33
+ export declare const defaultFrameProcessorOptions: FrameProcessorOptions;
34
+ export declare function validateOptions(options: FrameProcessorOptions): void;
35
+ export interface FrameProcessorInterface {
36
+ resume: () => void;
37
+ process: (arr: Float32Array) => Promise<{
38
+ probs?: SpeechProbabilities;
39
+ msg?: Message;
40
+ audio?: Float32Array;
41
+ }>;
42
+ endSegment: () => {
43
+ msg?: Message;
44
+ audio?: Float32Array;
45
+ };
46
+ }
47
+ export declare class FrameProcessor implements FrameProcessorInterface {
48
+ modelProcessFunc: (frame: Float32Array) => Promise<SpeechProbabilities>;
49
+ modelResetFunc: () => any;
50
+ options: FrameProcessorOptions;
51
+ speaking: boolean;
52
+ audioBuffer: Array<{
53
+ frame: Float32Array;
54
+ isSpeech: boolean;
55
+ }>;
56
+ redemptionCounter: number;
57
+ active: boolean;
58
+ constructor(modelProcessFunc: (frame: Float32Array) => Promise<SpeechProbabilities>, modelResetFunc: () => any, options: FrameProcessorOptions);
59
+ reset: () => void;
60
+ pause: () => void;
61
+ resume: () => void;
62
+ endSegment: () => {
63
+ msg: Message;
64
+ audio: Float32Array;
65
+ } | {
66
+ msg: Message;
67
+ audio?: undefined;
68
+ } | {
69
+ msg?: undefined;
70
+ audio?: undefined;
71
+ };
72
+ process: (frame: Float32Array) => Promise<{
73
+ probs?: undefined;
74
+ msg?: undefined;
75
+ audio?: undefined;
76
+ } | {
77
+ probs: SpeechProbabilities;
78
+ msg: Message;
79
+ audio?: undefined;
80
+ } | {
81
+ probs: SpeechProbabilities;
82
+ msg: Message;
83
+ audio: Float32Array;
84
+ } | {
85
+ probs: SpeechProbabilities;
86
+ msg?: undefined;
87
+ audio?: undefined;
88
+ }>;
89
+ }
@@ -0,0 +1,6 @@
1
+ export declare enum Message {
2
+ AudioFrame = "AUDIO_FRAME",
3
+ SpeechStart = "SPEECH_START",
4
+ VADMisfire = "VAD_MISFIRE",
5
+ SpeechEnd = "SPEECH_END"
6
+ }
@@ -0,0 +1,23 @@
1
+ export type ONNXRuntimeAPI = any;
2
+ export type ModelFetcher = () => Promise<ArrayBuffer>;
3
+ export interface SpeechProbabilities {
4
+ notSpeech: number;
5
+ isSpeech: number;
6
+ }
7
+ export interface Model {
8
+ reset_state: () => void;
9
+ process: (arr: Float32Array) => Promise<SpeechProbabilities>;
10
+ }
11
+ export declare class Silero {
12
+ private ort;
13
+ private modelFetcher;
14
+ static new: (ort: ONNXRuntimeAPI, modelFetcher: ModelFetcher) => Promise<Silero>;
15
+ _session: any;
16
+ _h: any;
17
+ _c: any;
18
+ _sr: any;
19
+ constructor(ort: ONNXRuntimeAPI, modelFetcher: ModelFetcher);
20
+ init: () => Promise<void>;
21
+ reset_state: () => void;
22
+ process: (audioFrame: Float32Array) => Promise<SpeechProbabilities>;
23
+ }
@@ -0,0 +1,61 @@
1
+ /**
2
+ * This VAD module is a fork from the following project:
3
+ * https://github.com/ricky0123/vad/tree/master/packages/web
4
+ *
5
+ * It has been modified to only include what is required for our
6
+ * own VAD implementation.
7
+ *
8
+ * This module is a wrapper to bring the following ML Model into
9
+ * JS: https://github.com/snakers4/silero-vad
10
+ */
11
+ import { FrameProcessor, FrameProcessorOptions } from './frame-processor';
12
+ import { SpeechProbabilities } from './models';
13
+ interface RealTimeVADCallbacks {
14
+ /** Callback to run after each frame. The size (number of samples) of a frame is given by `frameSamples`. */
15
+ onFrameProcessed: (probabilities: SpeechProbabilities) => any;
16
+ /** Callback to run if speech start was detected but `onSpeechEnd` will not be run because the
17
+ * audio segment is smaller than `minSpeechFrames`.
18
+ */
19
+ onVADMisfire: () => any;
20
+ /** Callback to run when speech start is detected */
21
+ onSpeechStart: () => any;
22
+ /**
23
+ * Callback to run when speech end is detected.
24
+ * Takes as arg a Float32Array of audio samples between -1 and 1, sample rate 16000.
25
+ * This will not run if the audio segment is smaller than `minSpeechFrames`.
26
+ */
27
+ onSpeechEnd: (audio: Float32Array) => any;
28
+ }
29
+ interface RealTimeVADOptionsWithStream extends FrameProcessorOptions, RealTimeVADCallbacks {
30
+ stream: MediaStream;
31
+ assetBasePath: string;
32
+ }
33
+ export type RealTimeVADOptions = RealTimeVADOptionsWithStream;
34
+ export declare const defaultRealTimeVADOptions: RealTimeVADOptions;
35
+ export declare class MicVAD {
36
+ options: RealTimeVADOptions;
37
+ static new(options?: Partial<RealTimeVADOptions>): Promise<MicVAD>;
38
+ audioContext: AudioContext;
39
+ stream: MediaStream;
40
+ audioNodeVAD: AudioNodeVAD;
41
+ listening: boolean;
42
+ constructor(options: RealTimeVADOptions);
43
+ init: () => Promise<void>;
44
+ pause: () => void;
45
+ start: () => void;
46
+ }
47
+ export declare class AudioNodeVAD {
48
+ ctx: AudioContext;
49
+ options: RealTimeVADOptions;
50
+ static new(ctx: AudioContext, options?: Partial<RealTimeVADOptions>): Promise<AudioNodeVAD>;
51
+ frameProcessor: FrameProcessor;
52
+ entryNode: AudioNode;
53
+ constructor(ctx: AudioContext, options: RealTimeVADOptions);
54
+ pause: () => void;
55
+ start: () => void;
56
+ receive: (node: AudioNode) => void;
57
+ processFrame: (frame: Float32Array) => Promise<void>;
58
+ init: () => Promise<void>;
59
+ modelFetcher: () => Promise<ArrayBuffer>;
60
+ }
61
+ export {};
@@ -0,0 +1,24 @@
1
+ import { SpeechHandlerOptions } from '../types/SpeechHandlerOptions';
2
+ export declare class SpeechHandler {
3
+ private options;
4
+ private ws;
5
+ private mediaRecorder;
6
+ private logPrefix;
7
+ private speechBuffer;
8
+ private micVad;
9
+ private recordingLive;
10
+ private headerBlob;
11
+ constructor(options: SpeechHandlerOptions);
12
+ private initWebsocket;
13
+ private loadSavedAudioHeaders;
14
+ private initVoiceActivityDetection;
15
+ private onVadSpeechStart;
16
+ private onVadSpeechEnd;
17
+ private handleWebsocketMessage;
18
+ private sendStartMessage;
19
+ private sendStopMessage;
20
+ private initMicrophone;
21
+ private initMediaRecorder;
22
+ private mediaRecorderOnData;
23
+ private handleTranscriptionResult;
24
+ }
@@ -1,5 +1,6 @@
1
1
  import { ClientPerformanceStats } from './ClientPerformanceStats';
2
2
  import { PublishSubscribeState } from './PublishSubscribeState';
3
+ import { SpeechTranscriptionResult } from './SpeechTranscriptionResult';
3
4
  import { Devices } from './types';
4
5
  export interface UneeqMessage {
5
6
  uneeqMessageType: UneeqMessageType;
@@ -32,6 +33,9 @@ export declare enum UneeqMessageType {
32
33
  FinishedSpeaking = "FinishedSpeaking",
33
34
  AvatarTextInputFinished = "AvatarTextInputFinished",
34
35
  AvatarQuestionText = "AvatarQuestionText",
36
+ SpeechTranscription = "SpeechTranscription",
37
+ UserStartedSpeaking = "UserStartedSpeaking",
38
+ UserStoppedSpeaking = "UserStoppedSpeaking",
35
39
  DevicePermissionAllowed = "DevicePermissionAllowed",
36
40
  RecordingStarted = "RecordingStarted",
37
41
  RecordingStopped = "RecordingStopped",
@@ -243,6 +247,28 @@ export declare class AvatarQuestionMessage implements UneeqMessage {
243
247
  uneeqMessageType: UneeqMessageType;
244
248
  constructor(question: any, transcriptId: string);
245
249
  }
250
+ /**
251
+ * A transcription result produced by speech recognition (see `SpeechTranscriptionResult`)
252
+ */
253
+ export declare class SpeechTranscriptionMessage implements UneeqMessage {
254
+ readonly speechTranscription: SpeechTranscriptionResult;
255
+ uneeqMessageType: UneeqMessageType;
256
+ constructor(speechTranscription: SpeechTranscriptionResult);
257
+ }
258
+ /**
259
+ * The user has started speaking
260
+ */
261
+ export declare class UserStartedSpeakingMessage implements UneeqMessage {
262
+ uneeqMessageType: UneeqMessageType;
263
+ constructor();
264
+ }
265
+ /**
266
+ * The user has stopped speaking
267
+ */
268
+ export declare class UserStoppedSpeakingMessage implements UneeqMessage {
269
+ uneeqMessageType: UneeqMessageType;
270
+ constructor();
271
+ }
246
272
  /**
247
273
  * Digital human has started speaking the message
248
274
  */
@@ -0,0 +1,11 @@
1
+ import { Subject } from 'rxjs';
2
+ import { API } from '../service/api';
3
+ export interface SpeechHandlerOptions {
4
+ apiUrl: string;
5
+ sessionId: string;
6
+ api: API;
7
+ messages: Subject<any>;
8
+ assetBasePath?: string;
9
+ hintPhrases: string;
10
+ locales: string;
11
+ }
@@ -0,0 +1,4 @@
1
+ export declare enum SpeechRecognitionMessageAction {
2
+ startTranscription = "startTranscription",
3
+ stopTranscription = "stopTranscription"
4
+ }
@@ -0,0 +1,9 @@
1
+ import { SpeechRecognitionMessageAction } from './SpeechRecognitionMessageAction';
2
+ export interface SpeechRecognitionStartMessage {
3
+ action: SpeechRecognitionMessageAction.startTranscription;
4
+ channels: number;
5
+ sampleRate: number;
6
+ interimResults: boolean;
7
+ lang: string;
8
+ phrases: string;
9
+ }
@@ -0,0 +1,4 @@
1
+ import { SpeechRecognitionMessageAction } from './SpeechRecognitionMessageAction';
2
+ export interface SpeechRecognitionStopMessage {
3
+ action: SpeechRecognitionMessageAction.stopTranscription;
4
+ }
@@ -0,0 +1,6 @@
1
+ export interface SpeechTranscriptionResult {
2
+ confidence: number;
3
+ final: boolean;
4
+ language_code: string;
5
+ transcript: string;
6
+ }
@@ -72,4 +72,13 @@ export interface UneeqOptions {
72
72
  * Locales should be separated with a colon e.g "en-US:en-GB:en-AU".
73
73
  */
74
74
  speechToTextLocales?: string;
75
+ /**
76
+ * Override the asset base path.
77
+ */
78
+ assetBasePath?: string;
79
+ /**
80
+ * Speech Recognition Hint Phrases
81
+ * A space-separated list of phrases the speech recognition system should expect.
82
+ */
83
+ speechRecognitionHintPhrases?: string;
75
84
  }
@@ -22,6 +22,7 @@ export declare class Uneeq {
22
22
  private messageHandler;
23
23
  private voiceInputManager;
24
24
  private unavailableAvatarMsgSent;
25
+ private speechHandler;
25
26
  private readonly options;
26
27
  private readonly messageCallback;
27
28
  private readonly startSessionData;