ugly-app 0.1.116 → 0.1.117

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/dist/cli/version.d.ts +1 -1
  2. package/dist/cli/version.js +1 -1
  3. package/dist/server/audio/stt/AudioStreamProcessor.d.ts +226 -0
  4. package/dist/server/audio/stt/AudioStreamProcessor.d.ts.map +1 -0
  5. package/dist/server/audio/stt/AudioStreamProcessor.js +1023 -0
  6. package/dist/server/audio/stt/AudioStreamProcessor.js.map +1 -0
  7. package/dist/server/audio/tts/LipSyncJa.d.ts +19 -0
  8. package/dist/server/audio/tts/LipSyncJa.d.ts.map +1 -0
  9. package/dist/server/audio/tts/LipSyncJa.js +336 -0
  10. package/dist/server/audio/tts/LipSyncJa.js.map +1 -0
  11. package/dist/server/audio/tts/LipSyncZh.d.ts +19 -0
  12. package/dist/server/audio/tts/LipSyncZh.d.ts.map +1 -0
  13. package/dist/server/audio/tts/LipSyncZh.js +203 -0
  14. package/dist/server/audio/tts/LipSyncZh.js.map +1 -0
  15. package/dist/server/audio/tts/TextToSpeech.d.ts +78 -0
  16. package/dist/server/audio/tts/TextToSpeech.d.ts.map +1 -0
  17. package/dist/server/audio/tts/TextToSpeech.js +530 -0
  18. package/dist/server/audio/tts/TextToSpeech.js.map +1 -0
  19. package/dist/server/audio/tts/TextToSpeechStream.d.ts +77 -0
  20. package/dist/server/audio/tts/TextToSpeechStream.d.ts.map +1 -0
  21. package/dist/server/audio/tts/TextToSpeechStream.js +691 -0
  22. package/dist/server/audio/tts/TextToSpeechStream.js.map +1 -0
  23. package/dist/server/audio/voice/index.d.ts +8 -0
  24. package/dist/server/audio/voice/index.d.ts.map +1 -0
  25. package/dist/server/audio/voice/index.js +200 -0
  26. package/dist/server/audio/voice/index.js.map +1 -0
  27. package/dist/server/index.d.ts +8 -0
  28. package/dist/server/index.d.ts.map +1 -1
  29. package/dist/server/index.js +5 -0
  30. package/dist/server/index.js.map +1 -1
  31. package/package.json +1 -1
  32. package/src/cli/version.ts +1 -1
  33. package/src/server/audio/stt/AudioStreamProcessor.ts +1390 -0
  34. package/src/server/audio/tts/LipSyncZh.ts +224 -0
  35. package/src/server/audio/tts/TextToSpeech.ts +851 -0
  36. package/src/server/audio/tts/TextToSpeechStream.ts +1091 -0
  37. package/src/server/audio/voice/index.ts +310 -0
  38. package/src/server/index.ts +24 -0
@@ -1,2 +1,2 @@
1
- export declare const CLI_VERSION = "0.1.115";
1
+ export declare const CLI_VERSION = "0.1.117";
2
2
  //# sourceMappingURL=version.d.ts.map
@@ -1,3 +1,3 @@
1
1
  // Auto-generated by prebuild — do not edit manually
2
- export const CLI_VERSION = "0.1.115";
2
+ export const CLI_VERSION = "0.1.117";
3
3
  //# sourceMappingURL=version.js.map
@@ -0,0 +1,226 @@
1
+ import type { STTCorrectionEvent, STTMuteEvent, VADStatus } from '../../../shared/Audio.js';
2
+ import type { STTProviderPriority, STTRequiredFeatures } from '../../../shared/Audio.js';
3
+ type STTForcedModel = 'auto' | 'deepgram' | 'whisper' | 'groq_whisper';
4
+ type UserLang = string;
5
+ type UserLangAutoT = string;
6
+ type STTMode = 'realtime' | 'batch' | 'auto';
7
+ export interface AudioStreamConfig {
8
+ userId: string;
9
+ lang: UserLangAutoT;
10
+ mode: STTMode;
11
+ conversationId: string | null;
12
+ vadEnabled: boolean;
13
+ priority?: STTProviderPriority;
14
+ forcedModel?: STTForcedModel;
15
+ diarizeEnabled?: boolean;
16
+ noiseGateEnabled?: boolean;
17
+ serverNoiseGateEnabled?: boolean;
18
+ interimWhisperEnabled?: boolean;
19
+ interimWhisperIntervalMs?: number;
20
+ onTranscript: (text: string, isFinal: boolean, lang: UserLang) => void;
21
+ onCorrection?: (event: STTCorrectionEvent) => void;
22
+ onSpeechStart?: () => void;
23
+ onSpeechEnd?: () => void;
24
+ onVAD?: (probability: number, status: VADStatus) => void;
25
+ onDiarization?: (segmentCount: number, primarySpeaker: number, activeSpeaker: number | null) => void;
26
+ onMuteChange?: (event: STTMuteEvent) => void;
27
+ onProcessedAudio?: (samples: Float32Array) => void;
28
+ onError?: (error: Error) => void;
29
+ selectProvider?: (abstractModel: string, priority: STTProviderPriority, requiredFeatures?: STTRequiredFeatures, reserved?: undefined, durationMs?: number) => Promise<{
30
+ offering: {
31
+ provider: string;
32
+ providerModel: string;
33
+ };
34
+ reason: string;
35
+ } | null>;
36
+ createDeepgramProvider?: (config: {
37
+ lang: string;
38
+ userId: string;
39
+ conversationId: string | null;
40
+ onTranscript: (text: string, isFinal: boolean) => void;
41
+ onError: (error: Error) => void;
42
+ onClose?: () => void;
43
+ onSpeechFinal?: () => void;
44
+ }) => {
45
+ connect: () => Promise<void>;
46
+ send: (samples: Float32Array) => void;
47
+ close: () => void;
48
+ isConnected: () => boolean;
49
+ };
50
+ createWhisperProvider?: (userId: string) => {
51
+ transcribe: (samples: Float32Array, lang: string, conversationId: string | null) => Promise<{
52
+ text: string;
53
+ lang: string;
54
+ }>;
55
+ };
56
+ createGroqWhisperProvider?: (userId: string) => {
57
+ transcribe: (samples: Float32Array, lang: string, conversationId: string | null) => Promise<{
58
+ text: string;
59
+ lang: string;
60
+ }>;
61
+ };
62
+ }
63
+ /**
64
+ * Main orchestrator for server-side speech-to-text processing.
65
+ *
66
+ * This class coordinates:
67
+ * - Audio resampling (48kHz -> 16kHz)
68
+ * - Voice Activity Detection (Silero VAD)
69
+ * - Provider routing based on priority (price, speed, quality, multilingual)
70
+ * - Transcript delivery via callbacks
71
+ */
72
+ export declare class AudioStreamProcessor {
73
+ private config;
74
+ private vad;
75
+ private resampler;
76
+ private segmentation;
77
+ private diarizeEnabled;
78
+ private noiseGateEnabled;
79
+ private serverNoiseGateEnabled;
80
+ private lastMuteState;
81
+ private deepgramProvider;
82
+ private noVadStreamingStarted;
83
+ private noVadBuffer;
84
+ private vadState;
85
+ private speakerHistory;
86
+ private readonly MIN_DURATION_TO_ESTABLISH;
87
+ private readonly TAKEOVER_ENERGY_MULTIPLIER;
88
+ private diarizationSilenceState;
89
+ private readonly DIARIZATION_CHECK_INTERVAL_MS;
90
+ private readonly DIARIZATION_SILENCE_THRESHOLD_MS;
91
+ private readonly vadChunkSize;
92
+ private vadRemainderBuffer;
93
+ private interimWhisper;
94
+ private readonly START_THRESHOLD;
95
+ private readonly STOP_THRESHOLD;
96
+ private readonly MIN_SPEECH_FRAMES;
97
+ private readonly SILENCE_FRAMES_TO_STOP;
98
+ private readonly PRE_ROLL_FRAMES;
99
+ private readonly MAX_UTTERANCE_FRAMES;
100
+ constructor(config: AudioStreamConfig);
101
+ private audioChunkCount;
102
+ private lastAudioLogTime;
103
+ /**
104
+ * Initialize the processor (loads VAD model if enabled)
105
+ */
106
+ init(): Promise<void>;
107
+ /**
108
+ * Start streaming without VAD (for bypass mode)
109
+ */
110
+ private startNoVadStreaming;
111
+ /**
112
+ * Process incoming audio samples from the client.
113
+ * Audio is expected at 48kHz mono Float32.
114
+ *
115
+ * @param samples48k PCM samples at 48kHz
116
+ */
117
+ processAudio(samples48k: Float32Array): Promise<void>;
118
+ /**
119
+ * Process audio without VAD - stream directly to provider
120
+ */
121
+ private processNoVadAudio;
122
+ private maxSpeechProb;
123
+ private lastVadLogTime;
124
+ /**
125
+ * Process a single VAD chunk and manage state machine transitions
126
+ */
127
+ private processVADChunk;
128
+ /**
129
+ * Start a new speech segment
130
+ */
131
+ private startSpeech;
132
+ /**
133
+ * End the current speech segment
134
+ */
135
+ private endSpeech;
136
+ /**
137
+ * Transcribe audio using the best available batch provider based on priority
138
+ */
139
+ private transcribeBatch;
140
+ /**
141
+ * Transcribe audio with a specific provider
142
+ */
143
+ private transcribeWithProvider;
144
+ /**
145
+ * Stream audio chunk to the active provider
146
+ */
147
+ private streamToProvider;
148
+ /**
149
+ * Determine the effective mode based on configuration, language, and priority
150
+ */
151
+ private getEffectiveMode;
152
+ /**
153
+ * Reset VAD state to idle
154
+ */
155
+ private resetToIdle;
156
+ /**
157
+ * Reset diarization silence detection state
158
+ */
159
+ private resetDiarizationSilenceState;
160
+ /**
161
+ * Reset interim Whisper state for a new speech segment
162
+ */
163
+ private resetInterimWhisperState;
164
+ /**
165
+ * Cancel any pending interim Whisper request
166
+ */
167
+ private cancelInterimWhisper;
168
+ /**
169
+ * Check if we should send an interim Whisper request
170
+ */
171
+ private shouldRunInterimWhisper;
172
+ /**
173
+ * Run interim Whisper transcription on accumulated audio
174
+ */
175
+ private runInterimWhisper;
176
+ /**
177
+ * Run full transcription in background and emit correction if different from interim
178
+ */
179
+ private runFullTranscriptionWithCorrection;
180
+ /**
181
+ * Emit mute change event if noise gate is enabled and state changed
182
+ */
183
+ private emitMuteChange;
184
+ /**
185
+ * Get recent audio from speech buffer for diarization analysis
186
+ */
187
+ private getRecentAudioBuffer;
188
+ /**
189
+ * Check if the primary speaker has stopped speaking (diarization-based silence)
190
+ * Returns true if we should trigger speech end due to primary speaker silence
191
+ */
192
+ private checkDiarizationSilence;
193
+ /**
194
+ * Concatenate multiple Float32Array buffers into one
195
+ */
196
+ private concatenateBuffers;
197
+ /**
198
+ * Calculate RMS energy for a segment of audio
199
+ */
200
+ private calculateSegmentEnergy;
201
+ /**
202
+ * Update speaker stats and select the user speaker with takeover logic
203
+ */
204
+ private updateAndSelectUserSpeaker;
205
+ /**
206
+ * Filter audio to keep only the established user speaker.
207
+ * Uses energy-based tracking to establish and maintain user identity.
208
+ */
209
+ private filterToPrimarySpeaker;
210
+ /**
211
+ * Force end the current utterance immediately — used for push-to-talk on button release.
212
+ * In no-VAD batch mode, transcribes the accumulated noVadBuffer right now.
213
+ * In VAD mode, triggers endSpeech() if currently speaking.
214
+ */
215
+ forceEndUtterance(): Promise<void>;
216
+ /**
217
+ * Clean up resources and finalize any pending transcriptions
218
+ */
219
+ destroy(): Promise<void>;
220
+ /**
221
+ * Get current VAD status
222
+ */
223
+ getStatus(): VADStatus;
224
+ }
225
+ export {};
226
+ //# sourceMappingURL=AudioStreamProcessor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"AudioStreamProcessor.d.ts","sourceRoot":"","sources":["../../../../src/server/audio/stt/AudioStreamProcessor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,kBAAkB,EAClB,YAAY,EACZ,SAAS,EACV,MAAM,0BAA0B,CAAC;AAClC,OAAO,KAAK,EAAE,mBAAmB,EAAE,mBAAmB,EAAE,MAAM,0BAA0B,CAAC;AAQzF,KAAK,cAAc,GAAG,MAAM,GAAG,UAAU,GAAG,SAAS,GAAG,cAAc,CAAC;AACvE,KAAK,QAAQ,GAAG,MAAM,CAAC;AACvB,KAAK,aAAa,GAAG,MAAM,CAAC;AAC5B,KAAK,OAAO,GAAG,UAAU,GAAG,OAAO,GAAG,MAAM,CAAC;AAI7C,MAAM,WAAW,iBAAiB;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,aAAa,CAAC;IACpB,IAAI,EAAE,OAAO,CAAC;IACd,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,UAAU,EAAE,OAAO,CAAC;IACpB,QAAQ,CAAC,EAAE,mBAAmB,CAAC;IAE/B,WAAW,CAAC,EAAE,cAAc,CAAC;IAC7B,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,sBAAsB,CAAC,EAAE,OAAO,CAAC;IACjC,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC,wBAAwB,CAAC,EAAE,MAAM,CAAC;IAClC,YAAY,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,KAAK,IAAI,CAAC;IACvE,YAAY,CAAC,EAAE,CAAC,KAAK,EAAE,kBAAkB,KAAK,IAAI,CAAC;IACnD,aAAa,CAAC,EAAE,MAAM,IAAI,CAAC;IAC3B,WAAW,CAAC,EAAE,MAAM,IAAI,CAAC;IACzB,KAAK,CAAC,EAAE,CAAC,WAAW,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,KAAK,IAAI,CAAC;IACzD,aAAa,CAAC,EAAE,CACd,YAAY,EAAE,MAAM,EACpB,cAAc,EAAE,MAAM,EACtB,aAAa,EAAE,MAAM,GAAG,IAAI,KACzB,IAAI,CAAC;IACV,YAAY,CAAC,EAAE,CAAC,KAAK,EAAE,YAAY,KAAK,IAAI,CAAC;IAC7C,gBAAgB,CAAC,EAAE,CAAC,OAAO,EAAE,YAAY,KAAK,IAAI,CAAC;IACnD,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE,KAAK,KAAK,IAAI,CAAC;IAGjC,cAAc,CAAC,EAAE,CAAC,aAAa,EAAE,MAAM,EAAE,QAAQ,EAAE,mBAAmB,EAAE,gBAAgB,CAAC,EAAE,mBAAmB,EAAE,QAAQ,CAAC,EAAE,SAAS,EAAE,UAAU,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC;QAAE,QAAQ,EAAE;YAAE,QAAQ,EAAE,MAAM,CAAC;YAAC,aAAa,EAAE,MAAM,CAAA;SAAE,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,CAAC,CAAC;IACxP,sBAAsB,CAAC,EAAE,CAAC,MAAM,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;QAAC,YAAY,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,KAAK,IAAI,CAAC;QAAC,OAAO,EAAE,CAAC,KAAK,EAAE,KAAK,KAAK,IAAI,CAAC;QAAC,OAAO,CAAC,EAAE,MAAM,IAAI,CAAC;QAAC,aAAa,C
AAC,EAAE,MAAM,IAAI,CAAA;KAAE,KAAK;QAAE,OAAO,EAAE,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;QAAC,IAAI,EAAE,CAAC,OAAO,EAAE,YAAY,KAAK,IAAI,CAAC;QAAC,KAAK,EAAE,MAAM,IAAI,CAAC;QAAC,WAAW,EAAE,MAAM,OAAO,CAAA;KAAE,CAAC;IACxW,qBAAqB,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK;QAAE,UAAU,EAAE,CAAC,OAAO,EAAE,YAAY,EAAE,IAAI,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,GAAG,IAAI,KAAK,OAAO,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,IAAI,EAAE,MAAM,CAAA;SAAE,CAAC,CAAA;KAAE,CAAC;IAC5K,yBAAyB,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,KAAK;QAAE,UAAU,EAAE,CAAC,OAAO,EAAE,YAAY,EAAE,IAAI,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,GAAG,IAAI,KAAK,OAAO,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,IAAI,EAAE,MAAM,CAAA;SAAE,CAAC,CAAA;KAAE,CAAC;CACjL;AAiBD;;;;;;;;GAQG;AACH,qBAAa,oBAAoB;IA8EnB,OAAO,CAAC,MAAM;IA7E1B,OAAO,CAAC,GAAG,CAAY;IACvB,OAAO,CAAC,SAAS,CAAY;IAC7B,OAAO,CAAC,YAAY,CAAuB;IAC3C,OAAO,CAAC,cAAc,CAAU;IAChC,OAAO,CAAC,gBAAgB,CAAU;IAClC,OAAO,CAAC,sBAAsB,CAAU;IACxC,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,gBAAgB,CAAyC;IACjE,OAAO,CAAC,qBAAqB,CAAS;IACtC,OAAO,CAAC,WAAW,CAAsB;IAEzC,OAAO,CAAC,QAAQ,CAMd;IAGF,OAAO,CAAC,cAAc,CAepB;IAGF,OAAO,CAAC,QAAQ,CAAC,yBAAyB,CAAO;IACjD,OAAO,CAAC,QAAQ,CAAC,0BAA0B,CAAO;IAGlD,OAAO,CAAC,uBAAuB,CAM7B;IAGF,OAAO,CAAC,QAAQ,CAAC,6BAA6B,CAAO;IACrD,OAAO,CAAC,QAAQ,CAAC,gCAAgC,CAAO;IACxD,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAO;IACpC,OAAO,CAAC,kBAAkB,CAAqC;IAG/D,OAAO,CAAC,cAAc,CAUN;IAGhB,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAS;IACzC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAS;IACxC,OAAO,CAAC,QAAQ,CAAC,iBAAiB,CAAS;IAC3C,OAAO,CAAC,QAAQ,CAAC,sBAAsB,CAAS;IAChD,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAS;IACzC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAS;gBAE1B,MAAM,EAAE,iBAAiB;IA4C7C,OAAO,CAAC,eAAe,CAAK;IAC5B,OAAO,CAAC,gBAAgB,CAAK;IAE7B;;OAEG;IACG,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAoC3B;;OAEG;YACW,mBAAmB;IAmCjC;;;;;OAKG;IACG,YAAY,CAAC,UAAU,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC;IA8E3D;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAczB,OAAO,CAAC,aAAa,CAAK;IAC1B,OAAO,CAAC,cAAc,CAAK;IAE3B;;OAEG;YACW,eAAe;IA8H7B;;OAEG;YACW,WAAW;IA2DzB;;OAEG;YACW,SAAS;IA8DvB;;OAEG;YACW,eAAe;IAyD7B;;OAEG;YACW,sBAAsB;IA6CpC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAMxB;;OAEG;IAC
H,OAAO,CAAC,gBAAgB;IA6BxB;;OAEG;IACH,OAAO,CAAC,WAAW;IAWnB;;OAEG;IACH,OAAO,CAAC,4BAA4B;IAOpC;;OAEG;IACH,OAAO,CAAC,wBAAwB;IAShC;;OAEG;IACH,OAAO,CAAC,oBAAoB;IAO5B;;OAEG;IACH,OAAO,CAAC,uBAAuB;IA6C/B;;OAEG;YACW,iBAAiB;IAgE/B;;OAEG;YACW,kCAAkC;IAiChD;;OAEG;IACH,OAAO,CAAC,cAAc;IAWtB;;OAEG;IACH,OAAO,CAAC,oBAAoB;IAuB5B;;;OAGG;YACW,uBAAuB;IAgErC;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAW1B;;OAEG;IACH,OAAO,CAAC,sBAAsB;IAsB9B;;OAEG;IACH,OAAO,CAAC,0BAA0B;IAwGlC;;;OAGG;YACW,sBAAsB;IAyEpC;;;;OAIG;IACG,iBAAiB,IAAI,OAAO,CAAC,IAAI,CAAC;IA0BxC;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IA+B9B;;OAEG;IACH,SAAS,IAAI,SAAS;CAGvB"}