@speechos/core 0.2.9 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/backend.d.cts +1 -16
- package/dist/backend.d.ts +1 -16
- package/dist/index.cjs +236 -853
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -3
- package/dist/index.d.ts +4 -3
- package/dist/index.js +234 -831
- package/dist/index.js.map +1 -1
- package/dist/speechos.d.cts +10 -39
- package/dist/speechos.d.ts +10 -39
- package/dist/state.d.cts +11 -1
- package/dist/state.d.ts +11 -1
- package/dist/tts.d.cts +74 -0
- package/dist/tts.d.ts +74 -0
- package/dist/types.d.cts +42 -22
- package/dist/types.d.ts +42 -22
- package/dist/websocket.d.cts +5 -4
- package/dist/websocket.d.ts +5 -4
- package/package.json +2 -5
- package/dist/livekit.d.cts +0 -199
- package/dist/livekit.d.ts +0 -199
package/dist/index.js
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import { Room, RoomEvent, Track, createLocalAudioTrack } from "livekit-client";
|
|
2
|
-
|
|
3
1
|
//#region src/config.ts
|
|
4
2
|
/**
|
|
5
3
|
* Default host - can be overridden by SPEECHOS_HOST env var at build time
|
|
@@ -187,6 +185,8 @@ const initialState = {
|
|
|
187
185
|
isMicEnabled: false,
|
|
188
186
|
activeAction: null,
|
|
189
187
|
focusedElement: null,
|
|
188
|
+
selectionText: null,
|
|
189
|
+
selectionElement: null,
|
|
190
190
|
recordingState: "idle",
|
|
191
191
|
errorMessage: null
|
|
192
192
|
};
|
|
@@ -288,6 +288,26 @@ var StateManager = class {
|
|
|
288
288
|
this.setState({ focusedElement: element });
|
|
289
289
|
}
|
|
290
290
|
/**
|
|
291
|
+
* Set the current text selection
|
|
292
|
+
* @param text - Selected text (null to clear)
|
|
293
|
+
* @param element - Element associated with selection
|
|
294
|
+
*/
|
|
295
|
+
setSelection(text, element) {
|
|
296
|
+
this.setState({
|
|
297
|
+
selectionText: text,
|
|
298
|
+
selectionElement: element
|
|
299
|
+
});
|
|
300
|
+
}
|
|
301
|
+
/**
|
|
302
|
+
* Clear the current text selection
|
|
303
|
+
*/
|
|
304
|
+
clearSelection() {
|
|
305
|
+
this.setState({
|
|
306
|
+
selectionText: null,
|
|
307
|
+
selectionElement: null
|
|
308
|
+
});
|
|
309
|
+
}
|
|
310
|
+
/**
|
|
291
311
|
* Set the active action
|
|
292
312
|
* @param action - The action to set as active
|
|
293
313
|
*/
|
|
@@ -303,7 +323,7 @@ var StateManager = class {
|
|
|
303
323
|
}
|
|
304
324
|
/**
|
|
305
325
|
* Set the connection state
|
|
306
|
-
* @param isConnected - Whether connected to
|
|
326
|
+
* @param isConnected - Whether connected to the backend
|
|
307
327
|
*/
|
|
308
328
|
setConnected(isConnected) {
|
|
309
329
|
this.setState({ isConnected });
|
|
@@ -389,746 +409,6 @@ function createStateManager(initial) {
|
|
|
389
409
|
});
|
|
390
410
|
}
|
|
391
411
|
|
|
392
|
-
//#endregion
|
|
393
|
-
//#region src/livekit.ts
|
|
394
|
-
const MESSAGE_TYPE_REQUEST_TRANSCRIPT$1 = "request_transcript";
|
|
395
|
-
const MESSAGE_TYPE_TRANSCRIPT$1 = "transcript";
|
|
396
|
-
const MESSAGE_TYPE_EDIT_TEXT$1 = "edit_text";
|
|
397
|
-
const MESSAGE_TYPE_EDITED_TEXT$1 = "edited_text";
|
|
398
|
-
const MESSAGE_TYPE_EXECUTE_COMMAND$1 = "execute_command";
|
|
399
|
-
const MESSAGE_TYPE_COMMAND_RESULT$1 = "command_result";
|
|
400
|
-
const MESSAGE_TYPE_ERROR$1 = "error";
|
|
401
|
-
const TOPIC_SPEECHOS = "speechos";
|
|
402
|
-
const TOKEN_CACHE_TTL_MS = 4 * 60 * 1e3;
|
|
403
|
-
/**
 * A deferred promise with timeout support.
 *
 * Exposes the resolve/reject handles of a Promise on the object itself and
 * optionally arms a timer that rejects the promise if nothing settles it first.
 * Settlement is one-shot: after the first resolve/reject every later
 * resolve/reject/setTimeout call is a no-op.
 */
var Deferred = class {
  /** The underlying promise callers await. */
  promise;
  _resolve;
  _reject;
  /** Handle of the currently armed timeout timer, or null when none is armed. */
  _timeoutId = null;
  /** True once resolve() or reject() has taken effect. */
  _settled = false;
  constructor() {
    this.promise = new Promise((resolve, reject) => {
      this._resolve = resolve;
      this._reject = reject;
    });
  }
  /**
   * Arm (or re-arm) a timeout that rejects the promise with `errorMessage`.
   *
   * When the timer fires on a still-pending deferred, the error is logged,
   * an "error" event is emitted through `events` (expected to be the module's
   * event emitter, defined outside this block), and the promise is rejected.
   *
   * @param ms - Delay in milliseconds before the timeout fires
   * @param errorMessage - Message used for the log line, the event and the Error
   * @param errorCode - Code placed in the emitted error event
   * @param errorSource - Source placed in the emitted error event
   */
  setTimeout(ms, errorMessage, errorCode, errorSource) {
    // A settled deferred can never time out; do not leave a useless timer behind.
    if (this._settled) return;
    // Re-arming must replace the previous timer, otherwise the old one would
    // still fire and reject the promise earlier than the new deadline.
    this.clearTimeout();
    this._timeoutId = setTimeout(() => {
      this._timeoutId = null;
      if (this._settled) return;
      console.error(`[SpeechOS] Error: ${errorMessage} (${errorCode})`);
      events.emit("error", {
        code: errorCode,
        message: errorMessage,
        source: errorSource
      });
      this.reject(new Error(errorMessage));
    }, ms);
  }
  /**
   * Resolve the promise with `value` (ignored if already settled).
   */
  resolve(value) {
    if (this._settled) return;
    this._settled = true;
    this.clearTimeout();
    this._resolve(value);
  }
  /**
   * Reject the promise with `error` (ignored if already settled).
   */
  reject(error) {
    if (this._settled) return;
    this._settled = true;
    this.clearTimeout();
    this._reject(error);
  }
  /**
   * Cancel the armed timeout timer, if any.
   */
  clearTimeout() {
    if (this._timeoutId !== null) {
      clearTimeout(this._timeoutId);
      this._timeoutId = null;
    }
  }
  /** Whether the promise has already been resolved or rejected. */
  get isSettled() {
    return this._settled;
  }
};
|
|
459
|
-
/**
|
|
460
|
-
* LiveKit connection manager
|
|
461
|
-
*/
|
|
462
|
-
var LiveKitManager = class {
|
|
463
|
-
room = null;
|
|
464
|
-
tokenData = null;
|
|
465
|
-
micTrack = null;
|
|
466
|
-
cachedTokenData = null;
|
|
467
|
-
tokenCacheTimestamp = null;
|
|
468
|
-
tokenPrefetchPromise = null;
|
|
469
|
-
tokenRefreshTimer = null;
|
|
470
|
-
autoRefreshEnabled = false;
|
|
471
|
-
pendingTranscript = null;
|
|
472
|
-
pendingEditText = null;
|
|
473
|
-
pendingCommand = null;
|
|
474
|
-
pendingTrackSubscribed = null;
|
|
475
|
-
editOriginalText = null;
|
|
476
|
-
sessionSettings = {};
|
|
477
|
-
/**
|
|
478
|
-
* Check if the cached token is still valid (within TTL)
|
|
479
|
-
*/
|
|
480
|
-
isCachedTokenValid() {
|
|
481
|
-
if (!this.cachedTokenData || !this.tokenCacheTimestamp) return false;
|
|
482
|
-
const age = Date.now() - this.tokenCacheTimestamp;
|
|
483
|
-
return age < TOKEN_CACHE_TTL_MS;
|
|
484
|
-
}
|
|
485
|
-
/**
|
|
486
|
-
* Pre-fetch a LiveKit token for later use
|
|
487
|
-
* Call this early (e.g., when widget expands) to reduce latency when starting a voice session.
|
|
488
|
-
* If a prefetch is already in progress, returns the existing promise.
|
|
489
|
-
* If a valid cached token exists, returns it immediately.
|
|
490
|
-
*/
|
|
491
|
-
async prefetchToken() {
|
|
492
|
-
const config = getConfig();
|
|
493
|
-
if (this.isCachedTokenValid() && this.cachedTokenData) {
|
|
494
|
-
if (config.debug) console.log("[SpeechOS] Using cached token (prefetch hit)");
|
|
495
|
-
return this.cachedTokenData;
|
|
496
|
-
}
|
|
497
|
-
if (this.tokenPrefetchPromise) {
|
|
498
|
-
if (config.debug) console.log("[SpeechOS] Prefetch already in progress, awaiting...");
|
|
499
|
-
return this.tokenPrefetchPromise;
|
|
500
|
-
}
|
|
501
|
-
if (config.debug) console.log("[SpeechOS] Starting token prefetch...");
|
|
502
|
-
this.tokenPrefetchPromise = this.fetchTokenFromServer().then((data) => {
|
|
503
|
-
this.cachedTokenData = data;
|
|
504
|
-
this.tokenCacheTimestamp = Date.now();
|
|
505
|
-
this.tokenPrefetchPromise = null;
|
|
506
|
-
return data;
|
|
507
|
-
}).catch((error) => {
|
|
508
|
-
this.tokenPrefetchPromise = null;
|
|
509
|
-
throw error;
|
|
510
|
-
});
|
|
511
|
-
return this.tokenPrefetchPromise;
|
|
512
|
-
}
|
|
513
|
-
/**
|
|
514
|
-
* Fetch a LiveKit token from the backend
|
|
515
|
-
* Uses cached token if valid, otherwise fetches a fresh one.
|
|
516
|
-
* Includes language settings and user vocabulary which are stored in the VoiceSession.
|
|
517
|
-
*/
|
|
518
|
-
async fetchToken() {
|
|
519
|
-
const config = getConfig();
|
|
520
|
-
if (this.isCachedTokenValid() && this.cachedTokenData) {
|
|
521
|
-
if (config.debug) console.log("[SpeechOS] Using cached token");
|
|
522
|
-
this.tokenData = this.cachedTokenData;
|
|
523
|
-
return this.cachedTokenData;
|
|
524
|
-
}
|
|
525
|
-
if (this.tokenPrefetchPromise) {
|
|
526
|
-
if (config.debug) console.log("[SpeechOS] Waiting for prefetch to complete...");
|
|
527
|
-
const data$1 = await this.tokenPrefetchPromise;
|
|
528
|
-
this.tokenData = data$1;
|
|
529
|
-
return data$1;
|
|
530
|
-
}
|
|
531
|
-
const data = await this.fetchTokenFromServer();
|
|
532
|
-
this.cachedTokenData = data;
|
|
533
|
-
this.tokenCacheTimestamp = Date.now();
|
|
534
|
-
this.tokenData = data;
|
|
535
|
-
return data;
|
|
536
|
-
}
|
|
537
|
-
/**
|
|
538
|
-
* Internal method to fetch a fresh token from the server
|
|
539
|
-
*/
|
|
540
|
-
async fetchTokenFromServer() {
|
|
541
|
-
const config = getConfig();
|
|
542
|
-
const url = `${config.host}/livekit/api/token/`;
|
|
543
|
-
const settings = this.sessionSettings;
|
|
544
|
-
const inputLanguage = settings.inputLanguageCode ?? "en-US";
|
|
545
|
-
const outputLanguage = settings.outputLanguageCode ?? "en-US";
|
|
546
|
-
const smartFormat = settings.smartFormat ?? true;
|
|
547
|
-
const vocabulary = settings.vocabulary ?? [];
|
|
548
|
-
const snippets = settings.snippets ?? [];
|
|
549
|
-
if (config.debug) {
|
|
550
|
-
console.log("[SpeechOS] Fetching LiveKit token from:", url);
|
|
551
|
-
console.log("[SpeechOS] Session settings:", {
|
|
552
|
-
inputLanguage,
|
|
553
|
-
outputLanguage,
|
|
554
|
-
smartFormat,
|
|
555
|
-
snippetsCount: snippets.length,
|
|
556
|
-
vocabularyCount: vocabulary.length
|
|
557
|
-
});
|
|
558
|
-
}
|
|
559
|
-
const response = await fetch(url, {
|
|
560
|
-
method: "POST",
|
|
561
|
-
headers: {
|
|
562
|
-
"Content-Type": "application/json",
|
|
563
|
-
...config.apiKey ? { Authorization: `Api-Key ${config.apiKey}` } : {}
|
|
564
|
-
},
|
|
565
|
-
body: JSON.stringify({
|
|
566
|
-
user_id: config.userId || null,
|
|
567
|
-
input_language: inputLanguage,
|
|
568
|
-
output_language: outputLanguage,
|
|
569
|
-
smart_format: smartFormat,
|
|
570
|
-
custom_vocabulary: vocabulary,
|
|
571
|
-
custom_snippets: snippets
|
|
572
|
-
})
|
|
573
|
-
});
|
|
574
|
-
if (!response.ok) throw new Error(`Failed to fetch LiveKit token: ${response.status} ${response.statusText}`);
|
|
575
|
-
const data = await response.json();
|
|
576
|
-
if (config.debug) console.log("[SpeechOS] LiveKit token received:", {
|
|
577
|
-
room: data.room,
|
|
578
|
-
identity: data.identity,
|
|
579
|
-
ws_url: data.ws_url
|
|
580
|
-
});
|
|
581
|
-
return data;
|
|
582
|
-
}
|
|
583
|
-
/**
|
|
584
|
-
* Connect to a LiveKit room (fresh connection each time)
|
|
585
|
-
*/
|
|
586
|
-
async connect() {
|
|
587
|
-
const config = getConfig();
|
|
588
|
-
await this.fetchToken();
|
|
589
|
-
if (!this.tokenData) throw new Error("No token available for LiveKit connection");
|
|
590
|
-
this.room = new Room({
|
|
591
|
-
adaptiveStream: true,
|
|
592
|
-
dynacast: true
|
|
593
|
-
});
|
|
594
|
-
this.setupRoomEvents();
|
|
595
|
-
if (config.debug) console.log("[SpeechOS] Connecting to LiveKit room:", this.tokenData.room);
|
|
596
|
-
await this.room.connect(this.tokenData.ws_url, this.tokenData.token);
|
|
597
|
-
state.setConnected(true);
|
|
598
|
-
if (config.debug) console.log("[SpeechOS] Connected to LiveKit room:", this.room.name);
|
|
599
|
-
return this.room;
|
|
600
|
-
}
|
|
601
|
-
/**
|
|
602
|
-
* Wait until the agent is ready to receive audio
|
|
603
|
-
* Resolves when LocalTrackSubscribed event is received
|
|
604
|
-
*/
|
|
605
|
-
async waitUntilReady() {
|
|
606
|
-
if (!this.room || this.room.state !== "connected") throw new Error("Not connected to room");
|
|
607
|
-
if (this.pendingTrackSubscribed) return this.pendingTrackSubscribed.promise;
|
|
608
|
-
this.pendingTrackSubscribed = new Deferred();
|
|
609
|
-
this.pendingTrackSubscribed.setTimeout(15e3, "Connection timed out - agent not available", "connection_timeout", "connection");
|
|
610
|
-
return this.pendingTrackSubscribed.promise;
|
|
611
|
-
}
|
|
612
|
-
/**
|
|
613
|
-
* Set up LiveKit room event listeners
|
|
614
|
-
*/
|
|
615
|
-
setupRoomEvents() {
|
|
616
|
-
if (!this.room) return;
|
|
617
|
-
const config = getConfig();
|
|
618
|
-
this.room.on(RoomEvent.Connected, () => {
|
|
619
|
-
if (config.debug) console.log("[SpeechOS] Room connected");
|
|
620
|
-
state.setConnected(true);
|
|
621
|
-
});
|
|
622
|
-
this.room.on(RoomEvent.Disconnected, (reason) => {
|
|
623
|
-
if (config.debug) console.log("[SpeechOS] Room disconnected:", reason);
|
|
624
|
-
state.setConnected(false);
|
|
625
|
-
state.setMicEnabled(false);
|
|
626
|
-
});
|
|
627
|
-
this.room.on(RoomEvent.ParticipantConnected, (participant) => {
|
|
628
|
-
if (config.debug) console.log("[SpeechOS] Participant connected:", participant.identity);
|
|
629
|
-
});
|
|
630
|
-
this.room.on(RoomEvent.LocalTrackSubscribed, (publication) => {
|
|
631
|
-
if (config.debug) console.log("[SpeechOS] LocalTrackSubscribed event fired:", publication.trackSid);
|
|
632
|
-
if (this.pendingTrackSubscribed) {
|
|
633
|
-
this.pendingTrackSubscribed.resolve();
|
|
634
|
-
this.pendingTrackSubscribed = null;
|
|
635
|
-
}
|
|
636
|
-
});
|
|
637
|
-
this.room.on(RoomEvent.LocalTrackPublished, (publication) => {
|
|
638
|
-
if (config.debug) console.log("[SpeechOS] LocalTrackPublished:", publication.trackSid, publication.source);
|
|
639
|
-
});
|
|
640
|
-
this.room.on(RoomEvent.DataReceived, (data, participant) => {
|
|
641
|
-
this.handleDataMessage(data, participant);
|
|
642
|
-
});
|
|
643
|
-
}
|
|
644
|
-
/**
|
|
645
|
-
* Handle incoming data messages from the agent
|
|
646
|
-
*/
|
|
647
|
-
handleDataMessage(data, _participant) {
|
|
648
|
-
const config = getConfig();
|
|
649
|
-
try {
|
|
650
|
-
const message = JSON.parse(new TextDecoder().decode(data));
|
|
651
|
-
if (config.debug) console.log("[SpeechOS] Data received:", message);
|
|
652
|
-
if (message.type === MESSAGE_TYPE_TRANSCRIPT$1) {
|
|
653
|
-
const transcript = message.transcript || "";
|
|
654
|
-
if (config.debug) console.log("[SpeechOS] Transcript received:", transcript);
|
|
655
|
-
events.emit("transcription:complete", { text: transcript });
|
|
656
|
-
if (this.pendingTranscript) {
|
|
657
|
-
this.pendingTranscript.resolve(transcript);
|
|
658
|
-
this.pendingTranscript = null;
|
|
659
|
-
}
|
|
660
|
-
} else if (message.type === MESSAGE_TYPE_EDITED_TEXT$1) {
|
|
661
|
-
const editedText = message.text || "";
|
|
662
|
-
if (config.debug) console.log("[SpeechOS] Edited text received:", editedText);
|
|
663
|
-
events.emit("edit:complete", {
|
|
664
|
-
text: editedText,
|
|
665
|
-
originalText: this.editOriginalText || ""
|
|
666
|
-
});
|
|
667
|
-
if (this.pendingEditText) {
|
|
668
|
-
this.pendingEditText.resolve(editedText);
|
|
669
|
-
this.pendingEditText = null;
|
|
670
|
-
}
|
|
671
|
-
this.editOriginalText = null;
|
|
672
|
-
} else if (message.type === MESSAGE_TYPE_COMMAND_RESULT$1) {
|
|
673
|
-
const commandResult = message.command || null;
|
|
674
|
-
if (config.debug) console.log("[SpeechOS] Command result received:", commandResult);
|
|
675
|
-
events.emit("command:complete", { command: commandResult });
|
|
676
|
-
if (this.pendingCommand) {
|
|
677
|
-
this.pendingCommand.resolve(commandResult);
|
|
678
|
-
this.pendingCommand = null;
|
|
679
|
-
}
|
|
680
|
-
} else if (message.type === MESSAGE_TYPE_ERROR$1) {
|
|
681
|
-
const serverError = message;
|
|
682
|
-
const errorCode = serverError.code || "server_error";
|
|
683
|
-
const errorMessage = serverError.message || "A server error occurred";
|
|
684
|
-
console.error(`[SpeechOS] Error: ${errorMessage} (${errorCode})`);
|
|
685
|
-
if (config.debug && serverError.details) console.error("[SpeechOS] Error details:", serverError.details);
|
|
686
|
-
events.emit("error", {
|
|
687
|
-
code: errorCode,
|
|
688
|
-
message: errorMessage,
|
|
689
|
-
source: "server"
|
|
690
|
-
});
|
|
691
|
-
const error = new Error(errorMessage);
|
|
692
|
-
if (this.pendingTranscript) {
|
|
693
|
-
this.pendingTranscript.reject(error);
|
|
694
|
-
this.pendingTranscript = null;
|
|
695
|
-
}
|
|
696
|
-
if (this.pendingEditText) {
|
|
697
|
-
this.pendingEditText.reject(error);
|
|
698
|
-
this.pendingEditText = null;
|
|
699
|
-
}
|
|
700
|
-
if (this.pendingCommand) {
|
|
701
|
-
this.pendingCommand.reject(error);
|
|
702
|
-
this.pendingCommand = null;
|
|
703
|
-
}
|
|
704
|
-
}
|
|
705
|
-
} catch (error) {
|
|
706
|
-
console.error("[SpeechOS] Failed to parse data message:", error);
|
|
707
|
-
}
|
|
708
|
-
}
|
|
709
|
-
/**
|
|
710
|
-
* Publish microphone audio track
|
|
711
|
-
* Uses the device ID from session settings if set
|
|
712
|
-
*/
|
|
713
|
-
async enableMicrophone() {
|
|
714
|
-
if (!this.room || this.room.state !== "connected") throw new Error("Not connected to room");
|
|
715
|
-
const config = getConfig();
|
|
716
|
-
if (!this.micTrack) {
|
|
717
|
-
if (config.debug) console.log("[SpeechOS] Creating microphone track...");
|
|
718
|
-
const deviceId = this.sessionSettings.audioDeviceId;
|
|
719
|
-
const trackOptions = {
|
|
720
|
-
echoCancellation: true,
|
|
721
|
-
noiseSuppression: true
|
|
722
|
-
};
|
|
723
|
-
if (deviceId) {
|
|
724
|
-
trackOptions.deviceId = { exact: deviceId };
|
|
725
|
-
if (config.debug) console.log("[SpeechOS] Using audio device:", deviceId);
|
|
726
|
-
}
|
|
727
|
-
try {
|
|
728
|
-
this.micTrack = await createLocalAudioTrack(trackOptions);
|
|
729
|
-
} catch (error) {
|
|
730
|
-
if (deviceId && error instanceof Error) {
|
|
731
|
-
console.warn("[SpeechOS] Selected audio device unavailable, falling back to default:", error.message);
|
|
732
|
-
this.micTrack = await createLocalAudioTrack({
|
|
733
|
-
echoCancellation: true,
|
|
734
|
-
noiseSuppression: true
|
|
735
|
-
});
|
|
736
|
-
} else throw error;
|
|
737
|
-
}
|
|
738
|
-
this.logMicrophoneInfo();
|
|
739
|
-
}
|
|
740
|
-
const existingPub = this.room.localParticipant.getTrackPublication(Track.Source.Microphone);
|
|
741
|
-
if (!existingPub) {
|
|
742
|
-
await this.room.localParticipant.publishTrack(this.micTrack, { source: Track.Source.Microphone });
|
|
743
|
-
state.setMicEnabled(true);
|
|
744
|
-
if (config.debug) console.log("[SpeechOS] Microphone track published");
|
|
745
|
-
}
|
|
746
|
-
}
|
|
747
|
-
/**
|
|
748
|
-
* Log information about the current microphone track
|
|
749
|
-
*/
|
|
750
|
-
logMicrophoneInfo() {
|
|
751
|
-
if (!this.micTrack) return;
|
|
752
|
-
const config = getConfig();
|
|
753
|
-
const mediaTrack = this.micTrack.mediaStreamTrack;
|
|
754
|
-
const settings = mediaTrack.getSettings();
|
|
755
|
-
console.log("[SpeechOS] Microphone active:", {
|
|
756
|
-
deviceId: settings.deviceId || "unknown",
|
|
757
|
-
label: mediaTrack.label || "Unknown device",
|
|
758
|
-
sampleRate: settings.sampleRate,
|
|
759
|
-
channelCount: settings.channelCount,
|
|
760
|
-
echoCancellation: settings.echoCancellation,
|
|
761
|
-
noiseSuppression: settings.noiseSuppression
|
|
762
|
-
});
|
|
763
|
-
if (config.debug) console.log("[SpeechOS] Full audio track settings:", settings);
|
|
764
|
-
}
|
|
765
|
-
/**
|
|
766
|
-
* Disable microphone audio track
|
|
767
|
-
*/
|
|
768
|
-
async disableMicrophone() {
|
|
769
|
-
const config = getConfig();
|
|
770
|
-
if (this.micTrack) {
|
|
771
|
-
if (config.debug) console.log("[SpeechOS] Disabling microphone track...");
|
|
772
|
-
if (this.room?.state === "connected") try {
|
|
773
|
-
await this.room.localParticipant.unpublishTrack(this.micTrack);
|
|
774
|
-
if (config.debug) console.log("[SpeechOS] Microphone track unpublished");
|
|
775
|
-
} catch (error) {
|
|
776
|
-
console.warn("[SpeechOS] Error unpublishing track:", error);
|
|
777
|
-
}
|
|
778
|
-
this.micTrack.stop();
|
|
779
|
-
this.micTrack.detach();
|
|
780
|
-
this.micTrack = null;
|
|
781
|
-
state.setMicEnabled(false);
|
|
782
|
-
if (config.debug) console.log("[SpeechOS] Microphone track stopped and detached");
|
|
783
|
-
}
|
|
784
|
-
}
|
|
785
|
-
/**
|
|
786
|
-
* Send a data message to the room
|
|
787
|
-
*/
|
|
788
|
-
async sendDataMessage(message) {
|
|
789
|
-
if (!this.room || this.room.state !== "connected") throw new Error("Not connected to room");
|
|
790
|
-
const data = new TextEncoder().encode(JSON.stringify(message));
|
|
791
|
-
await this.room.localParticipant.publishData(data, {
|
|
792
|
-
reliable: true,
|
|
793
|
-
topic: TOPIC_SPEECHOS
|
|
794
|
-
});
|
|
795
|
-
}
|
|
796
|
-
/**
|
|
797
|
-
* Start a voice session with pre-connect audio buffering
|
|
798
|
-
* Fetches a fresh token, then enables mic with preConnectBuffer to capture audio while connecting.
|
|
799
|
-
* Agent subscription happens in the background - we don't block on it.
|
|
800
|
-
*
|
|
801
|
-
* @param options - Session options including action type and parameters
|
|
802
|
-
*/
|
|
803
|
-
async startVoiceSession(options) {
|
|
804
|
-
const config = getConfig();
|
|
805
|
-
if (config.debug) console.log("[SpeechOS] Starting voice session...");
|
|
806
|
-
this.sessionSettings = options?.settings || {};
|
|
807
|
-
await this.fetchToken();
|
|
808
|
-
if (!this.tokenData) throw new Error("No token available for LiveKit connection");
|
|
809
|
-
this.pendingTrackSubscribed = new Deferred();
|
|
810
|
-
this.pendingTrackSubscribed.setTimeout(15e3, "Connection timed out - agent not available", "connection_timeout", "connection");
|
|
811
|
-
this.room = new Room({
|
|
812
|
-
adaptiveStream: true,
|
|
813
|
-
dynacast: true
|
|
814
|
-
});
|
|
815
|
-
this.setupRoomEvents();
|
|
816
|
-
if (config.debug) console.log("[SpeechOS] Connecting to LiveKit room:", this.tokenData.room, "at", this.tokenData.ws_url);
|
|
817
|
-
await this.room.connect(this.tokenData.ws_url, this.tokenData.token);
|
|
818
|
-
if (config.debug) console.log("[SpeechOS] Connected, enabling microphone with preConnectBuffer...");
|
|
819
|
-
await this.enableMicrophoneWithPreConnectBuffer();
|
|
820
|
-
if (options?.onMicReady) options.onMicReady();
|
|
821
|
-
state.setConnected(true);
|
|
822
|
-
if (config.debug) console.log("[SpeechOS] Voice session ready - microphone active");
|
|
823
|
-
this.waitForAgentSubscription();
|
|
824
|
-
}
|
|
825
|
-
/**
|
|
826
|
-
* Wait for the agent to subscribe to our audio track in the background
|
|
827
|
-
* Handles timeout errors without blocking the main flow
|
|
828
|
-
*/
|
|
829
|
-
waitForAgentSubscription() {
|
|
830
|
-
const config = getConfig();
|
|
831
|
-
if (!this.pendingTrackSubscribed) return;
|
|
832
|
-
this.pendingTrackSubscribed.promise.then(() => {
|
|
833
|
-
if (config.debug) console.log("[SpeechOS] Agent subscribed to audio track - full duplex established");
|
|
834
|
-
this.pendingTrackSubscribed = null;
|
|
835
|
-
}).catch((error) => {
|
|
836
|
-
console.warn("[SpeechOS] Agent subscription timeout:", error.message);
|
|
837
|
-
this.pendingTrackSubscribed = null;
|
|
838
|
-
});
|
|
839
|
-
}
|
|
840
|
-
/**
|
|
841
|
-
* Enable microphone with pre-connect buffering
|
|
842
|
-
* This starts capturing audio locally before the room is connected,
|
|
843
|
-
* buffering it until the connection is established.
|
|
844
|
-
*/
|
|
845
|
-
async enableMicrophoneWithPreConnectBuffer() {
|
|
846
|
-
if (!this.room) throw new Error("Room not initialized");
|
|
847
|
-
const config = getConfig();
|
|
848
|
-
const deviceId = this.sessionSettings.audioDeviceId;
|
|
849
|
-
const constraints = {
|
|
850
|
-
echoCancellation: true,
|
|
851
|
-
noiseSuppression: true
|
|
852
|
-
};
|
|
853
|
-
if (deviceId) {
|
|
854
|
-
constraints.deviceId = { exact: deviceId };
|
|
855
|
-
if (config.debug) console.log("[SpeechOS] Using audio device:", deviceId);
|
|
856
|
-
}
|
|
857
|
-
try {
|
|
858
|
-
await this.room.localParticipant.setMicrophoneEnabled(true, constraints, { preConnectBuffer: true });
|
|
859
|
-
state.setMicEnabled(true);
|
|
860
|
-
const micPub = this.room.localParticipant.getTrackPublication(Track.Source.Microphone);
|
|
861
|
-
if (micPub?.track) {
|
|
862
|
-
this.micTrack = micPub.track;
|
|
863
|
-
this.logMicrophoneInfo();
|
|
864
|
-
}
|
|
865
|
-
if (config.debug) console.log("[SpeechOS] Microphone enabled with pre-connect buffer - audio is being captured");
|
|
866
|
-
} catch (error) {
|
|
867
|
-
if (deviceId && error instanceof Error) {
|
|
868
|
-
console.warn("[SpeechOS] Selected audio device unavailable, falling back to default:", error.message);
|
|
869
|
-
await this.room.localParticipant.setMicrophoneEnabled(true, {
|
|
870
|
-
echoCancellation: true,
|
|
871
|
-
noiseSuppression: true
|
|
872
|
-
}, { preConnectBuffer: true });
|
|
873
|
-
state.setMicEnabled(true);
|
|
874
|
-
} else throw error;
|
|
875
|
-
}
|
|
876
|
-
}
|
|
877
|
-
/**
|
|
878
|
-
* Stop the voice session and request the transcript
|
|
879
|
-
* Returns a promise that resolves with the transcript text
|
|
880
|
-
* @throws Error if timeout occurs waiting for transcript
|
|
881
|
-
*/
|
|
882
|
-
async stopVoiceSession() {
|
|
883
|
-
const config = getConfig();
|
|
884
|
-
const settings = this.sessionSettings;
|
|
885
|
-
const inputLanguage = settings.inputLanguageCode ?? "en-US";
|
|
886
|
-
const outputLanguage = settings.outputLanguageCode ?? "en-US";
|
|
887
|
-
console.log("[SpeechOS] Dictate command:", {
|
|
888
|
-
inputLanguage,
|
|
889
|
-
outputLanguage
|
|
890
|
-
});
|
|
891
|
-
if (config.debug) console.log("[SpeechOS] Stopping voice session, requesting transcript...");
|
|
892
|
-
await this.disableMicrophone();
|
|
893
|
-
if (config.debug) console.log("[SpeechOS] Requesting transcript from agent...");
|
|
894
|
-
this.pendingTranscript = new Deferred();
|
|
895
|
-
this.pendingTranscript.setTimeout(1e4, "Transcription timed out. Please try again.", "transcription_timeout", "timeout");
|
|
896
|
-
await this.sendDataMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT$1 });
|
|
897
|
-
const result = await this.pendingTranscript.promise;
|
|
898
|
-
this.pendingTranscript = null;
|
|
899
|
-
return result;
|
|
900
|
-
}
|
|
901
|
-
/**
|
|
902
|
-
* Alias for stopVoiceSession - granular API naming
|
|
903
|
-
*/
|
|
904
|
-
async stopAndGetTranscript() {
|
|
905
|
-
return this.stopVoiceSession();
|
|
906
|
-
}
|
|
907
|
-
/**
|
|
908
|
-
* Request text editing using the transcript as instructions
|
|
909
|
-
* Sends the original text to the backend, which applies the spoken instructions
|
|
910
|
-
* Returns a promise that resolves with the edited text
|
|
911
|
-
* @throws Error if timeout occurs waiting for edited text
|
|
912
|
-
*/
|
|
913
|
-
async requestEditText(originalText) {
|
|
914
|
-
const config = getConfig();
|
|
915
|
-
const settings = this.sessionSettings;
|
|
916
|
-
const inputLanguage = settings.inputLanguageCode ?? "en-US";
|
|
917
|
-
const outputLanguage = settings.outputLanguageCode ?? "en-US";
|
|
918
|
-
console.log("[SpeechOS] Edit command:", {
|
|
919
|
-
inputLanguage,
|
|
920
|
-
outputLanguage,
|
|
921
|
-
originalTextLength: originalText.length
|
|
922
|
-
});
|
|
923
|
-
if (config.debug) console.log("[SpeechOS] Requesting text edit...");
|
|
924
|
-
this.editOriginalText = originalText;
|
|
925
|
-
await this.disableMicrophone();
|
|
926
|
-
if (config.debug) console.log("[SpeechOS] Sending edit_text request to agent...");
|
|
927
|
-
this.pendingEditText = new Deferred();
|
|
928
|
-
this.pendingEditText.setTimeout(15e3, "Edit request timed out. Please try again.", "edit_timeout", "timeout");
|
|
929
|
-
await this.sendDataMessage({
|
|
930
|
-
type: MESSAGE_TYPE_EDIT_TEXT$1,
|
|
931
|
-
text: originalText
|
|
932
|
-
});
|
|
933
|
-
const result = await this.pendingEditText.promise;
|
|
934
|
-
this.pendingEditText = null;
|
|
935
|
-
return result;
|
|
936
|
-
}
|
|
937
|
-
/**
|
|
938
|
-
* Alias for requestEditText - granular API naming
|
|
939
|
-
*/
|
|
940
|
-
async stopAndEdit(originalText) {
|
|
941
|
-
return this.requestEditText(originalText);
|
|
942
|
-
}
|
|
943
|
-
/**
|
|
944
|
-
* Request command matching using the transcript as input
|
|
945
|
-
* Sends command definitions to the backend, which matches the user's speech against them
|
|
946
|
-
* Returns a promise that resolves with the matched command or null if no match
|
|
947
|
-
* @throws Error if timeout occurs waiting for command result
|
|
948
|
-
*/
|
|
949
|
-
async requestCommand(commands) {
|
|
950
|
-
const config = getConfig();
|
|
951
|
-
const settings = this.sessionSettings;
|
|
952
|
-
const inputLanguage = settings.inputLanguageCode ?? "en-US";
|
|
953
|
-
console.log("[SpeechOS] Command request:", {
|
|
954
|
-
inputLanguage,
|
|
955
|
-
commandCount: commands.length
|
|
956
|
-
});
|
|
957
|
-
if (config.debug) console.log("[SpeechOS] Requesting command match...");
|
|
958
|
-
await this.disableMicrophone();
|
|
959
|
-
if (config.debug) console.log("[SpeechOS] Sending execute_command request to agent...");
|
|
960
|
-
this.pendingCommand = new Deferred();
|
|
961
|
-
this.pendingCommand.setTimeout(15e3, "Command request timed out. Please try again.", "command_timeout", "timeout");
|
|
962
|
-
await this.sendDataMessage({
|
|
963
|
-
type: MESSAGE_TYPE_EXECUTE_COMMAND$1,
|
|
964
|
-
commands
|
|
965
|
-
});
|
|
966
|
-
const result = await this.pendingCommand.promise;
|
|
967
|
-
this.pendingCommand = null;
|
|
968
|
-
return result;
|
|
969
|
-
}
|
|
970
|
-
/**
|
|
971
|
-
* Alias for requestCommand - granular API naming
|
|
972
|
-
*/
|
|
973
|
-
async stopAndCommand(commands) {
|
|
974
|
-
return this.requestCommand(commands);
|
|
975
|
-
}
|
|
976
|
-
/**
|
|
977
|
-
* Disconnect from the current room
|
|
978
|
-
* Clears the token so a fresh one is fetched for the next session
|
|
979
|
-
*/
|
|
980
|
-
async disconnect() {
|
|
981
|
-
const config = getConfig();
|
|
982
|
-
if (config.debug) console.log("[SpeechOS] Disconnecting from room...");
|
|
983
|
-
await this.disableMicrophone();
|
|
984
|
-
if (this.room) {
|
|
985
|
-
this.room.removeAllListeners();
|
|
986
|
-
await this.room.disconnect();
|
|
987
|
-
this.room = null;
|
|
988
|
-
state.setConnected(false);
|
|
989
|
-
if (config.debug) console.log("[SpeechOS] Room disconnected and cleaned up");
|
|
990
|
-
}
|
|
991
|
-
if (this.pendingTranscript) {
|
|
992
|
-
this.pendingTranscript.reject(new Error("Disconnected"));
|
|
993
|
-
this.pendingTranscript = null;
|
|
994
|
-
}
|
|
995
|
-
if (this.pendingEditText) {
|
|
996
|
-
this.pendingEditText.reject(new Error("Disconnected"));
|
|
997
|
-
this.pendingEditText = null;
|
|
998
|
-
}
|
|
999
|
-
if (this.pendingCommand) {
|
|
1000
|
-
this.pendingCommand.reject(new Error("Disconnected"));
|
|
1001
|
-
this.pendingCommand = null;
|
|
1002
|
-
}
|
|
1003
|
-
if (this.pendingTrackSubscribed) {
|
|
1004
|
-
this.pendingTrackSubscribed.reject(new Error("Disconnected"));
|
|
1005
|
-
this.pendingTrackSubscribed = null;
|
|
1006
|
-
}
|
|
1007
|
-
this.tokenData = null;
|
|
1008
|
-
this.editOriginalText = null;
|
|
1009
|
-
this.sessionSettings = {};
|
|
1010
|
-
if (config.debug) console.log("[SpeechOS] Session state cleared");
|
|
1011
|
-
}
|
|
1012
|
-
/**
|
|
1013
|
-
* Invalidate the cached token
|
|
1014
|
-
* Call this when settings change that would affect the token (language, vocabulary)
|
|
1015
|
-
*/
|
|
1016
|
-
invalidateTokenCache() {
|
|
1017
|
-
const config = getConfig();
|
|
1018
|
-
if (config.debug) console.log("[SpeechOS] Token cache invalidated");
|
|
1019
|
-
this.cachedTokenData = null;
|
|
1020
|
-
this.tokenCacheTimestamp = null;
|
|
1021
|
-
}
|
|
1022
|
-
/**
|
|
1023
|
-
* Start auto-refreshing the token while the widget is expanded.
|
|
1024
|
-
* Call this after a voice session completes to immediately fetch a fresh token
|
|
1025
|
-
* (since each command requires its own token) and keep it fresh for subsequent commands.
|
|
1026
|
-
*/
|
|
1027
|
-
startAutoRefresh() {
|
|
1028
|
-
const config = getConfig();
|
|
1029
|
-
this.autoRefreshEnabled = true;
|
|
1030
|
-
if (config.debug) console.log("[SpeechOS] Token auto-refresh enabled");
|
|
1031
|
-
this.invalidateTokenCache();
|
|
1032
|
-
this.prefetchToken().then(() => {
|
|
1033
|
-
this.scheduleTokenRefresh();
|
|
1034
|
-
}).catch((error) => {
|
|
1035
|
-
if (config.debug) console.warn("[SpeechOS] Failed to prefetch token after command:", error);
|
|
1036
|
-
if (this.autoRefreshEnabled) this.tokenRefreshTimer = setTimeout(() => {
|
|
1037
|
-
this.performAutoRefresh();
|
|
1038
|
-
}, 5 * 1e3);
|
|
1039
|
-
});
|
|
1040
|
-
}
|
|
1041
|
-
/**
|
|
1042
|
-
* Stop auto-refreshing the token.
|
|
1043
|
-
* Call this when the widget collapses or user navigates away.
|
|
1044
|
-
*/
|
|
1045
|
-
stopAutoRefresh() {
|
|
1046
|
-
const config = getConfig();
|
|
1047
|
-
this.autoRefreshEnabled = false;
|
|
1048
|
-
if (this.tokenRefreshTimer) {
|
|
1049
|
-
clearTimeout(this.tokenRefreshTimer);
|
|
1050
|
-
this.tokenRefreshTimer = null;
|
|
1051
|
-
}
|
|
1052
|
-
if (config.debug) console.log("[SpeechOS] Token auto-refresh disabled");
|
|
1053
|
-
}
|
|
1054
|
-
/**
|
|
1055
|
-
* Schedule a token refresh before the current cache expires.
|
|
1056
|
-
* Handles computer sleep by checking elapsed time on each refresh attempt.
|
|
1057
|
-
*/
|
|
1058
|
-
scheduleTokenRefresh() {
|
|
1059
|
-
if (!this.autoRefreshEnabled) return;
|
|
1060
|
-
if (this.tokenRefreshTimer) {
|
|
1061
|
-
clearTimeout(this.tokenRefreshTimer);
|
|
1062
|
-
this.tokenRefreshTimer = null;
|
|
1063
|
-
}
|
|
1064
|
-
const config = getConfig();
|
|
1065
|
-
const refreshBuffer = 30 * 1e3;
|
|
1066
|
-
let timeUntilRefresh;
|
|
1067
|
-
if (this.tokenCacheTimestamp) {
|
|
1068
|
-
const age = Date.now() - this.tokenCacheTimestamp;
|
|
1069
|
-
const timeRemaining = TOKEN_CACHE_TTL_MS - age;
|
|
1070
|
-
timeUntilRefresh = Math.max(0, timeRemaining - refreshBuffer);
|
|
1071
|
-
} else timeUntilRefresh = 0;
|
|
1072
|
-
if (config.debug) console.log(`[SpeechOS] Scheduling token refresh in ${Math.round(timeUntilRefresh / 1e3)}s`);
|
|
1073
|
-
this.tokenRefreshTimer = setTimeout(() => {
|
|
1074
|
-
this.performAutoRefresh();
|
|
1075
|
-
}, timeUntilRefresh);
|
|
1076
|
-
}
|
|
1077
|
-
/**
|
|
1078
|
-
* Perform the auto-refresh, handling computer sleep scenarios.
|
|
1079
|
-
*/
|
|
1080
|
-
async performAutoRefresh() {
|
|
1081
|
-
if (!this.autoRefreshEnabled) return;
|
|
1082
|
-
const config = getConfig();
|
|
1083
|
-
if (this.isCachedTokenValid()) {
|
|
1084
|
-
if (config.debug) console.log("[SpeechOS] Token still valid on refresh check, rescheduling");
|
|
1085
|
-
this.scheduleTokenRefresh();
|
|
1086
|
-
return;
|
|
1087
|
-
}
|
|
1088
|
-
if (config.debug) console.log("[SpeechOS] Auto-refreshing token...");
|
|
1089
|
-
try {
|
|
1090
|
-
const data = await this.fetchTokenFromServer();
|
|
1091
|
-
this.cachedTokenData = data;
|
|
1092
|
-
this.tokenCacheTimestamp = Date.now();
|
|
1093
|
-
if (config.debug) console.log("[SpeechOS] Token auto-refreshed successfully");
|
|
1094
|
-
this.scheduleTokenRefresh();
|
|
1095
|
-
} catch (error) {
|
|
1096
|
-
console.warn("[SpeechOS] Token auto-refresh failed:", error);
|
|
1097
|
-
if (this.autoRefreshEnabled) this.tokenRefreshTimer = setTimeout(() => {
|
|
1098
|
-
this.performAutoRefresh();
|
|
1099
|
-
}, 30 * 1e3);
|
|
1100
|
-
}
|
|
1101
|
-
}
|
|
1102
|
-
/**
|
|
1103
|
-
* Get the current room instance
|
|
1104
|
-
*/
|
|
1105
|
-
getRoom() {
|
|
1106
|
-
return this.room;
|
|
1107
|
-
}
|
|
1108
|
-
/**
|
|
1109
|
-
* Get the current token data
|
|
1110
|
-
*/
|
|
1111
|
-
getTokenData() {
|
|
1112
|
-
return this.tokenData;
|
|
1113
|
-
}
|
|
1114
|
-
/**
|
|
1115
|
-
* Check if connected to a room
|
|
1116
|
-
*/
|
|
1117
|
-
isConnected() {
|
|
1118
|
-
return this.room?.state === "connected";
|
|
1119
|
-
}
|
|
1120
|
-
/**
|
|
1121
|
-
* Check if microphone is enabled
|
|
1122
|
-
*/
|
|
1123
|
-
isMicrophoneEnabled() {
|
|
1124
|
-
return this.micTrack !== null;
|
|
1125
|
-
}
|
|
1126
|
-
};
|
|
1127
|
-
const livekit = new LiveKitManager();
|
|
1128
|
-
events.on("settings:changed", () => {
|
|
1129
|
-
livekit.invalidateTokenCache();
|
|
1130
|
-
});
|
|
1131
|
-
|
|
1132
412
|
//#endregion
|
|
1133
413
|
//#region src/audio-capture.ts
|
|
1134
414
|
/**
|
|
@@ -1426,7 +706,7 @@ const RESPONSE_TIMEOUT_MS = 15e3;
|
|
|
1426
706
|
/**
|
|
1427
707
|
* A deferred promise with timeout support.
|
|
1428
708
|
*/
|
|
1429
|
-
var Deferred
|
|
709
|
+
var Deferred = class {
|
|
1430
710
|
promise;
|
|
1431
711
|
_resolve;
|
|
1432
712
|
_reject;
|
|
@@ -1537,7 +817,7 @@ var WebSocketManager = class {
|
|
|
1537
817
|
state.setMicEnabled(true);
|
|
1538
818
|
const wsUrl = this.getWebSocketUrl();
|
|
1539
819
|
if (config.debug) console.log("[SpeechOS] Connecting to WebSocket:", wsUrl);
|
|
1540
|
-
this.pendingAuth = new Deferred
|
|
820
|
+
this.pendingAuth = new Deferred();
|
|
1541
821
|
this.pendingAuth.setTimeout(RESPONSE_TIMEOUT_MS, "Connection timed out", "connection_timeout", "connection");
|
|
1542
822
|
const factory = config.webSocketFactory ?? ((url) => new WebSocket(url));
|
|
1543
823
|
this.ws = factory(wsUrl);
|
|
@@ -1685,11 +965,11 @@ var WebSocketManager = class {
|
|
|
1685
965
|
this.editOriginalText = null;
|
|
1686
966
|
}
|
|
1687
967
|
handleCommandResult(message) {
|
|
1688
|
-
const
|
|
968
|
+
const commands = message.commands || [];
|
|
1689
969
|
this.lastInputText = message.transcript;
|
|
1690
|
-
events.emit("command:complete", {
|
|
970
|
+
events.emit("command:complete", { commands });
|
|
1691
971
|
if (this.pendingCommand) {
|
|
1692
|
-
this.pendingCommand.resolve(
|
|
972
|
+
this.pendingCommand.resolve(commands);
|
|
1693
973
|
this.pendingCommand = null;
|
|
1694
974
|
}
|
|
1695
975
|
}
|
|
@@ -1727,7 +1007,7 @@ var WebSocketManager = class {
|
|
|
1727
1007
|
const config = getConfig();
|
|
1728
1008
|
if (config.debug) console.log("[SpeechOS] Stopping voice session, requesting transcript...");
|
|
1729
1009
|
await this.stopAudioCapture();
|
|
1730
|
-
this.pendingTranscript = new Deferred
|
|
1010
|
+
this.pendingTranscript = new Deferred();
|
|
1731
1011
|
this.pendingTranscript.setTimeout(RESPONSE_TIMEOUT_MS, "Transcription timed out. Please try again.", "transcription_timeout", "timeout");
|
|
1732
1012
|
this.sendMessage({ type: MESSAGE_TYPE_REQUEST_TRANSCRIPT });
|
|
1733
1013
|
const result = await this.pendingTranscript.promise;
|
|
@@ -1742,7 +1022,7 @@ var WebSocketManager = class {
|
|
|
1742
1022
|
const config = getConfig();
|
|
1743
1023
|
if (config.debug) console.log("[SpeechOS] Requesting text edit...");
|
|
1744
1024
|
await this.stopAudioCapture();
|
|
1745
|
-
this.pendingEditText = new Deferred
|
|
1025
|
+
this.pendingEditText = new Deferred();
|
|
1746
1026
|
this.pendingEditText.setTimeout(RESPONSE_TIMEOUT_MS, "Edit request timed out. Please try again.", "edit_timeout", "timeout");
|
|
1747
1027
|
this.sendMessage({ type: MESSAGE_TYPE_EDIT_TEXT });
|
|
1748
1028
|
const result = await this.pendingEditText.promise;
|
|
@@ -1752,12 +1032,13 @@ var WebSocketManager = class {
|
|
|
1752
1032
|
/**
|
|
1753
1033
|
* Request command matching using the transcript as input.
|
|
1754
1034
|
* Note: The command definitions were already sent in the auth message via startVoiceSession.
|
|
1035
|
+
* Returns an array of matched commands (empty array if no matches).
|
|
1755
1036
|
*/
|
|
1756
1037
|
async requestCommand(_commands) {
|
|
1757
1038
|
const config = getConfig();
|
|
1758
1039
|
if (config.debug) console.log("[SpeechOS] Requesting command match...");
|
|
1759
1040
|
await this.stopAudioCapture();
|
|
1760
|
-
this.pendingCommand = new Deferred
|
|
1041
|
+
this.pendingCommand = new Deferred();
|
|
1761
1042
|
this.pendingCommand.setTimeout(RESPONSE_TIMEOUT_MS, "Command request timed out. Please try again.", "command_timeout", "timeout");
|
|
1762
1043
|
this.sendMessage({ type: MESSAGE_TYPE_EXECUTE_COMMAND });
|
|
1763
1044
|
const result = await this.pendingCommand.promise;
|
|
@@ -1795,7 +1076,7 @@ var WebSocketManager = class {
|
|
|
1795
1076
|
* Wait for the WebSocket send buffer to drain.
|
|
1796
1077
|
*
|
|
1797
1078
|
* This ensures all audio data has been transmitted before we request
|
|
1798
|
-
* the transcript.
|
|
1079
|
+
* the transcript.
|
|
1799
1080
|
*/
|
|
1800
1081
|
async waitForBufferDrain() {
|
|
1801
1082
|
if (!this.ws || this.ws.readyState !== WS_OPEN) return;
|
|
@@ -1871,7 +1152,7 @@ const websocket = new WebSocketManager();
|
|
|
1871
1152
|
//#endregion
|
|
1872
1153
|
//#region src/speechos.ts
|
|
1873
1154
|
/**
|
|
1874
|
-
* Get the active voice backend
|
|
1155
|
+
* Get the active voice backend
|
|
1875
1156
|
*/
|
|
1876
1157
|
function getBackend$1() {
|
|
1877
1158
|
return websocket;
|
|
@@ -1879,9 +1160,7 @@ function getBackend$1() {
|
|
|
1879
1160
|
/**
|
|
1880
1161
|
* SpeechOS Core SDK
|
|
1881
1162
|
*
|
|
1882
|
-
* Provides
|
|
1883
|
-
* 1. Low-level API: Granular control over LiveKit connection lifecycle
|
|
1884
|
-
* 2. High-level API: One-shot methods for common voice tasks
|
|
1163
|
+
* Provides a high-level API for common voice tasks.
|
|
1885
1164
|
*/
|
|
1886
1165
|
var SpeechOSCore = class {
|
|
1887
1166
|
initialized = false;
|
|
@@ -1905,67 +1184,7 @@ var SpeechOSCore = class {
|
|
|
1905
1184
|
return this.initialized;
|
|
1906
1185
|
}
|
|
1907
1186
|
/**
|
|
1908
|
-
*
|
|
1909
|
-
* Call this before other low-level methods
|
|
1910
|
-
*/
|
|
1911
|
-
async connect() {
|
|
1912
|
-
this.ensureInitialized();
|
|
1913
|
-
await livekit.connect();
|
|
1914
|
-
}
|
|
1915
|
-
/**
|
|
1916
|
-
* Wait until the agent is ready to receive audio
|
|
1917
|
-
* Resolves when the agent subscribes to our audio track
|
|
1918
|
-
*/
|
|
1919
|
-
async waitUntilReady() {
|
|
1920
|
-
return livekit.waitUntilReady();
|
|
1921
|
-
}
|
|
1922
|
-
/**
|
|
1923
|
-
* Enable microphone (user is now being recorded)
|
|
1924
|
-
*/
|
|
1925
|
-
async enableMicrophone() {
|
|
1926
|
-
await livekit.enableMicrophone();
|
|
1927
|
-
state.setRecordingState("recording");
|
|
1928
|
-
}
|
|
1929
|
-
/**
|
|
1930
|
-
* Stop recording and get the transcript
|
|
1931
|
-
* @returns The transcribed text
|
|
1932
|
-
*/
|
|
1933
|
-
async stopAndGetTranscript() {
|
|
1934
|
-
state.setRecordingState("processing");
|
|
1935
|
-
try {
|
|
1936
|
-
const transcript = await livekit.stopAndGetTranscript();
|
|
1937
|
-
state.completeRecording();
|
|
1938
|
-
return transcript;
|
|
1939
|
-
} catch (error) {
|
|
1940
|
-
state.setError(error instanceof Error ? error.message : "Transcription failed");
|
|
1941
|
-
throw error;
|
|
1942
|
-
}
|
|
1943
|
-
}
|
|
1944
|
-
/**
|
|
1945
|
-
* Stop recording and get edited text
|
|
1946
|
-
* @param originalText - The original text to edit based on voice instructions
|
|
1947
|
-
* @returns The edited text
|
|
1948
|
-
*/
|
|
1949
|
-
async stopAndEdit(originalText) {
|
|
1950
|
-
state.setRecordingState("processing");
|
|
1951
|
-
try {
|
|
1952
|
-
const editedText = await livekit.stopAndEdit(originalText);
|
|
1953
|
-
state.completeRecording();
|
|
1954
|
-
return editedText;
|
|
1955
|
-
} catch (error) {
|
|
1956
|
-
state.setError(error instanceof Error ? error.message : "Edit request failed");
|
|
1957
|
-
throw error;
|
|
1958
|
-
}
|
|
1959
|
-
}
|
|
1960
|
-
/**
|
|
1961
|
-
* Disconnect from LiveKit
|
|
1962
|
-
*/
|
|
1963
|
-
async disconnect() {
|
|
1964
|
-
await livekit.disconnect();
|
|
1965
|
-
state.completeRecording();
|
|
1966
|
-
}
|
|
1967
|
-
/**
|
|
1968
|
-
* One-shot dictation: connect, wait for agent, record, and get transcript
|
|
1187
|
+
* One-shot dictation: connect, record, and get transcript
|
|
1969
1188
|
* Automatically handles the full voice session lifecycle
|
|
1970
1189
|
*
|
|
1971
1190
|
* @returns The transcribed text
|
|
@@ -2024,7 +1243,7 @@ var SpeechOSCore = class {
|
|
|
2024
1243
|
}
|
|
2025
1244
|
}
|
|
2026
1245
|
/**
|
|
2027
|
-
* One-shot edit: connect,
|
|
1246
|
+
* One-shot edit: connect, record voice instructions, apply to text
|
|
2028
1247
|
* Automatically handles the full voice session lifecycle
|
|
2029
1248
|
*
|
|
2030
1249
|
* @param originalText - The text to edit
|
|
@@ -2089,11 +1308,11 @@ var SpeechOSCore = class {
|
|
|
2089
1308
|
}
|
|
2090
1309
|
}
|
|
2091
1310
|
/**
|
|
2092
|
-
* One-shot command: connect,
|
|
1311
|
+
* One-shot command: connect, record voice, match against commands
|
|
2093
1312
|
* Automatically handles the full voice session lifecycle
|
|
2094
1313
|
*
|
|
2095
1314
|
* @param commands - Array of command definitions to match against
|
|
2096
|
-
* @returns
|
|
1315
|
+
* @returns Array of matched commands (empty array if no matches)
|
|
2097
1316
|
*/
|
|
2098
1317
|
async command(commands) {
|
|
2099
1318
|
this.ensureInitialized();
|
|
@@ -2123,22 +1342,24 @@ var SpeechOSCore = class {
|
|
|
2123
1342
|
_commandResolve;
|
|
2124
1343
|
_commandReject;
|
|
2125
1344
|
/**
|
|
2126
|
-
* Stop command recording and get the matched
|
|
1345
|
+
* Stop command recording and get the matched commands
|
|
2127
1346
|
* Call this after command() when user stops speaking
|
|
1347
|
+
*
|
|
1348
|
+
* @returns Array of matched commands (empty array if no matches)
|
|
2128
1349
|
*/
|
|
2129
1350
|
async stopCommand() {
|
|
2130
1351
|
state.setRecordingState("processing");
|
|
2131
1352
|
try {
|
|
2132
1353
|
const backend = getBackend$1();
|
|
2133
1354
|
const commands = this._commandCommands || [];
|
|
2134
|
-
const
|
|
1355
|
+
const results = await backend.requestCommand(commands);
|
|
2135
1356
|
state.completeRecording();
|
|
2136
1357
|
if (this._commandResolve) {
|
|
2137
|
-
this._commandResolve(
|
|
1358
|
+
this._commandResolve(results);
|
|
2138
1359
|
this._commandResolve = void 0;
|
|
2139
1360
|
this._commandReject = void 0;
|
|
2140
1361
|
}
|
|
2141
|
-
return
|
|
1362
|
+
return results;
|
|
2142
1363
|
} catch (error) {
|
|
2143
1364
|
const err = error instanceof Error ? error : new Error("Command request failed");
|
|
2144
1365
|
state.setError(err.message);
|
|
@@ -2240,15 +1461,10 @@ const websocketBackend = {
|
|
|
2240
1461
|
requestCommand: (commands) => websocket.requestCommand(commands),
|
|
2241
1462
|
disconnect: () => websocket.disconnect(),
|
|
2242
1463
|
isConnected: () => websocket.isConnected(),
|
|
2243
|
-
getLastInputText: () => websocket.getLastInputText()
|
|
2244
|
-
prefetchToken: () => Promise.resolve({}),
|
|
2245
|
-
startAutoRefresh: () => {},
|
|
2246
|
-
stopAutoRefresh: () => {},
|
|
2247
|
-
invalidateTokenCache: () => {}
|
|
1464
|
+
getLastInputText: () => websocket.getLastInputText()
|
|
2248
1465
|
};
|
|
2249
1466
|
/**
|
|
2250
1467
|
* Get the active voice backend.
|
|
2251
|
-
* Always returns WebSocket backend (LiveKit is legacy).
|
|
2252
1468
|
*
|
|
2253
1469
|
* @returns The websocket backend
|
|
2254
1470
|
*/
|
|
@@ -2256,10 +1472,197 @@ function getBackend() {
|
|
|
2256
1472
|
return websocketBackend;
|
|
2257
1473
|
}
|
|
2258
1474
|
|
|
1475
|
+
//#endregion
|
|
1476
|
+
//#region src/tts.ts
|
|
1477
|
+
/**
|
|
1478
|
+
* Default TTS voice ID (matches server default).
|
|
1479
|
+
* The server validates voice IDs - pass any valid voice ID or omit to use default.
|
|
1480
|
+
*/
|
|
1481
|
+
const DEFAULT_TTS_VOICE_ID = "21m00Tcm4TlvDq8ikWAM";
|
|
1482
|
+
/**
|
|
1483
|
+
* Map HTTP status codes to TTS error codes
|
|
1484
|
+
*/
|
|
1485
|
+
function mapHttpStatusToErrorCode(status) {
|
|
1486
|
+
switch (status) {
|
|
1487
|
+
case 400: return "invalid_request";
|
|
1488
|
+
case 402: return "usage_limit_exceeded";
|
|
1489
|
+
case 403: return "authentication_failed";
|
|
1490
|
+
default: return "unknown_error";
|
|
1491
|
+
}
|
|
1492
|
+
}
|
|
1493
|
+
/**
|
|
1494
|
+
* TTS Client for synthesizing speech from text
|
|
1495
|
+
*/
|
|
1496
|
+
var TTSClient = class {
|
|
1497
|
+
/**
|
|
1498
|
+
* Synthesize text to speech and return audio bytes
|
|
1499
|
+
*
|
|
1500
|
+
* @param text - Text to synthesize (max 1000 chars)
|
|
1501
|
+
* @param options - Optional synthesis options
|
|
1502
|
+
* @returns Audio data and content type
|
|
1503
|
+
*
|
|
1504
|
+
* @example
|
|
1505
|
+
* ```typescript
|
|
1506
|
+
* const result = await tts.synthesize('Hello world');
|
|
1507
|
+
* console.log(result.audio); // ArrayBuffer
|
|
1508
|
+
* console.log(result.contentType); // 'audio/mpeg'
|
|
1509
|
+
* ```
|
|
1510
|
+
*/
|
|
1511
|
+
async synthesize(text, options) {
|
|
1512
|
+
const config = getConfig();
|
|
1513
|
+
if (!config.apiKey) {
|
|
1514
|
+
const error = {
|
|
1515
|
+
code: "authentication_failed",
|
|
1516
|
+
message: "API key not configured. Call SpeechOS.init() first.",
|
|
1517
|
+
phase: "synthesize"
|
|
1518
|
+
};
|
|
1519
|
+
events.emit("tts:error", error);
|
|
1520
|
+
throw new Error(error.message);
|
|
1521
|
+
}
|
|
1522
|
+
events.emit("tts:synthesize:start", { text });
|
|
1523
|
+
try {
|
|
1524
|
+
const response = await fetch(`${config.host}/api/tts/`, {
|
|
1525
|
+
method: "POST",
|
|
1526
|
+
headers: {
|
|
1527
|
+
"Authorization": `Api-Key ${config.apiKey}`,
|
|
1528
|
+
"Content-Type": "application/json"
|
|
1529
|
+
},
|
|
1530
|
+
signal: options?.signal,
|
|
1531
|
+
body: JSON.stringify({
|
|
1532
|
+
text,
|
|
1533
|
+
voice_id: options?.voiceId,
|
|
1534
|
+
language: options?.language ?? "en",
|
|
1535
|
+
user_id: config.userId || void 0
|
|
1536
|
+
})
|
|
1537
|
+
});
|
|
1538
|
+
if (!response.ok) {
|
|
1539
|
+
const errorCode = mapHttpStatusToErrorCode(response.status);
|
|
1540
|
+
let errorMessage;
|
|
1541
|
+
try {
|
|
1542
|
+
const errorData = await response.json();
|
|
1543
|
+
errorMessage = errorData.detail || errorData.message || `HTTP ${response.status}`;
|
|
1544
|
+
} catch {
|
|
1545
|
+
errorMessage = `HTTP ${response.status}: ${response.statusText}`;
|
|
1546
|
+
}
|
|
1547
|
+
const error = {
|
|
1548
|
+
code: errorCode,
|
|
1549
|
+
message: errorMessage,
|
|
1550
|
+
phase: "synthesize"
|
|
1551
|
+
};
|
|
1552
|
+
events.emit("tts:error", error);
|
|
1553
|
+
throw new Error(errorMessage);
|
|
1554
|
+
}
|
|
1555
|
+
const contentType = response.headers.get("Content-Type") || "audio/mpeg";
|
|
1556
|
+
const arrayBuffer = await response.arrayBuffer();
|
|
1557
|
+
events.emit("tts:synthesize:complete", { text });
|
|
1558
|
+
return {
|
|
1559
|
+
audio: arrayBuffer,
|
|
1560
|
+
contentType
|
|
1561
|
+
};
|
|
1562
|
+
} catch (error) {
|
|
1563
|
+
if (error instanceof Error && error.name === "AbortError") throw error;
|
|
1564
|
+
if (error instanceof Error && error.message.includes("HTTP")) throw error;
|
|
1565
|
+
const networkError = {
|
|
1566
|
+
code: "network_error",
|
|
1567
|
+
message: error instanceof Error ? error.message : "Network request failed",
|
|
1568
|
+
phase: "synthesize"
|
|
1569
|
+
};
|
|
1570
|
+
events.emit("tts:error", networkError);
|
|
1571
|
+
throw new Error(networkError.message);
|
|
1572
|
+
}
|
|
1573
|
+
}
|
|
1574
|
+
/**
|
|
1575
|
+
* Stream TTS audio chunks as they arrive from the server
|
|
1576
|
+
*
|
|
1577
|
+
* Useful for progressive playback or processing large texts.
|
|
1578
|
+
*
|
|
1579
|
+
* @param text - Text to synthesize (max 1000 chars)
|
|
1580
|
+
* @param options - Optional synthesis options
|
|
1581
|
+
* @yields Audio chunks as Uint8Array
|
|
1582
|
+
*
|
|
1583
|
+
* @example
|
|
1584
|
+
* ```typescript
|
|
1585
|
+
* const chunks: Uint8Array[] = [];
|
|
1586
|
+
* for await (const chunk of tts.stream('Hello world')) {
|
|
1587
|
+
* chunks.push(chunk);
|
|
1588
|
+
* }
|
|
1589
|
+
* ```
|
|
1590
|
+
*/
|
|
1591
|
+
async *stream(text, options) {
|
|
1592
|
+
const config = getConfig();
|
|
1593
|
+
if (!config.apiKey) {
|
|
1594
|
+
const error = {
|
|
1595
|
+
code: "authentication_failed",
|
|
1596
|
+
message: "API key not configured. Call SpeechOS.init() first.",
|
|
1597
|
+
phase: "synthesize"
|
|
1598
|
+
};
|
|
1599
|
+
events.emit("tts:error", error);
|
|
1600
|
+
throw new Error(error.message);
|
|
1601
|
+
}
|
|
1602
|
+
events.emit("tts:synthesize:start", { text });
|
|
1603
|
+
try {
|
|
1604
|
+
const response = await fetch(`${config.host}/api/tts/`, {
|
|
1605
|
+
method: "POST",
|
|
1606
|
+
headers: {
|
|
1607
|
+
"Authorization": `Api-Key ${config.apiKey}`,
|
|
1608
|
+
"Content-Type": "application/json"
|
|
1609
|
+
},
|
|
1610
|
+
signal: options?.signal,
|
|
1611
|
+
body: JSON.stringify({
|
|
1612
|
+
text,
|
|
1613
|
+
voice_id: options?.voiceId,
|
|
1614
|
+
language: options?.language ?? "en",
|
|
1615
|
+
user_id: config.userId || void 0
|
|
1616
|
+
})
|
|
1617
|
+
});
|
|
1618
|
+
if (!response.ok) {
|
|
1619
|
+
const errorCode = mapHttpStatusToErrorCode(response.status);
|
|
1620
|
+
let errorMessage;
|
|
1621
|
+
try {
|
|
1622
|
+
const errorData = await response.json();
|
|
1623
|
+
errorMessage = errorData.detail || errorData.message || `HTTP ${response.status}`;
|
|
1624
|
+
} catch {
|
|
1625
|
+
errorMessage = `HTTP ${response.status}: ${response.statusText}`;
|
|
1626
|
+
}
|
|
1627
|
+
const error = {
|
|
1628
|
+
code: errorCode,
|
|
1629
|
+
message: errorMessage,
|
|
1630
|
+
phase: "synthesize"
|
|
1631
|
+
};
|
|
1632
|
+
events.emit("tts:error", error);
|
|
1633
|
+
throw new Error(errorMessage);
|
|
1634
|
+
}
|
|
1635
|
+
const reader = response.body?.getReader();
|
|
1636
|
+
if (!reader) throw new Error("Response body is not readable");
|
|
1637
|
+
try {
|
|
1638
|
+
while (true) {
|
|
1639
|
+
const { done, value } = await reader.read();
|
|
1640
|
+
if (done) break;
|
|
1641
|
+
yield value;
|
|
1642
|
+
}
|
|
1643
|
+
} finally {
|
|
1644
|
+
reader.releaseLock();
|
|
1645
|
+
}
|
|
1646
|
+
events.emit("tts:synthesize:complete", { text });
|
|
1647
|
+
} catch (error) {
|
|
1648
|
+
if (error instanceof Error && error.name === "AbortError") return;
|
|
1649
|
+
if (error instanceof Error && error.message.includes("HTTP")) throw error;
|
|
1650
|
+
const networkError = {
|
|
1651
|
+
code: "network_error",
|
|
1652
|
+
message: error instanceof Error ? error.message : "Network request failed",
|
|
1653
|
+
phase: "synthesize"
|
|
1654
|
+
};
|
|
1655
|
+
events.emit("tts:error", networkError);
|
|
1656
|
+
throw new Error(networkError.message);
|
|
1657
|
+
}
|
|
1658
|
+
}
|
|
1659
|
+
};
|
|
1660
|
+
const tts = new TTSClient();
|
|
1661
|
+
|
|
2259
1662
|
//#endregion
|
|
2260
1663
|
//#region src/index.ts
|
|
2261
1664
|
const VERSION = "0.1.0";
|
|
2262
1665
|
|
|
2263
1666
|
//#endregion
|
|
2264
|
-
export { DEFAULT_HOST, Deferred, SpeechOSEventEmitter, VERSION, clearSettingsToken, createStateManager, events, getBackend, getConfig, getSettingsToken,
|
|
1667
|
+
export { DEFAULT_HOST, DEFAULT_TTS_VOICE_ID, Deferred, SpeechOSEventEmitter, TTSClient, VERSION, clearSettingsToken, createStateManager, events, getBackend, getConfig, getSettingsToken, resetConfig, setConfig, speechOS, state, tts, updateUserId, validateConfig, websocket };
|
|
2265
1668
|
//# sourceMappingURL=index.js.map
|