@runtypelabs/persona 1.46.0 → 1.47.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@runtypelabs/persona",
3
- "version": "1.46.0",
3
+ "version": "1.47.0",
4
4
  "description": "Themeable, pluggable streaming agent widget for websites, in plain JS with support for voice input and reasoning / tool output.",
5
5
  "type": "module",
6
6
  "main": "dist/index.cjs",
@@ -253,13 +253,16 @@ export const buildComposer = (context: ComposerBuildContext): ComposerElements =
253
253
  let micButton: HTMLButtonElement | null = null;
254
254
  let micButtonWrapper: HTMLElement | null = null;
255
255
 
256
- // Check browser support for speech recognition
256
+ // Check browser support for speech recognition or Runtype provider
257
257
  const hasSpeechRecognition =
258
258
  typeof window !== "undefined" &&
259
259
  (typeof (window as any).webkitSpeechRecognition !== "undefined" ||
260
260
  typeof (window as any).SpeechRecognition !== "undefined");
261
+ const hasRuntypeProvider =
262
+ voiceRecognitionConfig.provider?.type === "runtype";
263
+ const hasVoiceInput = hasSpeechRecognition || hasRuntypeProvider;
261
264
 
262
- if (voiceRecognitionEnabled && hasSpeechRecognition) {
265
+ if (voiceRecognitionEnabled && hasVoiceInput) {
263
266
  micButtonWrapper = createElement("div", "tvw-send-button-wrapper");
264
267
  micButton = createElement(
265
268
  "button",
package/src/index.ts CHANGED
@@ -210,4 +210,17 @@ export {
210
210
  } from "./components/feedback";
211
211
  export type { CSATFeedbackOptions, NPSFeedbackOptions } from "./components/feedback";
212
212
 
213
+ // Voice module exports
214
+ export {
215
+ createVoiceProvider,
216
+ createBestAvailableVoiceProvider,
217
+ isVoiceSupported
218
+ } from "./voice";
219
+ export type {
220
+ VoiceProvider,
221
+ VoiceResult,
222
+ VoiceStatus,
223
+ VoiceConfig
224
+ } from "./types";
225
+
213
226
  export default initAgentWidgetFn;
package/src/session.ts CHANGED
@@ -17,6 +17,18 @@ import {
17
17
  generateAssistantMessageId
18
18
  } from "./utils/message-id";
19
19
  import { IMAGE_ONLY_MESSAGE_FALLBACK_TEXT } from "./utils/content";
20
+ import type {
21
+ VoiceProvider,
22
+ VoiceResult,
23
+ VoiceStatus,
24
+ VoiceConfig,
25
+ TextToSpeechConfig
26
+ } from "./types";
27
+ import {
28
+ createVoiceProvider,
29
+ createBestAvailableVoiceProvider,
30
+ isVoiceSupported
31
+ } from "./voice";
20
32
 
21
33
  export type AgentWidgetSessionStatus =
22
34
  | "idle"
@@ -29,6 +41,7 @@ type SessionCallbacks = {
29
41
  onStatusChanged: (status: AgentWidgetSessionStatus) => void;
30
42
  onStreamingChanged: (streaming: boolean) => void;
31
43
  onError?: (error: Error) => void;
44
+ onVoiceStatusChanged?: (status: VoiceStatus) => void;
32
45
  };
33
46
 
34
47
  export class AgentWidgetSession {
@@ -45,6 +58,11 @@ export class AgentWidgetSession {
45
58
  // Agent execution state
46
59
  private agentExecution: AgentExecutionState | null = null;
47
60
 
61
+ // Voice support
62
+ private voiceProvider: VoiceProvider | null = null;
63
+ private voiceActive = false;
64
+ private voiceStatus: VoiceStatus = 'disconnected';
65
+
48
66
  constructor(
49
67
  private config: AgentWidgetConfig = {},
50
68
  private callbacks: SessionCallbacks
@@ -97,6 +115,237 @@ export class AgentWidgetSession {
97
115
  return this.agentExecution?.status === 'running';
98
116
  }
99
117
 
118
+ /**
119
+ * Check if voice is supported
120
+ */
121
+ public isVoiceSupported(): boolean {
122
+ return isVoiceSupported(this.config.voiceRecognition?.provider);
123
+ }
124
+
125
+ /**
126
+ * Check if voice is currently active
127
+ */
128
+ public isVoiceActive(): boolean {
129
+ return this.voiceActive;
130
+ }
131
+
132
+ /**
133
+ * Get current voice status
134
+ */
135
+ public getVoiceStatus(): VoiceStatus {
136
+ return this.voiceStatus;
137
+ }
138
+
139
+ // Pending placeholder IDs for Runtype two-phase voice flow
140
+ private pendingVoiceUserMessageId: string | null = null;
141
+ private pendingVoiceAssistantMessageId: string | null = null;
142
+
143
+ // Track message IDs where the Runtype provider already played TTS audio
144
+ // so browser TTS doesn't double-speak them
145
+ private ttsSpokenMessageIds = new Set<string>();
146
+
147
+ /**
148
+ * Setup voice recognition with the given configuration
149
+ */
150
+ public setupVoice(config?: VoiceConfig) {
151
+ try {
152
+ const voiceConfig = config || this.getVoiceConfigFromConfig();
153
+ if (!voiceConfig) {
154
+ throw new Error('Voice configuration not provided');
155
+ }
156
+
157
+ this.voiceProvider = createVoiceProvider(voiceConfig);
158
+
159
+ // Read configurable text from widget config
160
+ const voiceRecognitionConfig = this.config.voiceRecognition ?? {};
161
+ const processingText = voiceRecognitionConfig.processingText ?? '\u{1F3A4} Processing voice...';
162
+ const processingErrorText = voiceRecognitionConfig.processingErrorText ?? 'Voice processing failed. Please try again.';
163
+
164
+ // Phase A: When recording stops and audio is about to be sent,
165
+ // inject placeholder messages and show typing indicator immediately.
166
+ // Placeholders are tagged with voiceProcessing=true so consumers can
167
+ // detect them in messageTransform and render custom UI.
168
+ if (this.voiceProvider.onProcessingStart) {
169
+ this.voiceProvider.onProcessingStart(() => {
170
+ // Inject user message placeholder
171
+ const userMsg = this.injectMessage({
172
+ role: 'user',
173
+ content: processingText,
174
+ streaming: false,
175
+ voiceProcessing: true
176
+ });
177
+ this.pendingVoiceUserMessageId = userMsg.id;
178
+
179
+ // Inject empty assistant message with streaming=true for typing indicator
180
+ const assistantMsg = this.injectMessage({
181
+ role: 'assistant',
182
+ content: '',
183
+ streaming: true,
184
+ voiceProcessing: true
185
+ });
186
+ this.pendingVoiceAssistantMessageId = assistantMsg.id;
187
+
188
+ // Trigger typing indicator in the UI
189
+ this.setStreaming(true);
190
+ });
191
+ }
192
+
193
+ // Phase B: When server responds with transcript + agent response,
194
+ // upsert the placeholder messages with actual content and clear voiceProcessing flag
195
+ this.voiceProvider.onResult((result) => {
196
+ if (result.provider === 'browser') {
197
+ // Browser STT: send transcript as a user message (agent runs via normal chat)
198
+ if (result.text && result.text.trim()) {
199
+ this.sendMessage(result.text, { viaVoice: true });
200
+ }
201
+ } else if (result.provider === 'runtype') {
202
+ // Runtype provider: agent already executed server-side, audio playback
203
+ // is handled by the provider itself. Update placeholders with actual content.
204
+ if (this.pendingVoiceUserMessageId && result.transcript?.trim()) {
205
+ this.upsertMessage({
206
+ id: this.pendingVoiceUserMessageId,
207
+ role: 'user',
208
+ content: result.transcript.trim(),
209
+ createdAt: new Date().toISOString(),
210
+ streaming: false,
211
+ voiceProcessing: false
212
+ });
213
+ } else if (result.transcript?.trim()) {
214
+ this.injectUserMessage({ content: result.transcript.trim() });
215
+ }
216
+
217
+ if (this.pendingVoiceAssistantMessageId && result.text?.trim()) {
218
+ this.upsertMessage({
219
+ id: this.pendingVoiceAssistantMessageId,
220
+ role: 'assistant',
221
+ content: result.text.trim(),
222
+ createdAt: new Date().toISOString(),
223
+ streaming: false,
224
+ voiceProcessing: false
225
+ });
226
+ } else if (result.text?.trim()) {
227
+ this.injectAssistantMessage({ content: result.text.trim() });
228
+ }
229
+
230
+ // If Runtype provider returned audio (server-side TTS), mark the
231
+ // assistant message as already spoken so browser TTS doesn't double-speak
232
+ if (result.audio?.base64) {
233
+ const spokenId = this.pendingVoiceAssistantMessageId
234
+ ?? [...this.messages].reverse().find(m => m.role === 'assistant')?.id;
235
+ if (spokenId) this.ttsSpokenMessageIds.add(spokenId);
236
+ }
237
+
238
+ // Clear streaming state and pending IDs
239
+ this.setStreaming(false);
240
+ this.pendingVoiceUserMessageId = null;
241
+ this.pendingVoiceAssistantMessageId = null;
242
+ }
243
+ });
244
+
245
+ this.voiceProvider.onError((error) => {
246
+ console.error('Voice error:', error);
247
+
248
+ // If error occurs while placeholders are pending, update assistant with error text
249
+ if (this.pendingVoiceAssistantMessageId) {
250
+ this.upsertMessage({
251
+ id: this.pendingVoiceAssistantMessageId,
252
+ role: 'assistant',
253
+ content: processingErrorText,
254
+ createdAt: new Date().toISOString(),
255
+ streaming: false,
256
+ voiceProcessing: false
257
+ });
258
+ this.setStreaming(false);
259
+ this.pendingVoiceUserMessageId = null;
260
+ this.pendingVoiceAssistantMessageId = null;
261
+ }
262
+ });
263
+
264
+ this.voiceProvider.onStatusChange((status) => {
265
+ this.voiceStatus = status;
266
+ this.voiceActive = status === 'listening';
267
+ this.callbacks.onVoiceStatusChanged?.(status);
268
+ });
269
+
270
+ this.voiceProvider.connect();
271
+
272
+ } catch (error) {
273
+ console.error('Failed to setup voice:', error);
274
+ }
275
+ }
276
+
277
+ /**
278
+ * Toggle voice recognition on/off
279
+ */
280
+ public async toggleVoice() {
281
+ if (!this.voiceProvider) {
282
+ console.error('Voice not configured');
283
+ return;
284
+ }
285
+
286
+ if (this.voiceActive) {
287
+ await this.voiceProvider.stopListening();
288
+ } else {
289
+ // Stop any in-progress TTS so the mic doesn't pick it up
290
+ this.stopSpeaking();
291
+ try {
292
+ await this.voiceProvider.startListening();
293
+ } catch (error) {
294
+ console.error('Failed to start voice:', error);
295
+ }
296
+ }
297
+ }
298
+
299
+ /**
300
+ * Cleanup voice resources
301
+ */
302
+ public cleanupVoice() {
303
+ if (this.voiceProvider) {
304
+ this.voiceProvider.disconnect();
305
+ this.voiceProvider = null;
306
+ }
307
+ this.voiceActive = false;
308
+ this.voiceStatus = 'disconnected';
309
+ }
310
+
311
+ /**
312
+ * Extract voice configuration from widget config
313
+ */
314
+ private getVoiceConfigFromConfig(): VoiceConfig | undefined {
315
+ if (!this.config.voiceRecognition?.provider) {
316
+ return undefined;
317
+ }
318
+
319
+ const providerConfig = this.config.voiceRecognition.provider;
320
+
321
+ switch (providerConfig.type) {
322
+ case 'runtype':
323
+ return {
324
+ type: 'runtype',
325
+ runtype: {
326
+ agentId: providerConfig.runtype?.agentId || '',
327
+ clientToken: providerConfig.runtype?.clientToken || '',
328
+ host: providerConfig.runtype?.host,
329
+ voiceId: providerConfig.runtype?.voiceId,
330
+ pauseDuration: providerConfig.runtype?.pauseDuration,
331
+ silenceThreshold: providerConfig.runtype?.silenceThreshold
332
+ }
333
+ };
334
+
335
+ case 'browser':
336
+ return {
337
+ type: 'browser',
338
+ browser: {
339
+ language: providerConfig.browser?.language || 'en-US',
340
+ continuous: providerConfig.browser?.continuous
341
+ }
342
+ };
343
+
344
+ default:
345
+ return undefined;
346
+ }
347
+ }
348
+
100
349
  /**
101
350
  * Initialize the client session (for client token mode).
102
351
  * This is called automatically on first message, but can be called
@@ -267,7 +516,8 @@ export class AgentWidgetSession {
267
516
  id,
268
517
  createdAt,
269
518
  sequence,
270
- streaming = false
519
+ streaming = false,
520
+ voiceProcessing
271
521
  } = options;
272
522
 
273
523
  // Generate appropriate ID based on role
@@ -288,7 +538,8 @@ export class AgentWidgetSession {
288
538
  streaming,
289
539
  // Only include optional fields if provided
290
540
  ...(llmContent !== undefined && { llmContent }),
291
- ...(contentParts !== undefined && { contentParts })
541
+ ...(contentParts !== undefined && { contentParts }),
542
+ ...(voiceProcessing !== undefined && { voiceProcessing })
292
543
  };
293
544
 
294
545
  // Use upsert to handle both new messages and updates (streaming)
@@ -411,6 +662,7 @@ export class AgentWidgetSession {
411
662
  // Allow sending if there's text OR attachments
412
663
  if (!input && (!options?.contentParts || options.contentParts.length === 0)) return;
413
664
 
665
+ this.stopSpeaking();
414
666
  this.abortController?.abort();
415
667
 
416
668
  // Generate IDs for both user message and expected assistant response
@@ -551,6 +803,20 @@ export class AgentWidgetSession {
551
803
  ): Promise<void> {
552
804
  if (this.streaming) return;
553
805
  this.abortController?.abort();
806
+
807
+ // Finalize any stale streaming messages from the previous stream
808
+ // (e.g., tool messages interrupted by approval pause)
809
+ let hasStale = false;
810
+ for (const msg of this.messages) {
811
+ if (msg.streaming) {
812
+ msg.streaming = false;
813
+ hasStale = true;
814
+ }
815
+ }
816
+ if (hasStale) {
817
+ this.callbacks.onMessagesChanged([...this.messages]);
818
+ }
819
+
554
820
  this.setStreaming(true);
555
821
 
556
822
  try {
@@ -668,6 +934,7 @@ export class AgentWidgetSession {
668
934
  }
669
935
 
670
936
  public clearMessages() {
937
+ this.stopSpeaking();
671
938
  this.abortController?.abort();
672
939
  this.abortController = null;
673
940
  this.messages = [];
@@ -742,8 +1009,127 @@ export class AgentWidgetSession {
742
1009
 
743
1010
  private setStreaming(streaming: boolean) {
744
1011
  if (this.streaming === streaming) return;
1012
+ const wasStreaming = this.streaming;
745
1013
  this.streaming = streaming;
746
1014
  this.callbacks.onStreamingChanged(streaming);
1015
+
1016
+ // Speak the latest assistant message when streaming completes
1017
+ if (wasStreaming && !streaming) {
1018
+ this.speakLatestAssistantMessage();
1019
+ }
1020
+ }
1021
+
1022
+ /**
1023
+ * Speak the latest assistant message using the Web Speech API
1024
+ * if text-to-speech is enabled in the config.
1025
+ */
1026
+ private speakLatestAssistantMessage() {
1027
+ const ttsConfig = this.config.textToSpeech;
1028
+ if (!ttsConfig?.enabled) return;
1029
+
1030
+ // Determine if browser TTS should fire:
1031
+ // - provider 'browser' (or unset): always use browser TTS
1032
+ // - provider 'runtype': only if browserFallback is enabled
1033
+ const useBrowserTts =
1034
+ !ttsConfig.provider ||
1035
+ ttsConfig.provider === 'browser' ||
1036
+ (ttsConfig.provider === 'runtype' && ttsConfig.browserFallback);
1037
+ if (!useBrowserTts) return;
1038
+
1039
+ // Find the last assistant message with actual content
1040
+ const lastAssistant = [...this.messages]
1041
+ .reverse()
1042
+ .find(m => m.role === 'assistant' && m.content && !m.voiceProcessing);
1043
+
1044
+ if (!lastAssistant) return;
1045
+
1046
+ // Skip if already spoken by Runtype provider's audio playback
1047
+ if (this.ttsSpokenMessageIds.has(lastAssistant.id)) {
1048
+ this.ttsSpokenMessageIds.delete(lastAssistant.id);
1049
+ return;
1050
+ }
1051
+
1052
+ const text = lastAssistant.content;
1053
+ if (!text.trim()) return;
1054
+
1055
+ this.speak(text, ttsConfig);
1056
+ }
1057
+
1058
+ /**
1059
+ * Speak text using the Web Speech API.
1060
+ * Cancels any in-progress speech before starting.
1061
+ */
1062
+ private speak(text: string, config: TextToSpeechConfig) {
1063
+ if (typeof window === 'undefined' || !('speechSynthesis' in window)) return;
1064
+
1065
+ const synth = window.speechSynthesis;
1066
+ synth.cancel();
1067
+
1068
+ const utterance = new SpeechSynthesisUtterance(text);
1069
+ const voices = synth.getVoices();
1070
+
1071
+ if (config.voice) {
1072
+ const match = voices.find(v => v.name === config.voice);
1073
+ if (match) utterance.voice = match;
1074
+ } else if (voices.length > 0) {
1075
+ // Use custom picker if provided, otherwise auto-detect
1076
+ utterance.voice = config.pickVoice
1077
+ ? config.pickVoice(voices)
1078
+ : AgentWidgetSession.pickBestVoice(voices);
1079
+ }
1080
+
1081
+ if (config.rate !== undefined) utterance.rate = config.rate;
1082
+ if (config.pitch !== undefined) utterance.pitch = config.pitch;
1083
+
1084
+ // Chrome bug: cancel() immediately followed by speak() can ignore
1085
+ // rate/pitch. A short timer (macrotask) delay lets the engine reset properly.
1086
+ setTimeout(() => synth.speak(utterance), 50);
1087
+ }
1088
+
1089
+ /**
1090
+ * Pick the best available English voice from a list of SpeechSynthesisVoices.
1091
+ * Prefers high-quality remote/natural voices, then enhanced local voices,
1092
+ * then standard local voices.
1093
+ */
1094
+ static pickBestVoice(voices: SpeechSynthesisVoice[]): SpeechSynthesisVoice {
1095
+ // Priority list: high-quality voices across browsers/platforms
1096
+ const preferred = [
1097
+ // Edge Online Natural (highest quality)
1098
+ 'Microsoft Jenny Online (Natural) - English (United States)',
1099
+ 'Microsoft Aria Online (Natural) - English (United States)',
1100
+ 'Microsoft Guy Online (Natural) - English (United States)',
1101
+ // Google remote (good quality, cross-platform in Chrome)
1102
+ 'Google US English',
1103
+ 'Google UK English Female',
1104
+ // Apple premium/enhanced (macOS)
1105
+ 'Ava (Premium)',
1106
+ 'Evan (Enhanced)',
1107
+ 'Samantha (Enhanced)',
1108
+ // Apple standard (macOS/iOS)
1109
+ 'Samantha',
1110
+ 'Daniel',
1111
+ 'Karen',
1112
+ // Windows SAPI
1113
+ 'Microsoft David Desktop - English (United States)',
1114
+ 'Microsoft Zira Desktop - English (United States)',
1115
+ ];
1116
+
1117
+ for (const name of preferred) {
1118
+ const match = voices.find(v => v.name === name);
1119
+ if (match) return match;
1120
+ }
1121
+
1122
+ // Fallback: any English voice, then first available
1123
+ return voices.find(v => v.lang.startsWith('en')) ?? voices[0];
1124
+ }
1125
+
1126
+ /**
1127
+ * Stop any in-progress text-to-speech playback.
1128
+ */
1129
+ public stopSpeaking() {
1130
+ if (typeof window !== 'undefined' && 'speechSynthesis' in window) {
1131
+ window.speechSynthesis.cancel();
1132
+ }
747
1133
  }
748
1134
 
749
1135
  private appendMessage(message: AgentWidgetMessage) {