@livekit/agents 1.1.0-dev.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (292) hide show
  1. package/dist/cli.cjs +2 -0
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +2 -0
  5. package/dist/cli.js.map +1 -1
  6. package/dist/constants.cjs +3 -0
  7. package/dist/constants.cjs.map +1 -1
  8. package/dist/constants.d.cts +1 -0
  9. package/dist/constants.d.ts +1 -0
  10. package/dist/constants.d.ts.map +1 -1
  11. package/dist/constants.js +2 -0
  12. package/dist/constants.js.map +1 -1
  13. package/dist/cpu.cjs +189 -0
  14. package/dist/cpu.cjs.map +1 -0
  15. package/dist/cpu.d.cts +24 -0
  16. package/dist/cpu.d.ts +24 -0
  17. package/dist/cpu.d.ts.map +1 -0
  18. package/dist/cpu.js +152 -0
  19. package/dist/cpu.js.map +1 -0
  20. package/dist/cpu.test.cjs +227 -0
  21. package/dist/cpu.test.cjs.map +1 -0
  22. package/dist/cpu.test.js +204 -0
  23. package/dist/cpu.test.js.map +1 -0
  24. package/dist/index.cjs +12 -10
  25. package/dist/index.cjs.map +1 -1
  26. package/dist/index.d.cts +13 -13
  27. package/dist/index.d.ts +13 -13
  28. package/dist/index.d.ts.map +1 -1
  29. package/dist/index.js +11 -10
  30. package/dist/index.js.map +1 -1
  31. package/dist/inference/interruption/defaults.cjs +1 -1
  32. package/dist/inference/interruption/defaults.cjs.map +1 -1
  33. package/dist/inference/interruption/defaults.d.cts +1 -1
  34. package/dist/inference/interruption/defaults.d.ts +1 -1
  35. package/dist/inference/interruption/defaults.d.ts.map +1 -1
  36. package/dist/inference/interruption/defaults.js +1 -1
  37. package/dist/inference/interruption/defaults.js.map +1 -1
  38. package/dist/inference/interruption/http_transport.cjs +44 -28
  39. package/dist/inference/interruption/http_transport.cjs.map +1 -1
  40. package/dist/inference/interruption/http_transport.d.ts.map +1 -1
  41. package/dist/inference/interruption/http_transport.js +45 -29
  42. package/dist/inference/interruption/http_transport.js.map +1 -1
  43. package/dist/inference/interruption/interruption_detector.cjs +22 -5
  44. package/dist/inference/interruption/interruption_detector.cjs.map +1 -1
  45. package/dist/inference/interruption/interruption_detector.d.cts +2 -2
  46. package/dist/inference/interruption/interruption_detector.d.ts +2 -2
  47. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -1
  48. package/dist/inference/interruption/interruption_detector.js +22 -5
  49. package/dist/inference/interruption/interruption_detector.js.map +1 -1
  50. package/dist/inference/interruption/interruption_stream.cjs +4 -4
  51. package/dist/inference/interruption/interruption_stream.cjs.map +1 -1
  52. package/dist/inference/interruption/interruption_stream.js +4 -4
  53. package/dist/inference/interruption/interruption_stream.js.map +1 -1
  54. package/dist/inference/interruption/types.cjs.map +1 -1
  55. package/dist/inference/interruption/types.d.cts +2 -2
  56. package/dist/inference/interruption/types.d.ts +2 -2
  57. package/dist/inference/interruption/types.d.ts.map +1 -1
  58. package/dist/inference/interruption/ws_transport.cjs +60 -47
  59. package/dist/inference/interruption/ws_transport.cjs.map +1 -1
  60. package/dist/inference/interruption/ws_transport.d.ts.map +1 -1
  61. package/dist/inference/interruption/ws_transport.js +60 -47
  62. package/dist/inference/interruption/ws_transport.js.map +1 -1
  63. package/dist/inference/llm.cjs.map +1 -1
  64. package/dist/inference/llm.d.cts +1 -1
  65. package/dist/inference/llm.d.ts +1 -1
  66. package/dist/inference/llm.d.ts.map +1 -1
  67. package/dist/inference/llm.js.map +1 -1
  68. package/dist/inference/stt.cjs +20 -12
  69. package/dist/inference/stt.cjs.map +1 -1
  70. package/dist/inference/stt.d.cts +3 -2
  71. package/dist/inference/stt.d.ts +3 -2
  72. package/dist/inference/stt.d.ts.map +1 -1
  73. package/dist/inference/stt.js +20 -12
  74. package/dist/inference/stt.js.map +1 -1
  75. package/dist/inference/stt.test.cjs +14 -0
  76. package/dist/inference/stt.test.cjs.map +1 -1
  77. package/dist/inference/stt.test.js +14 -0
  78. package/dist/inference/stt.test.js.map +1 -1
  79. package/dist/inference/tts.cjs +13 -4
  80. package/dist/inference/tts.cjs.map +1 -1
  81. package/dist/inference/tts.d.cts +8 -1
  82. package/dist/inference/tts.d.ts +8 -1
  83. package/dist/inference/tts.d.ts.map +1 -1
  84. package/dist/inference/tts.js +13 -4
  85. package/dist/inference/tts.js.map +1 -1
  86. package/dist/inference/tts.test.cjs +10 -0
  87. package/dist/inference/tts.test.cjs.map +1 -1
  88. package/dist/inference/tts.test.js +10 -0
  89. package/dist/inference/tts.test.js.map +1 -1
  90. package/dist/ipc/job_proc_lazy_main.cjs +41 -23
  91. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  92. package/dist/ipc/job_proc_lazy_main.js +41 -23
  93. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  94. package/dist/job.cjs +1 -1
  95. package/dist/job.cjs.map +1 -1
  96. package/dist/job.js +1 -1
  97. package/dist/job.js.map +1 -1
  98. package/dist/language.cjs +394 -0
  99. package/dist/language.cjs.map +1 -0
  100. package/dist/language.d.cts +15 -0
  101. package/dist/language.d.ts +15 -0
  102. package/dist/language.d.ts.map +1 -0
  103. package/dist/language.js +363 -0
  104. package/dist/language.js.map +1 -0
  105. package/dist/language.test.cjs +43 -0
  106. package/dist/language.test.cjs.map +1 -0
  107. package/dist/language.test.js +49 -0
  108. package/dist/language.test.js.map +1 -0
  109. package/dist/llm/index.cjs +2 -0
  110. package/dist/llm/index.cjs.map +1 -1
  111. package/dist/llm/index.d.cts +1 -1
  112. package/dist/llm/index.d.ts +1 -1
  113. package/dist/llm/index.d.ts.map +1 -1
  114. package/dist/llm/index.js +2 -0
  115. package/dist/llm/index.js.map +1 -1
  116. package/dist/stream/deferred_stream.cjs +6 -2
  117. package/dist/stream/deferred_stream.cjs.map +1 -1
  118. package/dist/stream/deferred_stream.d.ts.map +1 -1
  119. package/dist/stream/deferred_stream.js +6 -2
  120. package/dist/stream/deferred_stream.js.map +1 -1
  121. package/dist/stt/stt.cjs.map +1 -1
  122. package/dist/stt/stt.d.cts +2 -1
  123. package/dist/stt/stt.d.ts +2 -1
  124. package/dist/stt/stt.d.ts.map +1 -1
  125. package/dist/stt/stt.js.map +1 -1
  126. package/dist/utils.cjs +15 -0
  127. package/dist/utils.cjs.map +1 -1
  128. package/dist/utils.d.cts +8 -0
  129. package/dist/utils.d.ts +8 -0
  130. package/dist/utils.d.ts.map +1 -1
  131. package/dist/utils.js +13 -0
  132. package/dist/utils.js.map +1 -1
  133. package/dist/version.cjs +1 -1
  134. package/dist/version.js +1 -1
  135. package/dist/voice/agent.cjs +14 -17
  136. package/dist/voice/agent.cjs.map +1 -1
  137. package/dist/voice/agent.d.cts +10 -11
  138. package/dist/voice/agent.d.ts +10 -11
  139. package/dist/voice/agent.d.ts.map +1 -1
  140. package/dist/voice/agent.js +15 -18
  141. package/dist/voice/agent.js.map +1 -1
  142. package/dist/voice/agent.test.cjs +194 -0
  143. package/dist/voice/agent.test.cjs.map +1 -1
  144. package/dist/voice/agent.test.js +195 -1
  145. package/dist/voice/agent.test.js.map +1 -1
  146. package/dist/voice/agent_activity.cjs +116 -39
  147. package/dist/voice/agent_activity.cjs.map +1 -1
  148. package/dist/voice/agent_activity.d.cts +2 -0
  149. package/dist/voice/agent_activity.d.ts +2 -0
  150. package/dist/voice/agent_activity.d.ts.map +1 -1
  151. package/dist/voice/agent_activity.js +117 -40
  152. package/dist/voice/agent_activity.js.map +1 -1
  153. package/dist/voice/agent_activity.test.cjs +135 -0
  154. package/dist/voice/agent_activity.test.cjs.map +1 -0
  155. package/dist/voice/agent_activity.test.js +134 -0
  156. package/dist/voice/agent_activity.test.js.map +1 -0
  157. package/dist/voice/agent_session.cjs +38 -38
  158. package/dist/voice/agent_session.cjs.map +1 -1
  159. package/dist/voice/agent_session.d.cts +65 -56
  160. package/dist/voice/agent_session.d.ts +65 -56
  161. package/dist/voice/agent_session.d.ts.map +1 -1
  162. package/dist/voice/agent_session.js +37 -37
  163. package/dist/voice/agent_session.js.map +1 -1
  164. package/dist/voice/audio_recognition.cjs +106 -52
  165. package/dist/voice/audio_recognition.cjs.map +1 -1
  166. package/dist/voice/audio_recognition.d.cts +4 -2
  167. package/dist/voice/audio_recognition.d.ts +4 -2
  168. package/dist/voice/audio_recognition.d.ts.map +1 -1
  169. package/dist/voice/audio_recognition.js +106 -52
  170. package/dist/voice/audio_recognition.js.map +1 -1
  171. package/dist/voice/audio_recognition_span.test.cjs +84 -22
  172. package/dist/voice/audio_recognition_span.test.cjs.map +1 -1
  173. package/dist/voice/audio_recognition_span.test.js +90 -23
  174. package/dist/voice/audio_recognition_span.test.js.map +1 -1
  175. package/dist/voice/events.cjs +1 -1
  176. package/dist/voice/events.cjs.map +1 -1
  177. package/dist/voice/events.d.cts +4 -3
  178. package/dist/voice/events.d.ts +4 -3
  179. package/dist/voice/events.d.ts.map +1 -1
  180. package/dist/voice/events.js +1 -1
  181. package/dist/voice/events.js.map +1 -1
  182. package/dist/voice/index.cjs +9 -1
  183. package/dist/voice/index.cjs.map +1 -1
  184. package/dist/voice/index.d.cts +1 -1
  185. package/dist/voice/index.d.ts +1 -1
  186. package/dist/voice/index.d.ts.map +1 -1
  187. package/dist/voice/index.js +10 -1
  188. package/dist/voice/index.js.map +1 -1
  189. package/dist/voice/remote_session.cjs +922 -0
  190. package/dist/voice/remote_session.cjs.map +1 -0
  191. package/dist/voice/remote_session.d.cts +108 -0
  192. package/dist/voice/remote_session.d.ts +108 -0
  193. package/dist/voice/remote_session.d.ts.map +1 -0
  194. package/dist/voice/remote_session.js +887 -0
  195. package/dist/voice/remote_session.js.map +1 -0
  196. package/dist/voice/report.cjs +11 -10
  197. package/dist/voice/report.cjs.map +1 -1
  198. package/dist/voice/report.d.cts +5 -3
  199. package/dist/voice/report.d.ts +5 -3
  200. package/dist/voice/report.d.ts.map +1 -1
  201. package/dist/voice/report.js +11 -10
  202. package/dist/voice/report.js.map +1 -1
  203. package/dist/voice/report.test.cjs +15 -0
  204. package/dist/voice/report.test.cjs.map +1 -1
  205. package/dist/voice/report.test.js +15 -0
  206. package/dist/voice/report.test.js.map +1 -1
  207. package/dist/voice/room_io/room_io.cjs +39 -0
  208. package/dist/voice/room_io/room_io.cjs.map +1 -1
  209. package/dist/voice/room_io/room_io.d.cts +3 -1
  210. package/dist/voice/room_io/room_io.d.ts +3 -1
  211. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  212. package/dist/voice/room_io/room_io.js +40 -1
  213. package/dist/voice/room_io/room_io.js.map +1 -1
  214. package/dist/voice/turn_config/interruption.cjs.map +1 -1
  215. package/dist/voice/turn_config/interruption.d.cts +1 -1
  216. package/dist/voice/turn_config/interruption.d.ts +1 -1
  217. package/dist/voice/turn_config/interruption.d.ts.map +1 -1
  218. package/dist/voice/turn_config/interruption.js.map +1 -1
  219. package/dist/voice/turn_config/utils.cjs +95 -35
  220. package/dist/voice/turn_config/utils.cjs.map +1 -1
  221. package/dist/voice/turn_config/utils.d.cts +17 -5
  222. package/dist/voice/turn_config/utils.d.ts +17 -5
  223. package/dist/voice/turn_config/utils.d.ts.map +1 -1
  224. package/dist/voice/turn_config/utils.js +93 -35
  225. package/dist/voice/turn_config/utils.js.map +1 -1
  226. package/dist/voice/turn_config/utils.test.cjs +83 -41
  227. package/dist/voice/turn_config/utils.test.cjs.map +1 -1
  228. package/dist/voice/turn_config/utils.test.js +84 -42
  229. package/dist/voice/turn_config/utils.test.js.map +1 -1
  230. package/dist/worker.cjs +6 -29
  231. package/dist/worker.cjs.map +1 -1
  232. package/dist/worker.d.ts.map +1 -1
  233. package/dist/worker.js +6 -19
  234. package/dist/worker.js.map +1 -1
  235. package/package.json +3 -2
  236. package/src/cli.ts +2 -0
  237. package/src/constants.ts +1 -0
  238. package/src/cpu.test.ts +239 -0
  239. package/src/cpu.ts +173 -0
  240. package/src/index.ts +13 -15
  241. package/src/inference/interruption/defaults.ts +1 -1
  242. package/src/inference/interruption/http_transport.ts +49 -30
  243. package/src/inference/interruption/interruption_detector.ts +22 -6
  244. package/src/inference/interruption/interruption_stream.ts +4 -4
  245. package/src/inference/interruption/types.ts +2 -2
  246. package/src/inference/interruption/ws_transport.ts +63 -59
  247. package/src/inference/llm.ts +3 -1
  248. package/src/inference/stt.test.ts +17 -0
  249. package/src/inference/stt.ts +22 -14
  250. package/src/inference/tts.test.ts +12 -0
  251. package/src/inference/tts.ts +22 -6
  252. package/src/ipc/job_proc_lazy_main.ts +44 -24
  253. package/src/job.ts +1 -1
  254. package/src/language.test.ts +62 -0
  255. package/src/language.ts +380 -0
  256. package/src/llm/index.ts +2 -0
  257. package/src/stream/deferred_stream.ts +5 -1
  258. package/src/stt/stt.ts +2 -1
  259. package/src/utils.ts +20 -0
  260. package/src/voice/agent.test.ts +208 -1
  261. package/src/voice/agent.ts +21 -22
  262. package/src/voice/agent_activity.test.ts +194 -0
  263. package/src/voice/agent_activity.ts +161 -43
  264. package/src/voice/agent_session.ts +103 -92
  265. package/src/voice/audio_recognition.ts +124 -61
  266. package/src/voice/audio_recognition_span.test.ts +115 -35
  267. package/src/voice/events.ts +4 -3
  268. package/src/voice/index.ts +10 -1
  269. package/src/voice/remote_session.ts +1083 -0
  270. package/src/voice/report.test.ts +22 -3
  271. package/src/voice/report.ts +31 -14
  272. package/src/voice/room_io/room_io.ts +52 -2
  273. package/src/voice/turn_config/interruption.ts +1 -1
  274. package/src/voice/turn_config/utils.test.ts +91 -43
  275. package/src/voice/turn_config/utils.ts +120 -56
  276. package/src/worker.ts +34 -50
  277. package/dist/voice/client_events.cjs +0 -554
  278. package/dist/voice/client_events.cjs.map +0 -1
  279. package/dist/voice/client_events.d.cts +0 -195
  280. package/dist/voice/client_events.d.ts +0 -195
  281. package/dist/voice/client_events.d.ts.map +0 -1
  282. package/dist/voice/client_events.js +0 -548
  283. package/dist/voice/client_events.js.map +0 -1
  284. package/dist/voice/wire_format.cjs +0 -798
  285. package/dist/voice/wire_format.cjs.map +0 -1
  286. package/dist/voice/wire_format.d.cts +0 -5503
  287. package/dist/voice/wire_format.d.ts +0 -5503
  288. package/dist/voice/wire_format.d.ts.map +0 -1
  289. package/dist/voice/wire_format.js +0 -728
  290. package/dist/voice/wire_format.js.map +0 -1
  291. package/src/voice/client_events.ts +0 -838
  292. package/src/voice/wire_format.ts +0 -827
@@ -0,0 +1,1083 @@
1
+ // SPDX-FileCopyrightText: 2026 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { Timestamp } from '@bufbuild/protobuf';
5
+ import { AgentSession as pb } from '@livekit/protocol';
6
+ import type { ByteStreamReader, Room, TextStreamInfo } from '@livekit/rtc-node';
7
+ import type { TypedEventEmitter } from '@livekit/typed-emitter';
8
+ import EventEmitter from 'events';
9
+ import { TOPIC_SESSION_MESSAGES } from '../constants.js';
10
+ import type { OverlappingSpeechEvent } from '../inference/interruption/types.js';
11
+ import type {
12
+ ChatItem,
13
+ FunctionCall as FCItem,
14
+ FunctionCallOutput as FCOItem,
15
+ } from '../llm/chat_context.js';
16
+ import type { ToolContext } from '../llm/tool_context.js';
17
+ import { log } from '../log.js';
18
+ import type {
19
+ InterruptionModelUsage,
20
+ LLMModelUsage,
21
+ STTModelUsage,
22
+ TTSModelUsage,
23
+ } from '../metrics/model_usage.js';
24
+ import { Future, Task, shortuuid } from '../utils.js';
25
+ import type { AgentSession, AgentSessionUsage } from './agent_session.js';
26
+ import {
27
+ AgentSessionEventTypes,
28
+ type AgentState,
29
+ type AgentStateChangedEvent,
30
+ type ConversationItemAddedEvent,
31
+ type ErrorEvent,
32
+ type FunctionToolsExecutedEvent,
33
+ type MetricsCollectedEvent,
34
+ type UserInputTranscribedEvent,
35
+ type UserState,
36
+ type UserStateChangedEvent,
37
+ } from './events.js';
38
+ import type { RoomIO } from './room_io/room_io.js';
39
+
40
+ // ===========================================================================
41
+ // Shared types (TextInput, Client event types, wire format aliases)
42
+ // ===========================================================================
43
+
44
/**
 * Event object passed as the second argument of a {@link TextInputCallback}.
 */
export interface TextInputEvent {
  /** The text carried by the event. */
  text: string;
  /** Optional `TextStreamInfo` accompanying the text. */
  info?: TextStreamInfo;
  /** Optional participant identity associated with the text. */
  participantIdentity?: string;
}
49
+
50
/**
 * Callback invoked with an {@link AgentSession} and a {@link TextInputEvent}.
 * It may complete synchronously or return a promise.
 */
export type TextInputCallback = (session: AgentSession, ev: TextInputEvent) => void | Promise<void>;
51
+
52
/**
 * Union of the event names that have a listener signature in
 * {@link RemoteSessionCallbacks}:
 * `'agent_state_changed' | 'user_state_changed' | 'conversation_item_added' |
 * 'user_input_transcribed' | 'function_tools_executed' | 'overlapping_speech' |
 * 'session_usage' | 'error'`.
 *
 * Derived with `keyof` so the name union and the listener map cannot drift apart.
 *
 * @experimental
 */
export type RemoteSessionEventTypes = keyof RemoteSessionCallbacks;
62
+
63
/**
 * Listener signatures keyed by event name. Every listener receives the protobuf
 * payload type for its event and returns nothing.
 *
 * @experimental
 */
export type RemoteSessionCallbacks = {
  agent_state_changed: (ev: pb.AgentSessionEvent_AgentStateChanged) => void;
  user_state_changed: (ev: pb.AgentSessionEvent_UserStateChanged) => void;
  conversation_item_added: (ev: pb.AgentSessionEvent_ConversationItemAdded) => void;
  user_input_transcribed: (ev: pb.AgentSessionEvent_UserInputTranscribed) => void;
  function_tools_executed: (ev: pb.AgentSessionEvent_FunctionToolsExecuted) => void;
  overlapping_speech: (ev: pb.AgentSessionEvent_OverlappingSpeech) => void;
  session_usage: (ev: pb.AgentSessionEvent_SessionUsageUpdated) => void;
  error: (ev: pb.AgentSessionEvent_Error) => void;
};
74
+
75
+ // ===========================================================================
76
+ // SessionTransport
77
+ // ===========================================================================
78
+
79
/**
 * Abstract channel for exchanging `pb.AgentSessionMessage` values.
 *
 * Subclasses provide sending, closing and an async iterator of messages.
 */
export abstract class SessionTransport {
  /** Startup hook; the base implementation does nothing. */
  async start(): Promise<void> {}
  /** Sends a single message through the transport. */
  abstract sendMessage(msg: pb.AgentSessionMessage): Promise<void>;
  /** Closes the transport. */
  abstract close(): Promise<void>;
  /** Returns an async iterator of `pb.AgentSessionMessage` values. */
  abstract [Symbol.asyncIterator](): AsyncIterator<pb.AgentSessionMessage>;
}
85
+
86
/**
 * {@link SessionTransport} that exchanges messages over room byte streams.
 *
 * Outgoing messages are serialized with `toBinary()` and written as one byte stream on
 * `TOPIC_SESSION_MESSAGES`. Incoming streams on that topic are read completely, decoded
 * with `pb.AgentSessionMessage.fromBinary()` and handed to the async iterator.
 *
 * The iterator parks at most one waiter at a time, so it is meant for a single consumer
 * that calls `next()` sequentially (as `for await` does).
 */
export class RoomSessionTransport extends SessionTransport {
  private readonly room: Room;
  private readonly roomIO: RoomIO;
  private handlerRegistered = false;
  private closed = false;
  /** Messages that arrived while no `next()` call was waiting. */
  private pendingMessages: pb.AgentSessionMessage[] = [];
  /** Resolver of the `next()` call currently waiting for a message, if any. */
  private waitingResolve: ((value: IteratorResult<pb.AgentSessionMessage>) => void) | null = null;

  constructor(room: Room, roomIO: RoomIO) {
    super();
    this.room = room;
    this.roomIO = roomIO;
  }

  /** Builds the end-of-iteration result shared by `close()`, `next()` and `return()`. */
  private static doneResult(): IteratorResult<pb.AgentSessionMessage> {
    return { value: undefined, done: true };
  }

  /** Identity of the participant currently linked in `roomIO`, if there is one. */
  private getRemoteIdentity() {
    return this.roomIO.linkedParticipant?.identity;
  }

  /** Registers the byte stream handler for `TOPIC_SESSION_MESSAGES`; repeated calls are no-ops. */
  override async start(): Promise<void> {
    if (this.handlerRegistered) return;
    this.room.registerByteStreamHandler(TOPIC_SESSION_MESSAGES, this.onByteStream);
    this.handlerRegistered = true;
  }

  /**
   * Byte stream handler. When a linked participant identity is known, streams from any
   * other identity are ignored.
   */
  private onByteStream = (reader: ByteStreamReader, participantInfo: { identity: string }) => {
    const remoteIdentity = this.getRemoteIdentity();
    if (remoteIdentity && participantInfo.identity !== remoteIdentity) {
      return;
    }
    this.readStream(reader).catch((e) => {
      log().warn({ error: e }, 'failed to read binary stream message');
    });
  };

  /**
   * Reads every chunk of the stream, concatenates them, decodes the message and enqueues it.
   * Failures are logged as warnings unless the transport has already been closed.
   */
  private async readStream(reader: ByteStreamReader): Promise<void> {
    try {
      const chunks = await reader.readAll();
      let totalLength = 0;
      for (const chunk of chunks) {
        totalLength += chunk.length;
      }
      const data = new Uint8Array(totalLength);
      let offset = 0;
      for (const chunk of chunks) {
        data.set(chunk, offset);
        offset += chunk.length;
      }
      const msg = pb.AgentSessionMessage.fromBinary(data);
      this.enqueue(msg);
    } catch (e) {
      if (!this.closed) {
        log().warn({ error: e }, 'failed to parse binary stream message');
      }
    }
  }

  /**
   * Serializes `msg` and writes it as a single byte stream, addressed to the linked
   * participant when one is known. Does nothing when the transport is closed or the room
   * is not connected. Send failures are logged and never thrown.
   */
  override async sendMessage(msg: pb.AgentSessionMessage): Promise<void> {
    if (this.closed || !this.room.isConnected) return;

    try {
      const localParticipant = this.room.localParticipant;
      if (!localParticipant) {
        // Reported through the catch below, like any other send failure.
        throw new Error('local participant is not available');
      }

      const data = msg.toBinary();
      const opts: Record<string, unknown> = {
        topic: TOPIC_SESSION_MESSAGES,
        name: shortuuid('AS_'),
      };
      const remoteIdentity = this.getRemoteIdentity();
      if (remoteIdentity) {
        opts.destinationIdentities = [remoteIdentity];
      }
      const writer = await localParticipant.streamBytes(opts);
      await writer.write(new Uint8Array(data));
      await writer.close();
    } catch (e) {
      log().warn({ error: e }, 'failed to send binary stream message');
    }
  }

  /**
   * Marks the transport closed, unregisters the byte stream handler and completes a
   * waiting `next()` call, if any. Repeated calls are no-ops.
   */
  override async close(): Promise<void> {
    if (this.closed) return;
    this.closed = true;

    if (this.handlerRegistered) {
      try {
        this.room.unregisterByteStreamHandler(TOPIC_SESSION_MESSAGES);
      } catch (e) {
        log().debug({ error: e }, 'byte stream handler already unregistered');
      }
      this.handlerRegistered = false;
    }

    const resolve = this.waitingResolve;
    if (resolve) {
      this.waitingResolve = null;
      resolve(RoomSessionTransport.doneResult());
    }
  }

  /** Hands `msg` to the waiting consumer, or buffers it; dropped once the transport is closed. */
  private enqueue(msg: pb.AgentSessionMessage): void {
    if (this.closed) return;

    if (this.waitingResolve) {
      const resolve = this.waitingResolve;
      this.waitingResolve = null;
      resolve({ value: msg, done: false });
    } else {
      this.pendingMessages.push(msg);
    }
  }

  /**
   * Async iterator over received messages. Buffered messages are still delivered after
   * `close()`; the iterator reports `done` once the transport is closed and the buffer is
   * empty. Calling `return()` closes the transport.
   */
  override [Symbol.asyncIterator](): AsyncIterator<pb.AgentSessionMessage> {
    return {
      next: (): Promise<IteratorResult<pb.AgentSessionMessage>> => {
        if (this.closed && this.pendingMessages.length === 0) {
          return Promise.resolve(RoomSessionTransport.doneResult());
        }

        const pending = this.pendingMessages.shift();
        if (pending) {
          return Promise.resolve({ value: pending, done: false });
        }

        return new Promise<IteratorResult<pb.AgentSessionMessage>>((resolve) => {
          this.waitingResolve = resolve;
        });
      },
      return: async (): Promise<IteratorResult<pb.AgentSessionMessage>> => {
        // Await the close so its promise is not left floating.
        await this.close();
        return RoomSessionTransport.doneResult();
      },
    };
  }
}
225
+
226
+ // ===========================================================================
227
+ // Enum maps
228
+ // ===========================================================================
229
/** Maps each `AgentState` string to its `pb.AgentState` enum member. */
const AGENT_STATE_MAP: Record<AgentState, pb.AgentState> = {
  initializing: pb.AgentState.AS_INITIALIZING,
  idle: pb.AgentState.AS_IDLE,
  listening: pb.AgentState.AS_LISTENING,
  thinking: pb.AgentState.AS_THINKING,
  speaking: pb.AgentState.AS_SPEAKING,
};
236
+
237
/** Maps each `UserState` string to its `pb.UserState` enum member. */
const USER_STATE_MAP: Record<UserState, pb.UserState> = {
  speaking: pb.UserState.US_SPEAKING,
  listening: pb.UserState.US_LISTENING,
  away: pb.UserState.US_AWAY,
};
242
+
243
+ // ===========================================================================
244
+ // Chat item / timestamp conversion helpers
245
+ // ===========================================================================
246
/**
 * Converts a millisecond value, interpreted the way `new Date(ms)` interprets it,
 * into a protobuf `Timestamp`.
 *
 * @param ms - Milliseconds passed to the `Date` constructor.
 * @returns The `Timestamp` built from that date.
 */
function msToTimestamp(ms: number): Timestamp {
  const date = new Date(ms);
  return Timestamp.fromDate(date);
}
249
+
250
/**
 * Builds a protobuf `Timestamp` for the current wall-clock time.
 *
 * @returns A `Timestamp` created from `new Date()`.
 */
function nowTimestamp(): Timestamp {
  const current = new Date();
  return Timestamp.fromDate(current);
}
253
+
254
/** Narrows `ChatItem` to the variant whose `type` discriminant equals `K`. */
type ChatItemOfType<K extends ChatItem['type']> = Extract<ChatItem, { type: K }>;

/** Converts a `message` chat item into its protobuf wrapper. */
function chatMessageToProto(message: ChatItemOfType<'message'>): pb.ChatContext_ChatItem {
  const roleMap: Record<string, pb.ChatRole> = {
    developer: pb.ChatRole.DEVELOPER,
    system: pb.ChatRole.SYSTEM,
    user: pb.ChatRole.USER,
    assistant: pb.ChatRole.ASSISTANT,
  };

  // Only plain-string content parts are forwarded; every other part is skipped.
  const content: pb.ChatMessage_ChatContent[] = [];
  for (const part of message.content) {
    if (typeof part !== 'string') continue;
    content.push(new pb.ChatMessage_ChatContent({ payload: { case: 'text', value: part } }));
  }

  // Copy only the metrics that are actually set.
  const { metrics } = message;
  const metricsReport = new pb.MetricsReport();
  if (metrics.transcriptionDelay !== undefined) {
    metricsReport.transcriptionDelay = metrics.transcriptionDelay;
  }
  if (metrics.endOfTurnDelay !== undefined) {
    metricsReport.endOfTurnDelay = metrics.endOfTurnDelay;
  }
  if (metrics.onUserTurnCompletedDelay !== undefined) {
    metricsReport.onUserTurnCompletedDelay = metrics.onUserTurnCompletedDelay;
  }
  if (metrics.llmNodeTtft !== undefined) {
    metricsReport.llmNodeTtft = metrics.llmNodeTtft;
  }
  if (metrics.ttsNodeTtfb !== undefined) {
    metricsReport.ttsNodeTtfb = metrics.ttsNodeTtfb;
  }
  if (metrics.e2eLatency !== undefined) {
    metricsReport.e2eLatency = metrics.e2eLatency;
  }

  const pbMessage = new pb.ChatMessage({
    id: message.id,
    // Roles missing from the table fall back to ASSISTANT.
    role: roleMap[message.role] ?? pb.ChatRole.ASSISTANT,
    content,
    interrupted: message.interrupted,
    metrics: metricsReport,
    createdAt: msToTimestamp(message.createdAt),
  });
  if (message.transcriptConfidence !== undefined) {
    pbMessage.transcriptConfidence = message.transcriptConfidence;
  }

  return new pb.ChatContext_ChatItem({ item: { case: 'message', value: pbMessage } });
}

/** Converts a `function_call` chat item into its protobuf wrapper. */
function functionCallToProto(call: ChatItemOfType<'function_call'>): pb.ChatContext_ChatItem {
  const value = new pb.FunctionCall({
    id: call.id,
    callId: call.callId,
    name: call.name,
    arguments: call.args,
    createdAt: msToTimestamp(call.createdAt),
  });
  return new pb.ChatContext_ChatItem({ item: { case: 'functionCall', value } });
}

/** Converts a `function_call_output` chat item into its protobuf wrapper. */
function functionCallOutputToProto(
  output: ChatItemOfType<'function_call_output'>,
): pb.ChatContext_ChatItem {
  const value = new pb.FunctionCallOutput({
    id: output.id,
    callId: output.callId,
    name: output.name,
    output: output.output,
    isError: output.isError,
    createdAt: msToTimestamp(output.createdAt),
  });
  return new pb.ChatContext_ChatItem({ item: { case: 'functionCallOutput', value } });
}

/** Converts an `agent_handoff` chat item into its protobuf wrapper. */
function agentHandoffToProto(handoff: ChatItemOfType<'agent_handoff'>): pb.ChatContext_ChatItem {
  const value = new pb.AgentHandoff({
    id: handoff.id,
    oldAgentId: handoff.oldAgentId,
    newAgentId: handoff.newAgentId,
    createdAt: msToTimestamp(handoff.createdAt),
  });
  return new pb.ChatContext_ChatItem({ item: { case: 'agentHandoff', value } });
}

/**
 * Converts a chat item into a `pb.ChatContext_ChatItem`, dispatching on `item.type`.
 *
 * @param item - The chat item to convert.
 * @returns The protobuf item whose `item.case` matches the input variant.
 */
function chatItemToProto(item: ChatItem): pb.ChatContext_ChatItem {
  switch (item.type) {
    case 'message':
      return chatMessageToProto(item);
    case 'function_call':
      return functionCallToProto(item);
    case 'function_call_output':
      return functionCallOutputToProto(item);
    case 'agent_handoff':
      return agentHandoffToProto(item);
  }
}
344
+
345
+ // ===========================================================================
346
+ // Usage conversion helpers
347
+ // ===========================================================================
348
/** Builds the protobuf entry for an LLM usage record; missing fields default to `''` or `0`. */
function llmUsageToProto(lu: Partial<LLMModelUsage>): pb.ModelUsage {
  const value = new pb.LLMModelUsage({
    provider: lu.provider ?? '',
    model: lu.model ?? '',
    inputTokens: lu.inputTokens ?? 0,
    inputCachedTokens: lu.inputCachedTokens ?? 0,
    inputAudioTokens: lu.inputAudioTokens ?? 0,
    inputCachedAudioTokens: lu.inputCachedAudioTokens ?? 0,
    inputTextTokens: lu.inputTextTokens ?? 0,
    inputCachedTextTokens: lu.inputCachedTextTokens ?? 0,
    inputImageTokens: lu.inputImageTokens ?? 0,
    inputCachedImageTokens: lu.inputCachedImageTokens ?? 0,
    outputTokens: lu.outputTokens ?? 0,
    outputAudioTokens: lu.outputAudioTokens ?? 0,
    outputTextTokens: lu.outputTextTokens ?? 0,
    // The `*Ms` source value is divided by 1000 before being stored.
    sessionDuration: (lu.sessionDurationMs ?? 0) / 1000,
  });
  return new pb.ModelUsage({ usage: { case: 'llm', value } });
}

/** Builds the protobuf entry for a TTS usage record; missing fields default to `''` or `0`. */
function ttsUsageToProto(tu: Partial<TTSModelUsage>): pb.ModelUsage {
  const value = new pb.TTSModelUsage({
    provider: tu.provider ?? '',
    model: tu.model ?? '',
    inputTokens: tu.inputTokens ?? 0,
    outputTokens: tu.outputTokens ?? 0,
    charactersCount: tu.charactersCount ?? 0,
    audioDuration: (tu.audioDurationMs ?? 0) / 1000,
  });
  return new pb.ModelUsage({ usage: { case: 'tts', value } });
}

/** Builds the protobuf entry for an STT usage record; missing fields default to `''` or `0`. */
function sttUsageToProto(su: Partial<STTModelUsage>): pb.ModelUsage {
  const value = new pb.STTModelUsage({
    provider: su.provider ?? '',
    model: su.model ?? '',
    inputTokens: su.inputTokens ?? 0,
    outputTokens: su.outputTokens ?? 0,
    audioDuration: (su.audioDurationMs ?? 0) / 1000,
  });
  return new pb.ModelUsage({ usage: { case: 'stt', value } });
}

/** Builds the protobuf entry for an interruption-model usage record. */
function interruptionUsageToProto(iu: Partial<InterruptionModelUsage>): pb.ModelUsage {
  const value = new pb.InterruptionModelUsage({
    provider: iu.provider ?? '',
    model: iu.model ?? '',
    totalRequests: iu.totalRequests ?? 0,
  });
  return new pb.ModelUsage({ usage: { case: 'interruption', value } });
}

/**
 * Converts session usage into `pb.AgentSessionUsage`.
 *
 * Entries of `usage.modelUsage` are converted in order according to their `type`;
 * entries whose `type` matches none of the handled cases are left out.
 *
 * @param usage - The session usage to convert.
 * @returns A `pb.AgentSessionUsage` holding the converted entries.
 */
function sessionUsageToProto(usage: AgentSessionUsage): pb.AgentSessionUsage {
  const converted: pb.ModelUsage[] = [];
  for (const entry of usage.modelUsage) {
    switch (entry.type) {
      case 'llm_usage':
        converted.push(llmUsageToProto(entry as Partial<LLMModelUsage>));
        break;
      case 'tts_usage':
        converted.push(ttsUsageToProto(entry as Partial<TTSModelUsage>));
        break;
      case 'stt_usage':
        converted.push(sttUsageToProto(entry as Partial<STTModelUsage>));
        break;
      case 'interruption_usage':
        converted.push(interruptionUsageToProto(entry as Partial<InterruptionModelUsage>));
        break;
    }
  }
  return new pb.AgentSessionUsage({ modelUsage: converted });
}
436
+
437
/**
 * Lists the keys of a tool context.
 *
 * @param toolCtx - The tool context, or `undefined`.
 * @returns The own enumerable keys of `toolCtx`, or an empty array when it is not provided.
 */
function toolNames(toolCtx: ToolContext | undefined): string[] {
  return toolCtx ? Object.keys(toolCtx) : [];
}
441
+
442
/**
 * Flattens session options into a string-to-string map.
 *
 * `endpointing` and `interruption` are JSON-encoded (an absent value becomes `{}`);
 * the remaining options are stringified with `String()`, using `0`, `''` and `false`
 * as the fallbacks for absent values.
 *
 * @param opts - The options to serialize.
 * @returns A record with the keys `endpointing`, `interruption`, `max_tool_steps`,
 *   `user_away_timeout`, `preemptive_generation` and `use_tts_aligned_transcript`.
 */
function protoSerializeOptions(opts: {
  turnHandling?: { endpointing?: unknown; interruption?: unknown };
  maxToolSteps?: number;
  userAwayTimeout?: number | null;
  preemptiveGeneration?: boolean;
  useTtsAlignedTranscript?: boolean;
}): Record<string, string> {
  const {
    turnHandling,
    maxToolSteps,
    userAwayTimeout,
    preemptiveGeneration,
    useTtsAlignedTranscript,
  } = opts;

  const endpointing = turnHandling?.endpointing ?? {};
  const interruption = turnHandling?.interruption ?? {};

  return {
    endpointing: JSON.stringify(endpointing),
    interruption: JSON.stringify(interruption),
    max_tool_steps: String(maxToolSteps ?? 0),
    // Both `null` and `undefined` serialize to the empty string.
    user_away_timeout: String(userAwayTimeout ?? ''),
    preemptive_generation: String(preemptiveGeneration ?? false),
    use_tts_aligned_transcript: String(useTtsAlignedTranscript ?? false),
  };
}
458
+
459
+ // ===========================================================================
460
+ // SessionHost (protobuf-based server-side handler)
461
+ // ===========================================================================
462
/**
 * Server-side endpoint of the protobuf session protocol.
 *
 * Forwards events emitted by the registered {@link AgentSession} to the
 * transport as `AgentSessionEvent` messages, and answers `SessionRequest`
 * messages received from the transport with `SessionResponse` messages.
 */
export class SessionHost {
  private readonly transport: SessionTransport;
  private session: AgentSession | undefined;
  private started = false;
  private eventsRegistered = false;
  private recvTask: Task<void> | undefined;
  private readonly tasks = new Set<Task<void>>();
  private textInputCb: TextInputCallback | undefined;

  constructor(transport: SessionTransport) {
    this.transport = transport;
  }

  /**
   * Sets the session whose events are forwarded and against which requests
   * are served.
   *
   * Registering the same session again does not attach the listeners twice.
   * Registering a different session first detaches the listeners from the
   * previously registered one, so events are always sourced from the current
   * session only.
   */
  registerSession(session: AgentSession): void {
    const previous = this.session;
    if (previous && previous !== session && this.eventsRegistered) {
      this.detachSessionListeners(previous);
      this.eventsRegistered = false;
    }

    this.session = session;
    if (!this.eventsRegistered) {
      this.eventsRegistered = true;
      this.attachSessionListeners(session);
    }
  }

  /** Sets the callback used by `runInput` requests instead of the default `session.run` path. */
  registerTextInput(textInputCb: TextInputCallback): void {
    this.textInputCb = textInputCb;
  }

  /** Starts the transport and the receive loop. Calling it again while started is a no-op. */
  async start(): Promise<void> {
    if (this.started) return;
    this.started = true;
    await this.transport.start();
    this.recvTask = Task.from(async () => this.recvLoop());
  }

  /**
   * Detaches the session listeners, cancels the receive loop and all tracked
   * tasks, then closes the transport. No-op when the host is not started.
   */
  async close(): Promise<void> {
    if (!this.started) return;
    this.started = false;

    if (this.session && this.eventsRegistered) {
      this.eventsRegistered = false;
      this.detachSessionListeners(this.session);
    }

    if (this.recvTask) {
      this.recvTask.cancel();
    }

    await Promise.allSettled([...this.tasks].map((task) => task.cancelAndWait()));
    this.tasks.clear();

    await this.transport.close();
  }

  /** Subscribes every forwarding handler of this host to `session`. */
  private attachSessionListeners(session: AgentSession): void {
    session.on(AgentSessionEventTypes.AgentStateChanged, this.onAgentStateChanged);
    session.on(AgentSessionEventTypes.UserStateChanged, this.onUserStateChanged);
    session.on(AgentSessionEventTypes.ConversationItemAdded, this.onConversationItemAdded);
    session.on(AgentSessionEventTypes.UserInputTranscribed, this.onUserInputTranscribed);
    session.on(AgentSessionEventTypes.FunctionToolsExecuted, this.onFunctionToolsExecuted);
    session.on(AgentSessionEventTypes.MetricsCollected, this.onMetricsCollected);
    session.on(AgentSessionEventTypes.OverlappingSpeech, this.onOverlappingSpeech);
    session.on(AgentSessionEventTypes.Error, this.onHostError);
  }

  /** Removes every forwarding handler of this host from `session`. */
  private detachSessionListeners(session: AgentSession): void {
    session.off(AgentSessionEventTypes.AgentStateChanged, this.onAgentStateChanged);
    session.off(AgentSessionEventTypes.UserStateChanged, this.onUserStateChanged);
    session.off(AgentSessionEventTypes.ConversationItemAdded, this.onConversationItemAdded);
    session.off(AgentSessionEventTypes.UserInputTranscribed, this.onUserInputTranscribed);
    session.off(AgentSessionEventTypes.FunctionToolsExecuted, this.onFunctionToolsExecuted);
    session.off(AgentSessionEventTypes.MetricsCollected, this.onMetricsCollected);
    session.off(AgentSessionEventTypes.OverlappingSpeech, this.onOverlappingSpeech);
    session.off(AgentSessionEventTypes.Error, this.onHostError);
  }

  /**
   * Reads messages from the transport and spawns one tracked task per
   * incoming request. Requests received while no session is registered are
   * dropped. Iteration errors are logged only while the host is started.
   */
  private async recvLoop(): Promise<void> {
    try {
      for await (const msg of this.transport) {
        if (msg.message.case === 'request') {
          if (this.session) {
            this.trackTask(
              Task.from(async () => this.handleRequestSafe(msg.message.value as pb.SessionRequest)),
            );
          }
        }
      }
    } catch (e) {
      if (this.started) {
        log().warn({ error: e }, 'error processing session message');
      }
    }
  }

  /** Wraps `event` in an `AgentSessionMessage` and sends it from a tracked background task. */
  private sendEvent(event: pb.AgentSessionEvent): void {
    const msg = new pb.AgentSessionMessage({
      message: { case: 'event', value: event },
    });
    this.trackTask(
      Task.from(async () => {
        try {
          await this.transport.sendMessage(msg);
        } catch (e) {
          // Nobody awaits this task's result, so a failed send is logged
          // here instead of being left as an unobserved rejection.
          log().debug({ error: e }, 'failed to send session event');
        }
      }),
    );
  }

  /**
   * Builds an `AgentSessionEvent` around `event` and sends it.
   *
   * @param event - the oneof payload of the event
   * @param createdAt - event time in milliseconds; the current time is used when falsy
   */
  private emitEvent<Event extends pb.AgentSessionEvent['event']>(
    event: Event,
    createdAt?: number,
  ): void {
    this.sendEvent(
      new pb.AgentSessionEvent({
        createdAt: createdAt ? msToTimestamp(createdAt) : nowTimestamp(),
        event: event,
      }),
    );
  }

  private onAgentStateChanged = (event: AgentStateChangedEvent): void => {
    this.emitEvent(
      {
        case: 'agentStateChanged',
        value: new pb.AgentSessionEvent_AgentStateChanged({
          oldState: AGENT_STATE_MAP[event.oldState],
          newState: AGENT_STATE_MAP[event.newState],
        }),
      },
      event.createdAt,
    );
  };

  private onUserStateChanged = (event: UserStateChangedEvent): void => {
    this.emitEvent(
      {
        case: 'userStateChanged',
        value: new pb.AgentSessionEvent_UserStateChanged({
          oldState: USER_STATE_MAP[event.oldState],
          newState: USER_STATE_MAP[event.newState],
        }),
      },
      event.createdAt,
    );
  };

  private onUserInputTranscribed = (event: UserInputTranscribedEvent): void => {
    this.emitEvent(
      {
        case: 'userInputTranscribed',
        value: new pb.AgentSessionEvent_UserInputTranscribed({
          transcript: event.transcript,
          isFinal: event.isFinal,
        }),
      },
      event.createdAt,
    );
  };

  private onConversationItemAdded = (event: ConversationItemAddedEvent): void => {
    this.emitEvent(
      {
        case: 'conversationItemAdded',
        value: new pb.AgentSessionEvent_ConversationItemAdded({
          item: chatItemToProto(event.item),
        }),
      },
      event.createdAt,
    );
  };

  private onFunctionToolsExecuted = (event: FunctionToolsExecutedEvent): void => {
    const pbCalls = event.functionCalls.map(
      (fc: FCItem) => new pb.FunctionCall({ name: fc.name, arguments: fc.args, callId: fc.callId }),
    );
    // Outputs may contain null/undefined entries; only real outputs are forwarded.
    const pbOutputs = event.functionCallOutputs
      .filter((fco): fco is FCOItem => fco != null)
      .map(
        (fco: FCOItem) =>
          new pb.FunctionCallOutput({
            callId: fco.callId,
            output: fco.output,
            isError: fco.isError,
          }),
      );
    this.emitEvent(
      {
        case: 'functionToolsExecuted',
        value: new pb.AgentSessionEvent_FunctionToolsExecuted({
          functionCalls: pbCalls,
          functionCallOutputs: pbOutputs,
        }),
      },
      event.createdAt,
    );
  };

  private onOverlappingSpeech = (event: OverlappingSpeechEvent): void => {
    const value = new pb.AgentSessionEvent_OverlappingSpeech({
      isInterruption: event.isInterruption,
      detectionDelay: event.detectionDelayInS,
      detectedAt: msToTimestamp(event.detectedAt),
    });
    if (event.overlapStartedAt != null) {
      value.overlapStartedAt = msToTimestamp(event.overlapStartedAt);
    }
    // No createdAt is forwarded here, so the event is stamped with the current time.
    this.emitEvent({ case: 'overlappingSpeech', value });
  };

  // A metrics event triggers a snapshot of the session's aggregated usage;
  // the metrics payload itself is not forwarded.
  private onMetricsCollected = (event: MetricsCollectedEvent): void => {
    if (!this.session) return;
    this.emitEvent(
      {
        case: 'sessionUsageUpdated',
        value: new pb.AgentSessionEvent_SessionUsageUpdated({
          usage: sessionUsageToProto(this.session.usage),
        }),
      },
      event.createdAt,
    );
  };

  private onHostError = (event: ErrorEvent): void => {
    this.emitEvent(
      {
        case: 'error',
        value: new pb.AgentSessionEvent_Error({
          message: SessionHost.describeError(event.error),
        }),
      },
      event.createdAt,
    );
  };

  /**
   * Produces the message text for a forwarded error.
   *
   * Falsy values map to `'Unknown error'`; anything whose `String()` form is
   * meaningful is returned unchanged. Only when `String()` degenerates to
   * `"[object Object]"` does this look further.
   *
   * NOTE(review): assumes such plain-object errors may carry the detail in a
   * `message` or nested `error` field — confirm against the ErrorEvent type.
   */
  private static describeError(err: unknown): string {
    if (!err) return 'Unknown error';

    const text = String(err);
    if (text !== '[object Object]' || typeof err !== 'object') return text;

    const candidate = err as { message?: unknown; error?: unknown };
    if (typeof candidate.message === 'string' && candidate.message !== '') {
      return candidate.message;
    }
    if (candidate.error != null) {
      const inner = String(candidate.error);
      if (inner !== '[object Object]') return inner;
    }
    try {
      return JSON.stringify(err) ?? text;
    } catch {
      // Not serializable (e.g. circular); fall back to the String() form.
      return text;
    }
  }

  /**
   * Runs {@link handleRequest} and, if it throws, logs the failure and tries
   * to answer the request with a generic `'internal error'` response.
   */
  private async handleRequestSafe(req: pb.SessionRequest): Promise<void> {
    try {
      await this.handleRequest(req);
    } catch (e) {
      log().warn({ error: e, requestId: req.requestId }, 'error handling session request');
      try {
        const resp = new pb.AgentSessionMessage({
          message: {
            case: 'response',
            value: new pb.SessionResponse({
              requestId: req.requestId,
              error: 'internal error',
            }),
          },
        });
        await this.transport.sendMessage(resp);
      } catch (sendErr) {
        log().debug({ error: sendErr }, 'failed to send error response');
      }
    }
  }

  /**
   * Dispatches a request to its handler based on the oneof case. Returns
   * without responding when no session is registered or the case is not
   * one of those listed below.
   */
  private async handleRequest(req: pb.SessionRequest): Promise<void> {
    const session = this.session;
    if (!session) return;

    switch (req.request.case) {
      case 'ping':
        return this.sendResponse(req.requestId, {
          case: 'pong',
          value: new pb.SessionResponse_Pong(),
        });
      case 'getChatHistory':
        return this.handleGetChatHistory(req.requestId, session);
      case 'getAgentInfo':
        return this.handleGetAgentInfo(req.requestId, session);
      case 'runInput':
        return this.handleRunInput(req.requestId, req.request.value, session);
      case 'getSessionState':
        return this.handleGetSessionState(req.requestId, session);
      case 'getRtcStats':
        // Always answered with empty publisher/subscriber stats.
        return this.sendResponse(req.requestId, {
          case: 'getRtcStats',
          value: new pb.SessionResponse_GetRTCStatsResponse({
            publisherStats: [],
            subscriberStats: [],
          }),
        });
      case 'getSessionUsage':
        return this.handleGetSessionUsage(req.requestId, session);
    }
  }

  /** Responds with the session history converted to protobuf chat items. */
  private async handleGetChatHistory(requestId: string, session: AgentSession): Promise<void> {
    const items = session.history.items.map(chatItemToProto);
    return this.sendResponse(requestId, {
      case: 'getChatHistory',
      value: new pb.SessionResponse_GetChatHistoryResponse({ items }),
    });
  }

  /** Responds with id, instructions, tool names and chat context of the current agent. */
  private async handleGetAgentInfo(requestId: string, session: AgentSession): Promise<void> {
    const agent = session.currentAgent;
    return this.sendResponse(requestId, {
      case: 'getAgentInfo',
      value: new pb.SessionResponse_GetAgentInfoResponse({
        id: agent.id,
        instructions: agent.instructions,
        tools: toolNames(agent.toolCtx),
        chatCtx: agent.chatCtx.items.map(chatItemToProto),
      }),
    });
  }

  /**
   * Handles a text input request.
   *
   * With a registered text-input callback the text is handed to it and the
   * response carries no items. Otherwise the session is interrupted, the text
   * is run as user input, and the resulting run events are returned; a failed
   * run is reported through the response's `error` field. Empty text yields
   * an empty response.
   */
  private async handleRunInput(
    requestId: string,
    input: pb.SessionRequest_RunInput,
    session: AgentSession,
  ): Promise<void> {
    const text = input.text;
    let items: pb.ChatContext_ChatItem[] = [];
    let error: string | undefined;

    if (text) {
      if (this.textInputCb) {
        // `await` handles plain values, native promises and other thenables.
        await this.textInputCb(session, { text });
      } else {
        try {
          await session.interrupt({ force: true }).await;
        } catch {
          // Best effort: a failed interrupt must not prevent running the input.
        }

        const result = session.run({ userInput: text });
        try {
          await result.wait();
        } catch (e) {
          error = e instanceof Error ? e.message : String(e);
        }
        // Events are reported even when the run failed.
        items = result.events.map((ev) => chatItemToProto(ev.item));
      }
    }

    return this.sendResponse(
      requestId,
      {
        case: 'runInput',
        value: new pb.SessionResponse_RunInputResponse({ items }),
      },
      error,
    );
  }

  /** Responds with agent/user state, current agent id, serialized options and start time. */
  private async handleGetSessionState(requestId: string, session: AgentSession): Promise<void> {
    const agent = session.currentAgent;
    // Falls back to the current time when the session has no recorded start.
    const startedAt = session._startedAt ?? Date.now();
    const options = session.sessionOptions;
    return this.sendResponse(requestId, {
      case: 'getSessionState',
      value: new pb.SessionResponse_GetSessionStateResponse({
        agentState: AGENT_STATE_MAP[session.agentState],
        userState: USER_STATE_MAP[session.userState],
        agentId: agent.id,
        options: protoSerializeOptions({
          turnHandling: options.turnHandling,
          maxToolSteps: options.maxToolSteps,
          userAwayTimeout: options.userAwayTimeout,
          preemptiveGeneration: options.preemptiveGeneration,
          useTtsAlignedTranscript: options.useTtsAlignedTranscript,
        }),
        createdAt: msToTimestamp(startedAt),
      }),
    });
  }

  /** Responds with the session's usage, stamped with the current time. */
  private async handleGetSessionUsage(requestId: string, session: AgentSession): Promise<void> {
    return this.sendResponse(requestId, {
      case: 'getSessionUsage',
      value: new pb.SessionResponse_GetSessionUsageResponse({
        usage: sessionUsageToProto(session.usage),
        createdAt: nowTimestamp(),
      }),
    });
  }

  /**
   * Sends a `SessionResponse` for `requestId`.
   *
   * @param requestId - id of the request being answered
   * @param response - the oneof response payload
   * @param error - optional error text to attach to the response
   */
  private async sendResponse(
    requestId: string,
    response: pb.SessionResponse['response'],
    error?: string,
  ): Promise<void> {
    await this.transport.sendMessage(
      new pb.AgentSessionMessage({
        message: {
          case: 'response',
          value: new pb.SessionResponse({ requestId, response, error }),
        },
      }),
    );
  }

  /** Keeps `task` in the tracked set until it completes so `close()` can cancel it. */
  private trackTask(task: Task<void>): void {
    this.tasks.add(task);
    task.addDoneCallback(() => {
      this.tasks.delete(task);
    });
  }
}
845
+
846
+ // ===========================================================================
847
+ // RemoteSession (protobuf-based client-side interface)
848
+ // ===========================================================================
849
+
850
/**
 * Client-side endpoint of the protobuf session protocol.
 *
 * Re-emits `AgentSessionEvent` messages received from the transport as typed
 * events and exposes request/response helpers (`fetch*`, `sendMessage`) that
 * correlate responses with requests by request id.
 *
 * @experimental
 */
export class RemoteSession extends (EventEmitter as new () => TypedEventEmitter<RemoteSessionCallbacks>) {
  private readonly transport: SessionTransport;
  private started = false;

  private readonly tasks = new Set<Task<void>>();
  private readonly pendingRequests = new Map<string, Future<pb.SessionResponse>>();
  private recvTask: Task<void> | undefined;
  private readonly _logger = log();

  constructor(transport: SessionTransport) {
    super();
    this.transport = transport;
  }

  /** Creates a RemoteSession backed by a `RoomSessionTransport` for `room` and `roomIO`. */
  static fromRoom(room: Room, roomIO: RoomIO): RemoteSession {
    const transport = new RoomSessionTransport(room, roomIO);
    return new RemoteSession(transport);
  }

  /** Starts the transport and the receive loop. Calling it again while started is a no-op. */
  async start(): Promise<void> {
    if (this.started) return;
    this.started = true;
    await this.transport.start();
    this.recvTask = Task.from(async () => this.recvLoop());
  }

  /**
   * Cancels the receive loop and tracked tasks, rejects every pending request
   * with `'RemoteSession closed'`, then closes the transport. No-op when not started.
   */
  async close(): Promise<void> {
    if (!this.started) return;
    this.started = false;

    if (this.recvTask) {
      this.recvTask.cancel();
    }

    for (const pending of this.pendingRequests.values()) {
      pending.reject(new Error('RemoteSession closed'));
    }
    this.pendingRequests.clear();

    for (const task of this.tasks) {
      task.cancel();
    }
    this.tasks.clear();

    await this.transport.close();
  }

  /**
   * Reads messages from the transport and dispatches events and responses.
   *
   * Each message is dispatched inside its own try/catch: a throwing event
   * listener must not terminate the loop, otherwise every later response
   * would be dropped and all outstanding requests would time out.
   */
  private async recvLoop(): Promise<void> {
    try {
      for await (const msg of this.transport) {
        try {
          switch (msg.message.case) {
            case 'event':
              this.dispatchEvent(msg.message.value);
              break;
            case 'response':
              this.dispatchResponse(msg.message.value);
              break;
          }
        } catch (e) {
          this._logger.warn({ error: e }, 'error dispatching RemoteSession message');
        }
      }
    } catch (e) {
      if (this.started) {
        this._logger.warn({ error: e }, 'error in RemoteSession recv loop');
      }
    }
  }

  /** Re-emits a received session event under its snake_case event name. */
  private dispatchEvent(event: pb.AgentSessionEvent): void {
    const ev = event.event;
    switch (ev.case) {
      case 'agentStateChanged':
        this.emit('agent_state_changed', ev.value);
        break;
      case 'userStateChanged':
        this.emit('user_state_changed', ev.value);
        break;
      case 'userInputTranscribed':
        this.emit('user_input_transcribed', ev.value);
        break;
      case 'conversationItemAdded':
        this.emit('conversation_item_added', ev.value);
        break;
      case 'functionToolsExecuted':
        this.emit('function_tools_executed', ev.value);
        break;
      case 'overlappingSpeech':
        this.emit('overlapping_speech', ev.value);
        break;
      case 'sessionUsageUpdated':
        this.emit('session_usage', ev.value);
        break;
      case 'error':
        // An EventEmitter throws when 'error' is emitted without a listener;
        // a remote error must not turn into a local exception, so it is
        // logged instead when nobody subscribed.
        if (this.listenerCount('error') > 0) {
          this.emit('error', ev.value);
        } else {
          this._logger.warn({ error: ev.value.message }, 'unhandled RemoteSession error event');
        }
        break;
    }
  }

  /** Resolves the pending request matching the response's request id, if any. */
  private dispatchResponse(response: pb.SessionResponse): void {
    const future = this.pendingRequests.get(response.requestId);
    this.pendingRequests.delete(response.requestId);
    if (future && !future.done) {
      future.resolve(response);
    }
  }

  /**
   * Sends a request and waits for the matching response.
   *
   * @param buildReq - builds the request for the generated request id
   * @param timeout - milliseconds to wait for the response after the request was sent
   * @returns the response when it carries no error
   * @throws Error when sending fails, the response carries an error, the wait
   *   times out, or the session is closed while the request is pending
   */
  private async sendRequest(
    buildReq: (requestId: string) => pb.SessionRequest,
    timeout = 60000,
  ): Promise<pb.SessionResponse> {
    const requestId = shortuuid('req_');
    const req = buildReq(requestId);
    req.requestId = requestId;

    const future = new Future<pb.SessionResponse>();
    this.pendingRequests.set(requestId, future);

    // Observe the future immediately so a rejection that lands while the
    // send is still in flight (e.g. from close()) is never left unhandled.
    const outcome = future.await.then(
      (response) => ({ ok: true as const, response }),
      (error: unknown) => ({ ok: false as const, error }),
    );

    let timer: ReturnType<typeof setTimeout> | undefined;
    try {
      await this.transport.sendMessage(
        new pb.AgentSessionMessage({
          message: { case: 'request', value: req },
        }),
      );

      timer = setTimeout(() => {
        if (!future.done) {
          this.pendingRequests.delete(requestId);
          future.reject(new Error('RemoteSession request timed out'));
        }
      }, timeout);

      const result = await outcome;
      if (!result.ok) {
        throw result.error;
      }
      if (result.response.error) {
        throw new Error(result.response.error);
      }
      return result.response;
    } finally {
      if (timer !== undefined) clearTimeout(timer);
      // Also covers a failed send, which would otherwise leak the entry.
      this.pendingRequests.delete(requestId);
    }
  }

  /**
   * Requests the remote session state.
   * @throws Error when the request fails or the response has a different type
   */
  async fetchSessionState(): Promise<pb.SessionResponse_GetSessionStateResponse> {
    const resp = await this.sendRequest(
      (id) =>
        new pb.SessionRequest({
          requestId: id,
          request: { case: 'getSessionState', value: new pb.SessionRequest_GetSessionState() },
        }),
    );
    if (resp.response.case !== 'getSessionState') {
      throw new Error('unexpected response type');
    }
    return resp.response.value;
  }

  /**
   * Requests the remote chat history.
   * @throws Error when the request fails or the response has a different type
   */
  async fetchChatHistory(): Promise<pb.SessionResponse_GetChatHistoryResponse> {
    const resp = await this.sendRequest(
      (id) =>
        new pb.SessionRequest({
          requestId: id,
          request: { case: 'getChatHistory', value: new pb.SessionRequest_GetChatHistory() },
        }),
    );
    if (resp.response.case !== 'getChatHistory') {
      throw new Error('unexpected response type');
    }
    return resp.response.value;
  }

  /**
   * Requests information about the remote session's current agent.
   * @throws Error when the request fails or the response has a different type
   */
  async fetchAgentInfo(): Promise<pb.SessionResponse_GetAgentInfoResponse> {
    const resp = await this.sendRequest(
      (id) =>
        new pb.SessionRequest({
          requestId: id,
          request: { case: 'getAgentInfo', value: new pb.SessionRequest_GetAgentInfo() },
        }),
    );
    if (resp.response.case !== 'getAgentInfo') {
      throw new Error('unexpected response type');
    }
    return resp.response.value;
  }

  /**
   * Sends `text` as a `runInput` request and returns the run result.
   *
   * @param text - the text input to run remotely
   * @param responseTimeout - milliseconds to wait for the response
   * @throws Error when the request fails or the response has a different type
   */
  async sendMessage(
    text: string,
    responseTimeout = 60000,
  ): Promise<pb.SessionResponse_RunInputResponse> {
    const resp = await this.sendRequest(
      (id) =>
        new pb.SessionRequest({
          requestId: id,
          request: { case: 'runInput', value: new pb.SessionRequest_RunInput({ text }) },
        }),
      responseTimeout,
    );
    if (resp.response.case !== 'runInput') {
      throw new Error('unexpected response type');
    }
    return resp.response.value;
  }

  /**
   * Requests RTC statistics from the remote session.
   * @throws Error when the request fails or the response has a different type
   */
  async fetchRtcStats(): Promise<pb.SessionResponse_GetRTCStatsResponse> {
    const resp = await this.sendRequest(
      (id) =>
        new pb.SessionRequest({
          requestId: id,
          request: { case: 'getRtcStats', value: new pb.SessionRequest_GetRTCStats() },
        }),
    );
    if (resp.response.case !== 'getRtcStats') {
      throw new Error('unexpected response type');
    }
    return resp.response.value;
  }

  /**
   * Requests the usage of the remote session.
   * @throws Error when the request fails or the response has a different type
   */
  async fetchSessionUsage(): Promise<pb.SessionResponse_GetSessionUsageResponse> {
    const resp = await this.sendRequest(
      (id) =>
        new pb.SessionRequest({
          requestId: id,
          request: { case: 'getSessionUsage', value: new pb.SessionRequest_GetSessionUsage() },
        }),
    );
    if (resp.response.case !== 'getSessionUsage') {
      throw new Error('unexpected response type');
    }
    return resp.response.value;
  }

  /** Keeps `task` in the tracked set until it completes so `close()` can cancel it. */
  private trackTask(task: Task<void>): void {
    this.tasks.add(task);
    task.addDoneCallback(() => {
      this.tasks.delete(task);
    });
  }
}