@livekit/agents 1.1.0-dev.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (292) hide show
  1. package/dist/cli.cjs +2 -0
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +2 -0
  5. package/dist/cli.js.map +1 -1
  6. package/dist/constants.cjs +3 -0
  7. package/dist/constants.cjs.map +1 -1
  8. package/dist/constants.d.cts +1 -0
  9. package/dist/constants.d.ts +1 -0
  10. package/dist/constants.d.ts.map +1 -1
  11. package/dist/constants.js +2 -0
  12. package/dist/constants.js.map +1 -1
  13. package/dist/cpu.cjs +189 -0
  14. package/dist/cpu.cjs.map +1 -0
  15. package/dist/cpu.d.cts +24 -0
  16. package/dist/cpu.d.ts +24 -0
  17. package/dist/cpu.d.ts.map +1 -0
  18. package/dist/cpu.js +152 -0
  19. package/dist/cpu.js.map +1 -0
  20. package/dist/cpu.test.cjs +227 -0
  21. package/dist/cpu.test.cjs.map +1 -0
  22. package/dist/cpu.test.js +204 -0
  23. package/dist/cpu.test.js.map +1 -0
  24. package/dist/index.cjs +12 -10
  25. package/dist/index.cjs.map +1 -1
  26. package/dist/index.d.cts +13 -13
  27. package/dist/index.d.ts +13 -13
  28. package/dist/index.d.ts.map +1 -1
  29. package/dist/index.js +11 -10
  30. package/dist/index.js.map +1 -1
  31. package/dist/inference/interruption/defaults.cjs +1 -1
  32. package/dist/inference/interruption/defaults.cjs.map +1 -1
  33. package/dist/inference/interruption/defaults.d.cts +1 -1
  34. package/dist/inference/interruption/defaults.d.ts +1 -1
  35. package/dist/inference/interruption/defaults.d.ts.map +1 -1
  36. package/dist/inference/interruption/defaults.js +1 -1
  37. package/dist/inference/interruption/defaults.js.map +1 -1
  38. package/dist/inference/interruption/http_transport.cjs +44 -28
  39. package/dist/inference/interruption/http_transport.cjs.map +1 -1
  40. package/dist/inference/interruption/http_transport.d.ts.map +1 -1
  41. package/dist/inference/interruption/http_transport.js +45 -29
  42. package/dist/inference/interruption/http_transport.js.map +1 -1
  43. package/dist/inference/interruption/interruption_detector.cjs +22 -5
  44. package/dist/inference/interruption/interruption_detector.cjs.map +1 -1
  45. package/dist/inference/interruption/interruption_detector.d.cts +2 -2
  46. package/dist/inference/interruption/interruption_detector.d.ts +2 -2
  47. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -1
  48. package/dist/inference/interruption/interruption_detector.js +22 -5
  49. package/dist/inference/interruption/interruption_detector.js.map +1 -1
  50. package/dist/inference/interruption/interruption_stream.cjs +4 -4
  51. package/dist/inference/interruption/interruption_stream.cjs.map +1 -1
  52. package/dist/inference/interruption/interruption_stream.js +4 -4
  53. package/dist/inference/interruption/interruption_stream.js.map +1 -1
  54. package/dist/inference/interruption/types.cjs.map +1 -1
  55. package/dist/inference/interruption/types.d.cts +2 -2
  56. package/dist/inference/interruption/types.d.ts +2 -2
  57. package/dist/inference/interruption/types.d.ts.map +1 -1
  58. package/dist/inference/interruption/ws_transport.cjs +60 -47
  59. package/dist/inference/interruption/ws_transport.cjs.map +1 -1
  60. package/dist/inference/interruption/ws_transport.d.ts.map +1 -1
  61. package/dist/inference/interruption/ws_transport.js +60 -47
  62. package/dist/inference/interruption/ws_transport.js.map +1 -1
  63. package/dist/inference/llm.cjs.map +1 -1
  64. package/dist/inference/llm.d.cts +1 -1
  65. package/dist/inference/llm.d.ts +1 -1
  66. package/dist/inference/llm.d.ts.map +1 -1
  67. package/dist/inference/llm.js.map +1 -1
  68. package/dist/inference/stt.cjs +20 -12
  69. package/dist/inference/stt.cjs.map +1 -1
  70. package/dist/inference/stt.d.cts +3 -2
  71. package/dist/inference/stt.d.ts +3 -2
  72. package/dist/inference/stt.d.ts.map +1 -1
  73. package/dist/inference/stt.js +20 -12
  74. package/dist/inference/stt.js.map +1 -1
  75. package/dist/inference/stt.test.cjs +14 -0
  76. package/dist/inference/stt.test.cjs.map +1 -1
  77. package/dist/inference/stt.test.js +14 -0
  78. package/dist/inference/stt.test.js.map +1 -1
  79. package/dist/inference/tts.cjs +13 -4
  80. package/dist/inference/tts.cjs.map +1 -1
  81. package/dist/inference/tts.d.cts +8 -1
  82. package/dist/inference/tts.d.ts +8 -1
  83. package/dist/inference/tts.d.ts.map +1 -1
  84. package/dist/inference/tts.js +13 -4
  85. package/dist/inference/tts.js.map +1 -1
  86. package/dist/inference/tts.test.cjs +10 -0
  87. package/dist/inference/tts.test.cjs.map +1 -1
  88. package/dist/inference/tts.test.js +10 -0
  89. package/dist/inference/tts.test.js.map +1 -1
  90. package/dist/ipc/job_proc_lazy_main.cjs +41 -23
  91. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  92. package/dist/ipc/job_proc_lazy_main.js +41 -23
  93. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  94. package/dist/job.cjs +1 -1
  95. package/dist/job.cjs.map +1 -1
  96. package/dist/job.js +1 -1
  97. package/dist/job.js.map +1 -1
  98. package/dist/language.cjs +394 -0
  99. package/dist/language.cjs.map +1 -0
  100. package/dist/language.d.cts +15 -0
  101. package/dist/language.d.ts +15 -0
  102. package/dist/language.d.ts.map +1 -0
  103. package/dist/language.js +363 -0
  104. package/dist/language.js.map +1 -0
  105. package/dist/language.test.cjs +43 -0
  106. package/dist/language.test.cjs.map +1 -0
  107. package/dist/language.test.js +49 -0
  108. package/dist/language.test.js.map +1 -0
  109. package/dist/llm/index.cjs +2 -0
  110. package/dist/llm/index.cjs.map +1 -1
  111. package/dist/llm/index.d.cts +1 -1
  112. package/dist/llm/index.d.ts +1 -1
  113. package/dist/llm/index.d.ts.map +1 -1
  114. package/dist/llm/index.js +2 -0
  115. package/dist/llm/index.js.map +1 -1
  116. package/dist/stream/deferred_stream.cjs +6 -2
  117. package/dist/stream/deferred_stream.cjs.map +1 -1
  118. package/dist/stream/deferred_stream.d.ts.map +1 -1
  119. package/dist/stream/deferred_stream.js +6 -2
  120. package/dist/stream/deferred_stream.js.map +1 -1
  121. package/dist/stt/stt.cjs.map +1 -1
  122. package/dist/stt/stt.d.cts +2 -1
  123. package/dist/stt/stt.d.ts +2 -1
  124. package/dist/stt/stt.d.ts.map +1 -1
  125. package/dist/stt/stt.js.map +1 -1
  126. package/dist/utils.cjs +15 -0
  127. package/dist/utils.cjs.map +1 -1
  128. package/dist/utils.d.cts +8 -0
  129. package/dist/utils.d.ts +8 -0
  130. package/dist/utils.d.ts.map +1 -1
  131. package/dist/utils.js +13 -0
  132. package/dist/utils.js.map +1 -1
  133. package/dist/version.cjs +1 -1
  134. package/dist/version.js +1 -1
  135. package/dist/voice/agent.cjs +14 -17
  136. package/dist/voice/agent.cjs.map +1 -1
  137. package/dist/voice/agent.d.cts +10 -11
  138. package/dist/voice/agent.d.ts +10 -11
  139. package/dist/voice/agent.d.ts.map +1 -1
  140. package/dist/voice/agent.js +15 -18
  141. package/dist/voice/agent.js.map +1 -1
  142. package/dist/voice/agent.test.cjs +194 -0
  143. package/dist/voice/agent.test.cjs.map +1 -1
  144. package/dist/voice/agent.test.js +195 -1
  145. package/dist/voice/agent.test.js.map +1 -1
  146. package/dist/voice/agent_activity.cjs +116 -39
  147. package/dist/voice/agent_activity.cjs.map +1 -1
  148. package/dist/voice/agent_activity.d.cts +2 -0
  149. package/dist/voice/agent_activity.d.ts +2 -0
  150. package/dist/voice/agent_activity.d.ts.map +1 -1
  151. package/dist/voice/agent_activity.js +117 -40
  152. package/dist/voice/agent_activity.js.map +1 -1
  153. package/dist/voice/agent_activity.test.cjs +135 -0
  154. package/dist/voice/agent_activity.test.cjs.map +1 -0
  155. package/dist/voice/agent_activity.test.js +134 -0
  156. package/dist/voice/agent_activity.test.js.map +1 -0
  157. package/dist/voice/agent_session.cjs +38 -38
  158. package/dist/voice/agent_session.cjs.map +1 -1
  159. package/dist/voice/agent_session.d.cts +65 -56
  160. package/dist/voice/agent_session.d.ts +65 -56
  161. package/dist/voice/agent_session.d.ts.map +1 -1
  162. package/dist/voice/agent_session.js +37 -37
  163. package/dist/voice/agent_session.js.map +1 -1
  164. package/dist/voice/audio_recognition.cjs +106 -52
  165. package/dist/voice/audio_recognition.cjs.map +1 -1
  166. package/dist/voice/audio_recognition.d.cts +4 -2
  167. package/dist/voice/audio_recognition.d.ts +4 -2
  168. package/dist/voice/audio_recognition.d.ts.map +1 -1
  169. package/dist/voice/audio_recognition.js +106 -52
  170. package/dist/voice/audio_recognition.js.map +1 -1
  171. package/dist/voice/audio_recognition_span.test.cjs +84 -22
  172. package/dist/voice/audio_recognition_span.test.cjs.map +1 -1
  173. package/dist/voice/audio_recognition_span.test.js +90 -23
  174. package/dist/voice/audio_recognition_span.test.js.map +1 -1
  175. package/dist/voice/events.cjs +1 -1
  176. package/dist/voice/events.cjs.map +1 -1
  177. package/dist/voice/events.d.cts +4 -3
  178. package/dist/voice/events.d.ts +4 -3
  179. package/dist/voice/events.d.ts.map +1 -1
  180. package/dist/voice/events.js +1 -1
  181. package/dist/voice/events.js.map +1 -1
  182. package/dist/voice/index.cjs +9 -1
  183. package/dist/voice/index.cjs.map +1 -1
  184. package/dist/voice/index.d.cts +1 -1
  185. package/dist/voice/index.d.ts +1 -1
  186. package/dist/voice/index.d.ts.map +1 -1
  187. package/dist/voice/index.js +10 -1
  188. package/dist/voice/index.js.map +1 -1
  189. package/dist/voice/remote_session.cjs +922 -0
  190. package/dist/voice/remote_session.cjs.map +1 -0
  191. package/dist/voice/remote_session.d.cts +108 -0
  192. package/dist/voice/remote_session.d.ts +108 -0
  193. package/dist/voice/remote_session.d.ts.map +1 -0
  194. package/dist/voice/remote_session.js +887 -0
  195. package/dist/voice/remote_session.js.map +1 -0
  196. package/dist/voice/report.cjs +11 -10
  197. package/dist/voice/report.cjs.map +1 -1
  198. package/dist/voice/report.d.cts +5 -3
  199. package/dist/voice/report.d.ts +5 -3
  200. package/dist/voice/report.d.ts.map +1 -1
  201. package/dist/voice/report.js +11 -10
  202. package/dist/voice/report.js.map +1 -1
  203. package/dist/voice/report.test.cjs +15 -0
  204. package/dist/voice/report.test.cjs.map +1 -1
  205. package/dist/voice/report.test.js +15 -0
  206. package/dist/voice/report.test.js.map +1 -1
  207. package/dist/voice/room_io/room_io.cjs +39 -0
  208. package/dist/voice/room_io/room_io.cjs.map +1 -1
  209. package/dist/voice/room_io/room_io.d.cts +3 -1
  210. package/dist/voice/room_io/room_io.d.ts +3 -1
  211. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  212. package/dist/voice/room_io/room_io.js +40 -1
  213. package/dist/voice/room_io/room_io.js.map +1 -1
  214. package/dist/voice/turn_config/interruption.cjs.map +1 -1
  215. package/dist/voice/turn_config/interruption.d.cts +1 -1
  216. package/dist/voice/turn_config/interruption.d.ts +1 -1
  217. package/dist/voice/turn_config/interruption.d.ts.map +1 -1
  218. package/dist/voice/turn_config/interruption.js.map +1 -1
  219. package/dist/voice/turn_config/utils.cjs +95 -35
  220. package/dist/voice/turn_config/utils.cjs.map +1 -1
  221. package/dist/voice/turn_config/utils.d.cts +17 -5
  222. package/dist/voice/turn_config/utils.d.ts +17 -5
  223. package/dist/voice/turn_config/utils.d.ts.map +1 -1
  224. package/dist/voice/turn_config/utils.js +93 -35
  225. package/dist/voice/turn_config/utils.js.map +1 -1
  226. package/dist/voice/turn_config/utils.test.cjs +83 -41
  227. package/dist/voice/turn_config/utils.test.cjs.map +1 -1
  228. package/dist/voice/turn_config/utils.test.js +84 -42
  229. package/dist/voice/turn_config/utils.test.js.map +1 -1
  230. package/dist/worker.cjs +6 -29
  231. package/dist/worker.cjs.map +1 -1
  232. package/dist/worker.d.ts.map +1 -1
  233. package/dist/worker.js +6 -19
  234. package/dist/worker.js.map +1 -1
  235. package/package.json +3 -2
  236. package/src/cli.ts +2 -0
  237. package/src/constants.ts +1 -0
  238. package/src/cpu.test.ts +239 -0
  239. package/src/cpu.ts +173 -0
  240. package/src/index.ts +13 -15
  241. package/src/inference/interruption/defaults.ts +1 -1
  242. package/src/inference/interruption/http_transport.ts +49 -30
  243. package/src/inference/interruption/interruption_detector.ts +22 -6
  244. package/src/inference/interruption/interruption_stream.ts +4 -4
  245. package/src/inference/interruption/types.ts +2 -2
  246. package/src/inference/interruption/ws_transport.ts +63 -59
  247. package/src/inference/llm.ts +3 -1
  248. package/src/inference/stt.test.ts +17 -0
  249. package/src/inference/stt.ts +22 -14
  250. package/src/inference/tts.test.ts +12 -0
  251. package/src/inference/tts.ts +22 -6
  252. package/src/ipc/job_proc_lazy_main.ts +44 -24
  253. package/src/job.ts +1 -1
  254. package/src/language.test.ts +62 -0
  255. package/src/language.ts +380 -0
  256. package/src/llm/index.ts +2 -0
  257. package/src/stream/deferred_stream.ts +5 -1
  258. package/src/stt/stt.ts +2 -1
  259. package/src/utils.ts +20 -0
  260. package/src/voice/agent.test.ts +208 -1
  261. package/src/voice/agent.ts +21 -22
  262. package/src/voice/agent_activity.test.ts +194 -0
  263. package/src/voice/agent_activity.ts +161 -43
  264. package/src/voice/agent_session.ts +103 -92
  265. package/src/voice/audio_recognition.ts +124 -61
  266. package/src/voice/audio_recognition_span.test.ts +115 -35
  267. package/src/voice/events.ts +4 -3
  268. package/src/voice/index.ts +10 -1
  269. package/src/voice/remote_session.ts +1083 -0
  270. package/src/voice/report.test.ts +22 -3
  271. package/src/voice/report.ts +31 -14
  272. package/src/voice/room_io/room_io.ts +52 -2
  273. package/src/voice/turn_config/interruption.ts +1 -1
  274. package/src/voice/turn_config/utils.test.ts +91 -43
  275. package/src/voice/turn_config/utils.ts +120 -56
  276. package/src/worker.ts +34 -50
  277. package/dist/voice/client_events.cjs +0 -554
  278. package/dist/voice/client_events.cjs.map +0 -1
  279. package/dist/voice/client_events.d.cts +0 -195
  280. package/dist/voice/client_events.d.ts +0 -195
  281. package/dist/voice/client_events.d.ts.map +0 -1
  282. package/dist/voice/client_events.js +0 -548
  283. package/dist/voice/client_events.js.map +0 -1
  284. package/dist/voice/wire_format.cjs +0 -798
  285. package/dist/voice/wire_format.cjs.map +0 -1
  286. package/dist/voice/wire_format.d.cts +0 -5503
  287. package/dist/voice/wire_format.d.ts +0 -5503
  288. package/dist/voice/wire_format.d.ts.map +0 -1
  289. package/dist/voice/wire_format.js +0 -728
  290. package/dist/voice/wire_format.js.map +0 -1
  291. package/src/voice/client_events.ts +0 -838
  292. package/src/voice/wire_format.ts +0 -827
@@ -1,827 +0,0 @@
1
- // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
- //
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
- // Explicit wire-format converters that produce the exact JSON shape emitted by
6
- // Python Pydantic models (snake_case keys, durations in seconds).
7
- // The agents-playground frontend (types.ts / useClientEvents.ts) consumes this
8
- // format directly via JSON.parse — any mismatch breaks the UI.
9
- import { z } from 'zod';
10
- import type {
11
- AgentHandoffItem,
12
- AudioContent,
13
- ChatContent,
14
- ChatItem,
15
- ChatMessage,
16
- FunctionCall,
17
- FunctionCallOutput,
18
- ImageContent,
19
- MetricsReport,
20
- } from '../llm/chat_context.js';
21
- import type {
22
- AgentMetrics,
23
- EOUMetrics,
24
- InterruptionMetrics,
25
- LLMMetrics,
26
- MetricsMetadata,
27
- RealtimeModelMetrics,
28
- RealtimeModelMetricsCachedTokenDetails,
29
- RealtimeModelMetricsInputTokenDetails,
30
- RealtimeModelMetricsOutputTokenDetails,
31
- STTMetrics,
32
- TTSMetrics,
33
- VADMetrics,
34
- } from '../metrics/base.js';
35
- import type {
36
- InterruptionModelUsage,
37
- LLMModelUsage,
38
- ModelUsage,
39
- STTModelUsage,
40
- TTSModelUsage,
41
- } from '../metrics/model_usage.js';
42
- import type { AgentSessionUsage } from './agent_session.js';
43
-
44
- // ---------------------------------------------------------------------------
45
- // Helpers
46
- // ---------------------------------------------------------------------------
47
-
48
- type WireObject = Record<string, unknown>;
49
-
50
- export function msToS(ms: number): number {
51
- return ms / 1000;
52
- }
53
-
54
- function omitUndefined(obj: WireObject): WireObject {
55
- const result: WireObject = {};
56
- for (const [k, v] of Object.entries(obj)) {
57
- if (v !== undefined) {
58
- result[k] = v;
59
- }
60
- }
61
- return result;
62
- }
63
-
64
- function imageContentToWire(img: ImageContent): WireObject {
65
- return omitUndefined({
66
- id: img.id,
67
- type: img.type,
68
- image: typeof img.image === 'string' ? img.image : undefined,
69
- inference_detail: img.inferenceDetail,
70
- inference_width: img.inferenceWidth,
71
- inference_height: img.inferenceHeight,
72
- mime_type: img.mimeType,
73
- });
74
- }
75
-
76
- function audioContentToWire(audio: AudioContent): WireObject {
77
- return omitUndefined({
78
- type: audio.type,
79
- transcript: audio.transcript,
80
- });
81
- }
82
-
83
- function chatContentToWire(content: ChatContent): unknown {
84
- if (typeof content === 'string') return content;
85
- if (content.type === 'image_content') return imageContentToWire(content);
86
- return audioContentToWire(content);
87
- }
88
-
89
- function metricsReportToWire(m: MetricsReport): WireObject {
90
- return omitUndefined({
91
- started_speaking_at: m.startedSpeakingAt,
92
- stopped_speaking_at: m.stoppedSpeakingAt,
93
- transcription_delay: m.transcriptionDelay,
94
- end_of_turn_delay: m.endOfTurnDelay,
95
- on_user_turn_completed_delay: m.onUserTurnCompletedDelay,
96
- llm_node_ttft: m.llmNodeTtft,
97
- tts_node_ttfb: m.ttsNodeTtfb,
98
- e2e_latency: m.e2eLatency,
99
- });
100
- }
101
-
102
- export function chatMessageToWire(msg: ChatMessage): WireObject {
103
- const result: WireObject = {
104
- id: msg.id,
105
- type: msg.type,
106
- role: msg.role,
107
- content: msg.content.map(chatContentToWire),
108
- interrupted: msg.interrupted,
109
- created_at: msToS(msg.createdAt),
110
- };
111
-
112
- if (msg.transcriptConfidence !== undefined) {
113
- result.transcript_confidence = msg.transcriptConfidence;
114
- }
115
- if (Object.keys(msg.metrics).length > 0) {
116
- result.metrics = metricsReportToWire(msg.metrics);
117
- }
118
- if (Object.keys(msg.extra).length > 0) {
119
- result.extra = msg.extra;
120
- }
121
- return result;
122
- }
123
-
124
- export function functionCallToWire(fc: FunctionCall): WireObject {
125
- const result: WireObject = {
126
- id: fc.id,
127
- type: fc.type,
128
- call_id: fc.callId,
129
- arguments: fc.args,
130
- name: fc.name,
131
- created_at: msToS(fc.createdAt),
132
- };
133
-
134
- if (Object.keys(fc.extra).length > 0) {
135
- result.extra = fc.extra;
136
- }
137
- if (fc.groupId !== undefined) {
138
- result.group_id = fc.groupId;
139
- }
140
- return result;
141
- }
142
-
143
- export function functionCallOutputToWire(fco: FunctionCallOutput): WireObject {
144
- return {
145
- id: fco.id,
146
- type: fco.type,
147
- name: fco.name,
148
- call_id: fco.callId,
149
- output: fco.output,
150
- is_error: fco.isError,
151
- created_at: msToS(fco.createdAt),
152
- };
153
- }
154
-
155
- export function agentHandoffToWire(ah: AgentHandoffItem): WireObject {
156
- const result: WireObject = {
157
- id: ah.id,
158
- type: ah.type,
159
- new_agent_id: ah.newAgentId,
160
- created_at: msToS(ah.createdAt),
161
- };
162
- if (ah.oldAgentId !== undefined) {
163
- result.old_agent_id = ah.oldAgentId;
164
- }
165
- return result;
166
- }
167
-
168
- export function chatItemToWire(item: ChatItem): WireObject {
169
- switch (item.type) {
170
- case 'message':
171
- return chatMessageToWire(item);
172
- case 'function_call':
173
- return functionCallToWire(item);
174
- case 'function_call_output':
175
- return functionCallOutputToWire(item);
176
- case 'agent_handoff':
177
- return agentHandoffToWire(item);
178
- }
179
- }
180
-
181
- function metadataToWire(m: MetricsMetadata | undefined): WireObject | null {
182
- if (!m) return null;
183
- return omitUndefined({
184
- model_name: m.modelName,
185
- model_provider: m.modelProvider,
186
- });
187
- }
188
-
189
- function llmMetricsToWire(m: LLMMetrics): WireObject {
190
- return omitUndefined({
191
- type: m.type,
192
- label: m.label,
193
- request_id: m.requestId,
194
- timestamp: msToS(m.timestamp),
195
- duration: msToS(m.durationMs),
196
- ttft: msToS(m.ttftMs),
197
- cancelled: m.cancelled,
198
- completion_tokens: m.completionTokens,
199
- prompt_tokens: m.promptTokens,
200
- prompt_cached_tokens: m.promptCachedTokens,
201
- total_tokens: m.totalTokens,
202
- tokens_per_second: m.tokensPerSecond,
203
- speech_id: m.speechId,
204
- metadata: metadataToWire(m.metadata),
205
- });
206
- }
207
-
208
- function sttMetricsToWire(m: STTMetrics): WireObject {
209
- return omitUndefined({
210
- type: m.type,
211
- label: m.label,
212
- request_id: m.requestId,
213
- timestamp: msToS(m.timestamp),
214
- duration: msToS(m.durationMs),
215
- audio_duration: msToS(m.audioDurationMs),
216
- input_tokens: m.inputTokens,
217
- output_tokens: m.outputTokens,
218
- streamed: m.streamed,
219
- metadata: metadataToWire(m.metadata),
220
- });
221
- }
222
-
223
- function ttsMetricsToWire(m: TTSMetrics): WireObject {
224
- return omitUndefined({
225
- type: m.type,
226
- label: m.label,
227
- request_id: m.requestId,
228
- timestamp: msToS(m.timestamp),
229
- ttfb: msToS(m.ttfbMs),
230
- duration: msToS(m.durationMs),
231
- audio_duration: msToS(m.audioDurationMs),
232
- cancelled: m.cancelled,
233
- characters_count: m.charactersCount,
234
- input_tokens: m.inputTokens,
235
- output_tokens: m.outputTokens,
236
- streamed: m.streamed,
237
- segment_id: m.segmentId,
238
- speech_id: m.speechId,
239
- metadata: metadataToWire(m.metadata),
240
- });
241
- }
242
-
243
- function vadMetricsToWire(m: VADMetrics): WireObject {
244
- return {
245
- type: m.type,
246
- label: m.label,
247
- timestamp: msToS(m.timestamp),
248
- idle_time: msToS(m.idleTimeMs),
249
- inference_duration_total: msToS(m.inferenceDurationTotalMs),
250
- inference_count: m.inferenceCount,
251
- };
252
- }
253
-
254
- function eouMetricsToWire(m: EOUMetrics): WireObject {
255
- return omitUndefined({
256
- type: m.type,
257
- timestamp: msToS(m.timestamp),
258
- end_of_utterance_delay: msToS(m.endOfUtteranceDelayMs),
259
- transcription_delay: msToS(m.transcriptionDelayMs),
260
- on_user_turn_completed_delay: msToS(m.onUserTurnCompletedDelayMs),
261
- speech_id: m.speechId,
262
- });
263
- }
264
-
265
- function cachedTokenDetailsToWire(d: RealtimeModelMetricsCachedTokenDetails): WireObject {
266
- return {
267
- audio_tokens: d.audioTokens,
268
- text_tokens: d.textTokens,
269
- image_tokens: d.imageTokens,
270
- };
271
- }
272
-
273
- function inputTokenDetailsToWire(d: RealtimeModelMetricsInputTokenDetails): WireObject {
274
- return omitUndefined({
275
- audio_tokens: d.audioTokens,
276
- text_tokens: d.textTokens,
277
- image_tokens: d.imageTokens,
278
- cached_tokens: d.cachedTokens,
279
- cached_tokens_details: d.cachedTokensDetails
280
- ? cachedTokenDetailsToWire(d.cachedTokensDetails)
281
- : undefined,
282
- });
283
- }
284
-
285
- function outputTokenDetailsToWire(d: RealtimeModelMetricsOutputTokenDetails): WireObject {
286
- return {
287
- text_tokens: d.textTokens,
288
- audio_tokens: d.audioTokens,
289
- image_tokens: d.imageTokens,
290
- };
291
- }
292
-
293
- function realtimeModelMetricsToWire(m: RealtimeModelMetrics): WireObject {
294
- return omitUndefined({
295
- type: m.type,
296
- label: m.label,
297
- request_id: m.requestId,
298
- timestamp: msToS(m.timestamp),
299
- duration: msToS(m.durationMs),
300
- session_duration: m.sessionDurationMs !== undefined ? msToS(m.sessionDurationMs) : undefined,
301
- ttft: msToS(m.ttftMs),
302
- cancelled: m.cancelled,
303
- input_tokens: m.inputTokens,
304
- output_tokens: m.outputTokens,
305
- total_tokens: m.totalTokens,
306
- tokens_per_second: m.tokensPerSecond,
307
- input_token_details: inputTokenDetailsToWire(m.inputTokenDetails),
308
- output_token_details: outputTokenDetailsToWire(m.outputTokenDetails),
309
- metadata: metadataToWire(m.metadata),
310
- });
311
- }
312
-
313
- function interruptionMetricsToWire(m: InterruptionMetrics): WireObject {
314
- return omitUndefined({
315
- type: m.type,
316
- timestamp: msToS(m.timestamp),
317
- total_duration: msToS(m.totalDuration),
318
- prediction_duration: msToS(m.predictionDuration),
319
- detection_delay: msToS(m.detectionDelay),
320
- num_interruptions: m.numInterruptions,
321
- num_backchannels: m.numBackchannels,
322
- num_requests: m.numRequests,
323
- metadata: metadataToWire(m.metadata),
324
- });
325
- }
326
-
327
- export function agentMetricsToWire(m: AgentMetrics): WireObject {
328
- switch (m.type) {
329
- case 'llm_metrics':
330
- return llmMetricsToWire(m);
331
- case 'stt_metrics':
332
- return sttMetricsToWire(m);
333
- case 'tts_metrics':
334
- return ttsMetricsToWire(m);
335
- case 'vad_metrics':
336
- return vadMetricsToWire(m);
337
- case 'eou_metrics':
338
- return eouMetricsToWire(m);
339
- case 'realtime_model_metrics':
340
- return realtimeModelMetricsToWire(m);
341
- case 'interruption_metrics':
342
- return interruptionMetricsToWire(m);
343
- }
344
- }
345
-
346
- function llmModelUsageToWire(u: Partial<LLMModelUsage>): WireObject {
347
- return {
348
- type: u.type,
349
- provider: u.provider ?? '',
350
- model: u.model ?? '',
351
- input_tokens: u.inputTokens ?? 0,
352
- input_cached_tokens: u.inputCachedTokens ?? 0,
353
- input_audio_tokens: u.inputAudioTokens ?? 0,
354
- input_cached_audio_tokens: u.inputCachedAudioTokens ?? 0,
355
- input_text_tokens: u.inputTextTokens ?? 0,
356
- input_cached_text_tokens: u.inputCachedTextTokens ?? 0,
357
- input_image_tokens: u.inputImageTokens ?? 0,
358
- input_cached_image_tokens: u.inputCachedImageTokens ?? 0,
359
- output_tokens: u.outputTokens ?? 0,
360
- output_audio_tokens: u.outputAudioTokens ?? 0,
361
- output_text_tokens: u.outputTextTokens ?? 0,
362
- session_duration: msToS(u.sessionDurationMs ?? 0),
363
- };
364
- }
365
-
366
- function ttsModelUsageToWire(u: Partial<TTSModelUsage>): WireObject {
367
- return {
368
- type: u.type,
369
- provider: u.provider ?? '',
370
- model: u.model ?? '',
371
- input_tokens: u.inputTokens ?? 0,
372
- output_tokens: u.outputTokens ?? 0,
373
- characters_count: u.charactersCount ?? 0,
374
- audio_duration: msToS(u.audioDurationMs ?? 0),
375
- };
376
- }
377
-
378
- function sttModelUsageToWire(u: Partial<STTModelUsage>): WireObject {
379
- return {
380
- type: u.type,
381
- provider: u.provider ?? '',
382
- model: u.model ?? '',
383
- input_tokens: u.inputTokens ?? 0,
384
- output_tokens: u.outputTokens ?? 0,
385
- audio_duration: msToS(u.audioDurationMs ?? 0),
386
- };
387
- }
388
-
389
- function interruptionModelUsageToWire(u: Partial<InterruptionModelUsage>): WireObject {
390
- return {
391
- type: u.type,
392
- provider: u.provider ?? '',
393
- model: u.model ?? '',
394
- total_requests: u.totalRequests ?? 0,
395
- };
396
- }
397
-
398
- export function modelUsageToWire(u: Partial<ModelUsage>): WireObject {
399
- switch (u.type) {
400
- case 'llm_usage':
401
- return llmModelUsageToWire(u as Partial<LLMModelUsage>);
402
- case 'tts_usage':
403
- return ttsModelUsageToWire(u as Partial<TTSModelUsage>);
404
- case 'stt_usage':
405
- return sttModelUsageToWire(u as Partial<STTModelUsage>);
406
- case 'interruption_usage':
407
- return interruptionModelUsageToWire(u as Partial<InterruptionModelUsage>);
408
- default:
409
- return u as WireObject;
410
- }
411
- }
412
-
413
- export function agentSessionUsageToWire(u: AgentSessionUsage): WireObject {
414
- return {
415
- model_usage: u.modelUsage.map(modelUsageToWire),
416
- };
417
- }
418
-
419
- // ===========================================================================
420
- // Zod wire-format schemas
421
- // These validate the exact JSON shape that Python Pydantic emits on the wire.
422
- // Inferred types via z.infer give fully typed parse results.
423
- // ===========================================================================
424
- const imageContentWireSchema = z.object({
425
- id: z.string(),
426
- type: z.literal('image_content'),
427
- image: z.string(),
428
- inference_detail: z.enum(['auto', 'high', 'low']).optional(),
429
- inference_width: z.number().optional(),
430
- inference_height: z.number().optional(),
431
- mime_type: z.string().optional(),
432
- });
433
-
434
- const audioContentWireSchema = z.object({
435
- type: z.literal('audio_content'),
436
- transcript: z.string().nullable().optional(),
437
- });
438
-
439
- const chatContentWireSchema = z.union([z.string(), imageContentWireSchema, audioContentWireSchema]);
440
-
441
- const metricsReportWireSchema = z
442
- .object({
443
- started_speaking_at: z.number().optional(),
444
- stopped_speaking_at: z.number().optional(),
445
- transcription_delay: z.number().optional(),
446
- end_of_turn_delay: z.number().optional(),
447
- on_user_turn_completed_delay: z.number().optional(),
448
- llm_node_ttft: z.number().optional(),
449
- tts_node_ttfb: z.number().optional(),
450
- e2e_latency: z.number().optional(),
451
- })
452
- .optional();
453
-
454
- export const chatMessageWireSchema = z.object({
455
- id: z.string(),
456
- type: z.literal('message'),
457
- role: z.enum(['developer', 'system', 'user', 'assistant']),
458
- content: z.array(chatContentWireSchema),
459
- interrupted: z.boolean(),
460
- created_at: z.number(),
461
- transcript_confidence: z.number().optional(),
462
- metrics: metricsReportWireSchema,
463
- extra: z.record(z.string(), z.unknown()).optional(),
464
- });
465
-
466
- export const functionCallWireSchema = z.object({
467
- id: z.string(),
468
- type: z.literal('function_call'),
469
- call_id: z.string(),
470
- arguments: z.string(),
471
- name: z.string(),
472
- created_at: z.number(),
473
- extra: z.record(z.string(), z.unknown()).optional(),
474
- group_id: z.string().optional(),
475
- });
476
-
477
- export const functionCallOutputWireSchema = z.object({
478
- id: z.string(),
479
- type: z.literal('function_call_output'),
480
- name: z.string(),
481
- call_id: z.string(),
482
- output: z.string(),
483
- is_error: z.boolean(),
484
- created_at: z.number(),
485
- });
486
-
487
- export const agentHandoffWireSchema = z.object({
488
- id: z.string(),
489
- type: z.literal('agent_handoff'),
490
- new_agent_id: z.string(),
491
- created_at: z.number(),
492
- old_agent_id: z.string().optional(),
493
- });
494
-
495
- export const chatItemWireSchema = z.discriminatedUnion('type', [
496
- chatMessageWireSchema,
497
- functionCallWireSchema,
498
- functionCallOutputWireSchema,
499
- agentHandoffWireSchema,
500
- ]);
501
-
502
- const metadataWireSchema = z
503
- .object({
504
- model_name: z.string().optional(),
505
- model_provider: z.string().optional(),
506
- })
507
- .nullable()
508
- .optional();
509
-
510
- export const llmMetricsWireSchema = z.object({
511
- type: z.literal('llm_metrics'),
512
- label: z.string(),
513
- request_id: z.string(),
514
- timestamp: z.number(),
515
- duration: z.number(),
516
- ttft: z.number(),
517
- cancelled: z.boolean(),
518
- completion_tokens: z.number(),
519
- prompt_tokens: z.number(),
520
- prompt_cached_tokens: z.number(),
521
- total_tokens: z.number(),
522
- tokens_per_second: z.number(),
523
- speech_id: z.string().nullable().optional(),
524
- metadata: metadataWireSchema,
525
- });
526
-
527
- export const sttMetricsWireSchema = z.object({
528
- type: z.literal('stt_metrics'),
529
- label: z.string(),
530
- request_id: z.string(),
531
- timestamp: z.number(),
532
- duration: z.number(),
533
- audio_duration: z.number(),
534
- input_tokens: z.number().optional(),
535
- output_tokens: z.number().optional(),
536
- streamed: z.boolean(),
537
- metadata: metadataWireSchema,
538
- });
539
-
540
- export const ttsMetricsWireSchema = z.object({
541
- type: z.literal('tts_metrics'),
542
- label: z.string(),
543
- request_id: z.string(),
544
- timestamp: z.number(),
545
- ttfb: z.number(),
546
- duration: z.number(),
547
- audio_duration: z.number(),
548
- cancelled: z.boolean(),
549
- characters_count: z.number(),
550
- input_tokens: z.number().optional(),
551
- output_tokens: z.number().optional(),
552
- streamed: z.boolean(),
553
- segment_id: z.string().nullable().optional(),
554
- speech_id: z.string().nullable().optional(),
555
- metadata: metadataWireSchema,
556
- });
557
-
558
- export const vadMetricsWireSchema = z.object({
559
- type: z.literal('vad_metrics'),
560
- label: z.string(),
561
- timestamp: z.number(),
562
- idle_time: z.number(),
563
- inference_duration_total: z.number(),
564
- inference_count: z.number(),
565
- });
566
-
567
- export const eouMetricsWireSchema = z.object({
568
- type: z.literal('eou_metrics'),
569
- timestamp: z.number(),
570
- end_of_utterance_delay: z.number(),
571
- transcription_delay: z.number(),
572
- on_user_turn_completed_delay: z.number(),
573
- speech_id: z.string().nullable().optional(),
574
- });
575
-
576
- const cachedTokenDetailsWireSchema = z.object({
577
- audio_tokens: z.number(),
578
- text_tokens: z.number(),
579
- image_tokens: z.number(),
580
- });
581
-
582
- const inputTokenDetailsWireSchema = z.object({
583
- audio_tokens: z.number(),
584
- text_tokens: z.number(),
585
- image_tokens: z.number(),
586
- cached_tokens: z.number(),
587
- cached_tokens_details: cachedTokenDetailsWireSchema.nullable().optional(),
588
- });
589
-
590
- const outputTokenDetailsWireSchema = z.object({
591
- text_tokens: z.number(),
592
- audio_tokens: z.number(),
593
- image_tokens: z.number(),
594
- });
595
-
596
- export const realtimeModelMetricsWireSchema = z.object({
597
- type: z.literal('realtime_model_metrics'),
598
- label: z.string(),
599
- request_id: z.string(),
600
- timestamp: z.number(),
601
- duration: z.number(),
602
- session_duration: z.number().optional(),
603
- ttft: z.number(),
604
- cancelled: z.boolean(),
605
- input_tokens: z.number(),
606
- output_tokens: z.number(),
607
- total_tokens: z.number(),
608
- tokens_per_second: z.number(),
609
- input_token_details: inputTokenDetailsWireSchema,
610
- output_token_details: outputTokenDetailsWireSchema,
611
- metadata: metadataWireSchema,
612
- });
613
-
614
- export const interruptionMetricsWireSchema = z.object({
615
- type: z.literal('interruption_metrics'),
616
- timestamp: z.number(),
617
- total_duration: z.number(),
618
- prediction_duration: z.number(),
619
- detection_delay: z.number(),
620
- num_interruptions: z.number(),
621
- num_backchannels: z.number(),
622
- num_requests: z.number(),
623
- metadata: metadataWireSchema,
624
- });
625
-
626
- export const agentMetricsWireSchema = z.discriminatedUnion('type', [
627
- llmMetricsWireSchema,
628
- sttMetricsWireSchema,
629
- ttsMetricsWireSchema,
630
- vadMetricsWireSchema,
631
- eouMetricsWireSchema,
632
- realtimeModelMetricsWireSchema,
633
- interruptionMetricsWireSchema,
634
- ]);
635
-
636
- // ---------------------------------------------------------------------------
637
- // Model usage schemas
638
- // ---------------------------------------------------------------------------
639
-
640
- export const llmModelUsageWireSchema = z.object({
641
- type: z.literal('llm_usage'),
642
- provider: z.string().optional(),
643
- model: z.string().optional(),
644
- input_tokens: z.number().optional(),
645
- input_cached_tokens: z.number().optional(),
646
- input_audio_tokens: z.number().optional(),
647
- input_cached_audio_tokens: z.number().optional(),
648
- input_text_tokens: z.number().optional(),
649
- input_cached_text_tokens: z.number().optional(),
650
- input_image_tokens: z.number().optional(),
651
- input_cached_image_tokens: z.number().optional(),
652
- output_tokens: z.number().optional(),
653
- output_audio_tokens: z.number().optional(),
654
- output_text_tokens: z.number().optional(),
655
- session_duration: z.number().optional(),
656
- });
657
-
658
- export const ttsModelUsageWireSchema = z.object({
659
- type: z.literal('tts_usage'),
660
- provider: z.string().optional(),
661
- model: z.string().optional(),
662
- input_tokens: z.number().optional(),
663
- output_tokens: z.number().optional(),
664
- characters_count: z.number().optional(),
665
- audio_duration: z.number().optional(),
666
- });
667
-
668
- export const sttModelUsageWireSchema = z.object({
669
- type: z.literal('stt_usage'),
670
- provider: z.string().optional(),
671
- model: z.string().optional(),
672
- input_tokens: z.number().optional(),
673
- output_tokens: z.number().optional(),
674
- audio_duration: z.number().optional(),
675
- });
676
-
677
- export const interruptionModelUsageWireSchema = z.object({
678
- type: z.literal('interruption_usage'),
679
- provider: z.string().optional(),
680
- model: z.string().optional(),
681
- total_requests: z.number().optional(),
682
- });
683
-
684
- export const modelUsageWireSchema = z.discriminatedUnion('type', [
685
- llmModelUsageWireSchema,
686
- ttsModelUsageWireSchema,
687
- sttModelUsageWireSchema,
688
- interruptionModelUsageWireSchema,
689
- ]);
690
-
691
- export const agentSessionUsageWireSchema = z.object({
692
- model_usage: z.array(modelUsageWireSchema),
693
- });
694
-
695
- // ---------------------------------------------------------------------------
696
- // Client event schemas
697
- // ---------------------------------------------------------------------------
698
-
699
- const agentStateSchema = z.enum(['initializing', 'idle', 'listening', 'thinking', 'speaking']);
700
- const userStateSchema = z.enum(['speaking', 'listening', 'away']);
701
-
702
- export const clientAgentStateChangedSchema = z.object({
703
- type: z.literal('agent_state_changed'),
704
- old_state: agentStateSchema,
705
- new_state: agentStateSchema,
706
- created_at: z.number(),
707
- });
708
-
709
- export const clientUserStateChangedSchema = z.object({
710
- type: z.literal('user_state_changed'),
711
- old_state: userStateSchema,
712
- new_state: userStateSchema,
713
- created_at: z.number(),
714
- });
715
-
716
- export const clientConversationItemAddedSchema = z.object({
717
- type: z.literal('conversation_item_added'),
718
- item: chatMessageWireSchema,
719
- created_at: z.number(),
720
- });
721
-
722
- export const clientUserInputTranscribedSchema = z.object({
723
- type: z.literal('user_input_transcribed'),
724
- transcript: z.string(),
725
- is_final: z.boolean(),
726
- language: z.string().nullable(),
727
- created_at: z.number(),
728
- });
729
-
730
- export const clientFunctionToolsExecutedSchema = z.object({
731
- type: z.literal('function_tools_executed'),
732
- function_calls: z.array(functionCallWireSchema),
733
- function_call_outputs: z.array(functionCallOutputWireSchema.nullable()),
734
- created_at: z.number(),
735
- });
736
-
737
- export const clientMetricsCollectedSchema = z.object({
738
- type: z.literal('metrics_collected'),
739
- metrics: agentMetricsWireSchema,
740
- created_at: z.number(),
741
- });
742
-
743
- export const clientErrorSchema = z.object({
744
- type: z.literal('error'),
745
- message: z.string(),
746
- created_at: z.number(),
747
- });
748
-
749
- export const clientUserOverlappingSpeechSchema = z.object({
750
- type: z.literal('user_overlapping_speech'),
751
- is_interruption: z.boolean(),
752
- created_at: z.number(),
753
- sent_at: z.number(),
754
- detection_delay: z.number(),
755
- overlap_started_at: z.number().nullable(),
756
- });
757
-
758
- export const clientSessionUsageSchema = z.object({
759
- type: z.literal('session_usage'),
760
- usage: agentSessionUsageWireSchema,
761
- created_at: z.number(),
762
- });
763
-
764
- export const clientEventSchema = z.discriminatedUnion('type', [
765
- clientAgentStateChangedSchema,
766
- clientUserStateChangedSchema,
767
- clientConversationItemAddedSchema,
768
- clientUserInputTranscribedSchema,
769
- clientFunctionToolsExecutedSchema,
770
- clientMetricsCollectedSchema,
771
- clientErrorSchema,
772
- clientUserOverlappingSpeechSchema,
773
- clientSessionUsageSchema,
774
- ]);
775
-
776
- // ---------------------------------------------------------------------------
777
- // RPC schemas
778
- // ---------------------------------------------------------------------------
779
-
780
- export const sendMessageRequestSchema = z.object({
781
- text: z.string(),
782
- });
783
-
784
- export const streamRequestSchema = z.object({
785
- request_id: z.string(),
786
- method: z.string(),
787
- payload: z.string(),
788
- });
789
-
790
- export const streamResponseSchema = z.object({
791
- request_id: z.string(),
792
- payload: z.string(),
793
- error: z.string().nullable().optional(),
794
- });
795
-
796
- export const getSessionStateResponseSchema = z.object({
797
- agent_state: agentStateSchema,
798
- user_state: userStateSchema,
799
- agent_id: z.string(),
800
- options: z.record(z.string(), z.unknown()),
801
- created_at: z.number(),
802
- });
803
-
804
- export const getChatHistoryResponseSchema = z.object({
805
- items: z.array(chatItemWireSchema),
806
- });
807
-
808
- export const getAgentInfoResponseSchema = z.object({
809
- id: z.string(),
810
- instructions: z.string().nullable(),
811
- tools: z.array(z.string()),
812
- chat_ctx: z.array(chatItemWireSchema),
813
- });
814
-
815
- export const sendMessageResponseSchema = z.object({
816
- items: z.array(chatItemWireSchema),
817
- });
818
-
819
- export const getRTCStatsResponseSchema = z.object({
820
- publisher_stats: z.array(z.record(z.string(), z.unknown())),
821
- subscriber_stats: z.array(z.record(z.string(), z.unknown())),
822
- });
823
-
824
- export const getSessionUsageResponseSchema = z.object({
825
- usage: agentSessionUsageWireSchema,
826
- created_at: z.number(),
827
- });