realtime-avatar 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/AGENTS.md +132 -0
  2. package/CLAUDE.md +17 -0
  3. package/LICENSE +21 -0
  4. package/README.md +254 -0
  5. package/dist/api-keys.d.ts +26 -0
  6. package/dist/api-keys.d.ts.map +1 -0
  7. package/dist/api-keys.js +88 -0
  8. package/dist/api-keys.js.map +1 -0
  9. package/dist/browser/audio.d.ts +65 -0
  10. package/dist/browser/audio.d.ts.map +1 -0
  11. package/dist/browser/audio.js +154 -0
  12. package/dist/browser/audio.js.map +1 -0
  13. package/dist/browser/boomerang.d.ts +38 -0
  14. package/dist/browser/boomerang.d.ts.map +1 -0
  15. package/dist/browser/boomerang.js +85 -0
  16. package/dist/browser/boomerang.js.map +1 -0
  17. package/dist/browser/index.d.ts +8 -0
  18. package/dist/browser/index.d.ts.map +1 -0
  19. package/dist/browser/index.js +8 -0
  20. package/dist/browser/index.js.map +1 -0
  21. package/dist/browser/media-session.d.ts +43 -0
  22. package/dist/browser/media-session.d.ts.map +1 -0
  23. package/dist/browser/media-session.js +169 -0
  24. package/dist/browser/media-session.js.map +1 -0
  25. package/dist/browser/player.d.ts +162 -0
  26. package/dist/browser/player.d.ts.map +1 -0
  27. package/dist/browser/player.js +514 -0
  28. package/dist/browser/player.js.map +1 -0
  29. package/dist/browser/view.d.ts +47 -0
  30. package/dist/browser/view.d.ts.map +1 -0
  31. package/dist/browser/view.js +7 -0
  32. package/dist/browser/view.js.map +1 -0
  33. package/dist/browser/webrtc.d.ts +21 -0
  34. package/dist/browser/webrtc.d.ts.map +1 -0
  35. package/dist/browser/webrtc.js +149 -0
  36. package/dist/browser/webrtc.js.map +1 -0
  37. package/dist/browser/yuv-canvas.d.ts +13 -0
  38. package/dist/browser/yuv-canvas.d.ts.map +1 -0
  39. package/dist/browser/yuv-canvas.js +95 -0
  40. package/dist/browser/yuv-canvas.js.map +1 -0
  41. package/dist/client.d.ts +195 -0
  42. package/dist/client.d.ts.map +1 -0
  43. package/dist/client.js +440 -0
  44. package/dist/client.js.map +1 -0
  45. package/dist/errors.d.ts +33 -0
  46. package/dist/errors.d.ts.map +1 -0
  47. package/dist/errors.js +73 -0
  48. package/dist/errors.js.map +1 -0
  49. package/dist/generated/openapi.d.ts +1523 -0
  50. package/dist/generated/openapi.d.ts.map +1 -0
  51. package/dist/generated/openapi.js +6 -0
  52. package/dist/generated/openapi.js.map +1 -0
  53. package/dist/index.d.ts +14 -0
  54. package/dist/index.d.ts.map +1 -0
  55. package/dist/index.js +15 -0
  56. package/dist/index.js.map +1 -0
  57. package/dist/media.d.ts +40 -0
  58. package/dist/media.d.ts.map +1 -0
  59. package/dist/media.js +4 -0
  60. package/dist/media.js.map +1 -0
  61. package/dist/mux.d.ts +104 -0
  62. package/dist/mux.d.ts.map +1 -0
  63. package/dist/mux.js +290 -0
  64. package/dist/mux.js.map +1 -0
  65. package/dist/platform.d.ts +163 -0
  66. package/dist/platform.d.ts.map +1 -0
  67. package/dist/platform.js +5 -0
  68. package/dist/platform.js.map +1 -0
  69. package/dist/react/index.d.ts +5 -0
  70. package/dist/react/index.d.ts.map +1 -0
  71. package/dist/react/index.js +5 -0
  72. package/dist/react/index.js.map +1 -0
  73. package/dist/react/provider.d.ts +37 -0
  74. package/dist/react/provider.d.ts.map +1 -0
  75. package/dist/react/provider.js +33 -0
  76. package/dist/react/provider.js.map +1 -0
  77. package/dist/react/realtime.d.ts +74 -0
  78. package/dist/react/realtime.d.ts.map +1 -0
  79. package/dist/react/realtime.js +105 -0
  80. package/dist/react/realtime.js.map +1 -0
  81. package/dist/react/session.d.ts +91 -0
  82. package/dist/react/session.d.ts.map +1 -0
  83. package/dist/react/session.js +322 -0
  84. package/dist/react/session.js.map +1 -0
  85. package/dist/react/stage.d.ts +23 -0
  86. package/dist/react/stage.d.ts.map +1 -0
  87. package/dist/react/stage.js +62 -0
  88. package/dist/react/stage.js.map +1 -0
  89. package/dist/schemas.d.ts +59 -0
  90. package/dist/schemas.d.ts.map +1 -0
  91. package/dist/schemas.js +58 -0
  92. package/dist/schemas.js.map +1 -0
  93. package/dist/server.d.ts +2 -0
  94. package/dist/server.d.ts.map +1 -0
  95. package/dist/server.js +8 -0
  96. package/dist/server.js.map +1 -0
  97. package/dist/session-socket.d.ts +96 -0
  98. package/dist/session-socket.d.ts.map +1 -0
  99. package/dist/session-socket.js +299 -0
  100. package/dist/session-socket.js.map +1 -0
  101. package/dist/session.d.ts +107 -0
  102. package/dist/session.d.ts.map +1 -0
  103. package/dist/session.js +192 -0
  104. package/dist/session.js.map +1 -0
  105. package/dist/types.d.ts +24 -0
  106. package/dist/types.d.ts.map +1 -0
  107. package/dist/types.js +2 -0
  108. package/dist/types.js.map +1 -0
  109. package/package.json +94 -0
@@ -0,0 +1,162 @@
1
+ import type { RealtimeTurnStream } from "../client";
2
+ import type { AvatarTurnEventSource } from "../session-socket";
3
+ import { type PlayoutDelay } from "./audio";
4
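+ /** True when this browser exposes the WebCodecs `VideoDecoder` and `EncodedVideoChunk` APIs the h264 avatar stream needs (API presence only; actual codec support is not probed). */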
5
+ export declare function supportsH264Playback(): boolean;
6
+ export type AvatarPlayerState = "idle" | "thinking" | "speaking" | "done" | "error";
7
+ export type AvatarPlayerMetrics = {
8
+ /** Total video frames received this turn. */
9
+ frames: number;
10
+ /** Frames dropped to stay in sync with the audio clock. */
11
+ droppedFrames: number;
12
+ /** ms from turn start to the first audio chunk. */
13
+ firstAudioMs: number | null;
14
+ /** ms from turn start to the first video frame. */
15
+ firstVideoMs: number | null;
16
+ /** ms from turn start to the first frame DRAWN on the running audio clock —
17
+ * the user-perceived time-to-first-frame (audio is playing by then). */
18
+ firstFrameDrawnMs: number | null;
19
+ /** Currently buffered audio, in ms. */
20
+ audioQueueMs: number;
21
+ /** Accumulated audio underrun (gaps the scheduler had to paper over), in ms. */
22
+ audioUnderrunMs: number;
23
+ width: number | null;
24
+ height: number | null;
25
+ };
26
+ export type AvatarPlayHandlers = {
27
+ /** Streaming assistant text: `delta` is the new text, `full` the accumulated reply. */
28
+ onText?: (delta: string, full: string) => void;
29
+ onState?: (state: AvatarPlayerState) => void;
30
+ onMetrics?: (metrics: AvatarPlayerMetrics) => void;
31
+ };
32
+ /** Final source-video playback position from the server's `done` event. */
33
+ export type AvatarSourceVideoState = {
34
+ /** Ping-pong cursor; pass back as the next turn's `source_start_frame`. */
35
+ cursor: number;
36
+ /** Source frame index the render stopped on. */
37
+ index: number;
38
+ direction: "forward" | "reverse";
39
+ /** Source frame count in the cache. */
40
+ frames: number;
41
+ };
42
+ export type AvatarPlaySummary = {
43
+ text: string;
44
+ frames: number;
45
+ elapsedMs: number;
46
+ metrics: AvatarPlayerMetrics;
47
+ /** Present on source-video turns: where to resume the idle loop seamlessly. */
48
+ sourceVideo?: AvatarSourceVideoState;
49
+ };
50
+ export type AvatarPlayerOptions = {
51
+ sampleRate?: number;
52
+ /**
53
+ * `"adaptive"` (default) starts audio the moment the first video frame is
54
+ * decodable (or after a short cap for audio-only turns), minimizing
55
+ * perceived first-frame latency. A number reproduces the legacy fixed
56
+ * playout delay in ms.
57
+ */
58
+ playoutDelayMs?: PlayoutDelay;
59
+ };
60
+ /**
61
+ * Renders an avatar turn stream to a canvas with audio-clocked video playback.
62
+ *
63
+ * This is the piece every integrator would otherwise hand-roll: it schedules
64
+ * PCM audio, decodes/queues video frames, and drives a `requestAnimationFrame`
65
+ * clock that draws each frame on its audio-aligned presentation time, dropping
66
+ * late frames so lip-sync never drifts.
67
+ *
68
+ * const player = new AvatarPlayer();
69
+ * player.attach(canvasEl);
70
+ * await player.play(await session.chat("who are you?"));
71
+ */
72
+ export declare class AvatarPlayer {
73
+ private readonly sampleRate;
74
+ private readonly playoutDelay;
75
+ private canvas;
76
+ private scheduler;
77
+ private renderer;
78
+ private queue;
79
+ private pendingJpeg;
80
+ private decoding;
81
+ /** Lazily-configured WebCodecs decoder for `codec: "h264"` streams. */
82
+ private videoDecoder;
83
+ private rafHandle;
84
+ private state;
85
+ private metricsState;
86
+ /** Persistent, gesture-unlocked AudioContext reused across turns. */
87
+ private audioContext;
88
+ /** Lazily-created recording tap; every scheduled audio buffer also feeds it. */
89
+ private audioTap;
90
+ private playStartedAt;
91
+ constructor(options?: AvatarPlayerOptions);
92
+ /** Bind (or rebind) the canvas the player draws into. */
93
+ attach(canvas: HTMLCanvasElement | null): void;
94
+ /**
95
+ * Unlock audio from inside a user gesture (click/tap). Browsers start an
96
+ * AudioContext suspended and only let it resume during a gesture; turns that
97
+ * fire later (e.g. a livestream auto-reaction from a timer/effect) would
98
+ * otherwise play silently. Call this from the first user interaction. Safe to
99
+ * call repeatedly. Resolves once the resume attempt settles, even if it fails (or no-ops off-DOM).
100
+ */
101
+ unlock(): Promise<void>;
102
+ get metrics(): AvatarPlayerMetrics;
103
+ /**
104
+ * A live `MediaStream` carrying everything the player schedules to the
105
+ * speakers — pair it with `canvas.captureStream()` + `MediaRecorder` to
106
+ * record a turn exactly as it played. The tap is additive (speaker output is
107
+ * unchanged) and persists across turns. Returns `null` until the player has
108
+ * an AudioContext — call `unlock()` (any user gesture) first.
109
+ */
110
+ captureAudioStream(): MediaStream | null;
111
+ /**
112
+ * Play a turn to completion. Accepts anything that exposes turn events —
113
+ * an HTTP `RealtimeTurnStream` or a `RealtimeSessionSocket.turn()` source.
114
+ * Resolves with a summary when the stream ends. Pass the same `signal` used
115
+ * to create the stream so `stop()` and the network abort together.
116
+ */
117
+ play(stream: RealtimeTurnStream | AvatarTurnEventSource, handlers?: AvatarPlayHandlers & {
118
+ signal?: AbortSignal;
119
+ }): Promise<AvatarPlaySummary>;
120
+ /**
121
+ * After the stream ends, keep the audio clock + render loop running until the
122
+ * buffered audio has fully played out and the video queue has drained, so the
123
+ * tail of the turn actually renders instead of freezing on the last frame.
124
+ */
125
+ private drainPlayback;
126
+ /** Stop playback: cancel the clock, close audio, clear the queue and canvas. */
127
+ stop(): void;
128
+ /** Fully release resources, including the persistent AudioContext. Call on
129
+ * unmount; after this, unlock() must be called again before the next turn. */
130
+ dispose(): void;
131
+ private handleEvent;
132
+ private queueVideo;
133
+ /**
134
+ * Decodes the pending-JPEG queue into bitmaps with bounded concurrency, off
135
+ * the stream loop. Concurrency matters: serial `await createImageBitmap` for
136
+ * tall (e.g. 832px) frames can exceed the per-frame realtime budget, so decode
137
+ * falls permanently behind the audio clock and the render queue starves —
138
+ * which is exactly the "audio plays but video freezes" stall. Decoding a few
139
+ * frames in parallel keeps throughput ahead of realtime.
140
+ *
141
+ * Frames are inserted into the render queue in pts order regardless of which
142
+ * decode finishes first, and we never drop here — the audio-clocked render
143
+ * loop is the single place that drops late frames.
144
+ */
145
+ private pumpJpegDecode;
146
+ /**
147
+ * Feed h264 access units to a WebCodecs decoder. Each server chunk is a
148
+ * self-contained Annex-B stream (AUD + SPS/PPS + IDR + P-frames), so any
149
+ * chunk can start decode and dropped chunks never corrupt later ones.
150
+ * Decoded VideoFrames land in the same pts-ordered queue the render clock
151
+ * already drains; hardware decode happens off the event loop.
152
+ */
153
+ private queueH264;
154
+ /** Insert a decoded frame into the render queue keeping it sorted by pts.
155
+ * A frame in the queue means there is something to lip-sync against, so
156
+ * this is also where adaptive playout releases held audio. */
157
+ private insertOrdered;
158
+ private startVideoClock;
159
+ private draw;
160
+ private setState;
161
+ }
162
+ //# sourceMappingURL=player.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"player.d.ts","sourceRoot":"","sources":["../../src/browser/player.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,WAAW,CAAC;AACpD,OAAO,KAAK,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC;AAC/D,OAAO,EAAuB,KAAK,YAAY,EAAE,MAAM,SAAS,CAAC;AAcjE,yEAAyE;AACzE,wBAAgB,oBAAoB,IAAI,OAAO,CAE9C;AAED,MAAM,MAAM,iBAAiB,GAAG,MAAM,GAAG,UAAU,GAAG,UAAU,GAAG,MAAM,GAAG,OAAO,CAAC;AAEpF,MAAM,MAAM,mBAAmB,GAAG;IAChC,6CAA6C;IAC7C,MAAM,EAAE,MAAM,CAAC;IACf,2DAA2D;IAC3D,aAAa,EAAE,MAAM,CAAC;IACtB,mDAAmD;IACnD,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,mDAAmD;IACnD,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B;6EACyE;IACzE,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,uCAAuC;IACvC,YAAY,EAAE,MAAM,CAAC;IACrB,gFAAgF;IAChF,eAAe,EAAE,MAAM,CAAC;IACxB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;CACvB,CAAC;AAEF,MAAM,MAAM,kBAAkB,GAAG;IAC/B,uFAAuF;IACvF,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,KAAK,IAAI,CAAC;IAC/C,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE,iBAAiB,KAAK,IAAI,CAAC;IAC7C,SAAS,CAAC,EAAE,CAAC,OAAO,EAAE,mBAAmB,KAAK,IAAI,CAAC;CACpD,CAAC;AAEF,2EAA2E;AAC3E,MAAM,MAAM,sBAAsB,GAAG;IACnC,2EAA2E;IAC3E,MAAM,EAAE,MAAM,CAAC;IACf,gDAAgD;IAChD,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,SAAS,GAAG,SAAS,CAAC;IACjC,uCAAuC;IACvC,MAAM,EAAE,MAAM,CAAC;CAChB,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,mBAAmB,CAAC;IAC7B,+EAA+E;IAC/E,WAAW,CAAC,EAAE,sBAAsB,CAAC;CACtC,CAAC;AAEF,MAAM,MAAM,mBAAmB,GAAG;IAChC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;;;;OAKG;IACH,cAAc,CAAC,EAAE,YAAY,CAAC;CAC/B,CAAC;AAEF;;;;;;;;;;;GAWG;AACH,qBAAa,YAAY;IACvB,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;IACpC,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAe;IAC5C,OAAO,CAAC,MAAM,CAAkC;IAChD,OAAO,CAAC,SAAS,CAAoC;IACrD,OAAO,CAAC,QAAQ,CAAmC;IACnD,OAAO,CAAC,KAAK,CAAqB;IAClC,OAAO,CAAC,WAAW,CAA0B;IAC7C,OAAO,CAAC,QAAQ,CAAS;IACzB,uEAAuE;IACvE,OAAO,CAAC,YAAY,CAA6B;IACjD,OAAO,CAAC,SAAS,CAAuB;IACxC,OAAO,CAAC,KAAK,CAA6B;IAC1C,OAAO,CAAC,YAAY,CAAuC;IAC3D,qEAAqE;IACrE,OAAO,CAAC,YAAY,CAA6B;IACjD,gFAAgF;IAChF,OA
AO,CAAC,QAAQ,CAAgD;IAChE,OAAO,CAAC,aAAa,CAAK;gBAEd,OAAO,GAAE,mBAAwB;IAK7C,yDAAyD;IACzD,MAAM,CAAC,MAAM,EAAE,iBAAiB,GAAG,IAAI,GAAG,IAAI;IAI9C;;;;;;OAMG;IACG,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC;IAY7B,IAAI,OAAO,IAAI,mBAAmB,CAEjC;IAED;;;;;;OAMG;IACH,kBAAkB,IAAI,WAAW,GAAG,IAAI;IAOxC;;;;;OAKG;IACG,IAAI,CACR,MAAM,EAAE,kBAAkB,GAAG,qBAAqB,EAClD,QAAQ,GAAE,kBAAkB,GAAG;QAAE,MAAM,CAAC,EAAE,WAAW,CAAA;KAAO,GAC3D,OAAO,CAAC,iBAAiB,CAAC;IA+C7B;;;;OAIG;YACW,aAAa;IAoB3B,gFAAgF;IAChF,IAAI,IAAI,IAAI;IAsBZ;mFAC+E;IAC/E,OAAO,IAAI,IAAI;YAQD,WAAW;IA4DzB,OAAO,CAAC,UAAU;IA0ClB;;;;;;;;;;;OAWG;YACW,cAAc;IA2B5B;;;;;;OAMG;IACH,OAAO,CAAC,SAAS;IA+CjB;;mEAE+D;IAC/D,OAAO,CAAC,aAAa;IAiBrB,OAAO,CAAC,eAAe;IA8BvB,OAAO,CAAC,IAAI;IAaZ,OAAO,CAAC,QAAQ;CAKjB"}
@@ -0,0 +1,514 @@
1
+ import { Pcm16AudioScheduler } from "./audio";
2
+ import { I420CanvasRenderer } from "./yuv-canvas";
3
+ /** True when this browser can hardware-decode the h264 avatar stream. */
4
+ export function supportsH264Playback() {
5
+ return typeof VideoDecoder === "function" && typeof EncodedVideoChunk === "function";
6
+ }
7
+ /**
8
+ * Renders an avatar turn stream to a canvas with audio-clocked video playback.
9
+ *
10
+ * This is the piece every integrator would otherwise hand-roll: it schedules
11
+ * PCM audio, decodes/queues video frames, and drives a `requestAnimationFrame`
12
+ * clock that draws each frame on its audio-aligned presentation time, dropping
13
+ * late frames so lip-sync never drifts.
14
+ *
15
+ * const player = new AvatarPlayer();
16
+ * player.attach(canvasEl);
17
+ * await player.play(await session.chat("who are you?"));
18
+ */
19
+ export class AvatarPlayer {
20
+ sampleRate;
21
+ playoutDelay;
22
+ canvas = null;
23
+ scheduler = null;
24
+ renderer = null;
25
+ queue = [];
26
+ pendingJpeg = [];
27
+ decoding = false;
28
+ /** Lazily-configured WebCodecs decoder for `codec: "h264"` streams. */
29
+ videoDecoder = null;
30
+ rafHandle = null;
31
+ state = "idle";
32
+ metricsState = emptyMetrics();
33
+ /** Persistent, gesture-unlocked AudioContext reused across turns. */
34
+ audioContext = null;
35
+ /** Lazily-created recording tap; every scheduled audio buffer also feeds it. */
36
+ audioTap = null;
37
+ playStartedAt = 0;
38
+ constructor(options = {}) {
39
+ this.sampleRate = options.sampleRate ?? 16_000;
40
+ this.playoutDelay = options.playoutDelayMs ?? "adaptive";
41
+ }
42
+ /** Bind (or rebind) the canvas the player draws into. */
43
+ attach(canvas) {
44
+ this.canvas = canvas;
45
+ }
46
+ /**
47
+ * Unlock audio from inside a user gesture (click/tap). Browsers start an
48
+ * AudioContext suspended and only let it resume during a gesture; turns that
49
+ * fire later (e.g. a livestream auto-reaction from a timer/effect) would
50
+ * otherwise play silently. Call this from the first user interaction. Safe to
51
+ * call repeatedly. Resolves once the context is running (or no-ops off-DOM).
52
+ */
53
+ async unlock() {
54
+ const AudioContextCtor = typeof window !== "undefined"
55
+ ? window.AudioContext || window.webkitAudioContext
56
+ : undefined;
57
+ if (!AudioContextCtor)
58
+ return;
59
+ this.audioContext ??= new AudioContextCtor({ latencyHint: "interactive" });
60
+ if (this.audioContext.state !== "running") {
61
+ await this.audioContext.resume().catch(() => { });
62
+ }
63
+ }
64
+ get metrics() {
65
+ return { ...this.metricsState };
66
+ }
67
+ /**
68
+ * A live `MediaStream` carrying everything the player schedules to the
69
+ * speakers — pair it with `canvas.captureStream()` + `MediaRecorder` to
70
+ * record a turn exactly as it played. The tap is additive (speaker output is
71
+ * unchanged) and persists across turns. Returns `null` until the player has
72
+ * an AudioContext — call `unlock()` (any user gesture) first.
73
+ */
74
+ captureAudioStream() {
75
+ const context = this.audioContext;
76
+ if (!context || typeof context.createMediaStreamDestination !== "function")
77
+ return null;
78
+ this.audioTap ??= context.createMediaStreamDestination();
79
+ return this.audioTap.stream;
80
+ }
81
+ /**
82
+ * Play a turn to completion. Accepts anything that exposes turn events —
83
+ * an HTTP `RealtimeTurnStream` or a `RealtimeSessionSocket.turn()` source.
84
+ * Resolves with a summary when the stream ends. Pass the same `signal` used
85
+ * to create the stream so `stop()` and the network abort together.
86
+ */
87
+ async play(stream, handlers = {}) {
88
+ this.stop();
89
+ const startMs = now();
90
+ this.playStartedAt = startMs;
91
+ const signal = handlers.signal;
92
+ this.metricsState = emptyMetrics();
93
+ this.scheduler = new Pcm16AudioScheduler(this.sampleRate, this.playoutDelay, this.audioContext, {}, this.audioTap);
94
+ await this.scheduler.prepare();
95
+ this.setState("thinking", handlers);
96
+ this.startVideoClock();
97
+ let text = "";
98
+ let frames = 0;
99
+ let elapsedMs = 0;
100
+ let sourceVideo;
101
+ let streamErrored = false;
102
+ try {
103
+ for await (const event of stream.events) {
104
+ if (signal?.aborted)
105
+ break;
106
+ const handled = await this.handleEvent(event, startMs, text, handlers);
107
+ text = handled.text;
108
+ if (handled.frames !== undefined)
109
+ frames = handled.frames;
110
+ if (handled.elapsedMs !== undefined)
111
+ elapsedMs = handled.elapsedMs;
112
+ if (handled.sourceVideo)
113
+ sourceVideo = handled.sourceVideo;
114
+ if (handled.done)
115
+ break;
116
+ }
117
+ }
118
+ catch (error) {
119
+ streamErrored = true;
120
+ throw error;
121
+ }
122
+ finally {
123
+ // CRITICAL: the network stream ends well before playback does — there is
124
+ // ~playoutDelayMs of buffered audio plus a video queue still waiting for
125
+ // its presentation time. Do NOT tear the scheduler down here, or the
126
+ // render clock loses its time source and the buffered video tail freezes
127
+ // on the last drawn frame while the audio keeps playing. Instead keep the
128
+ // clock alive and drain to the end of playback first (unless aborted or
129
+ // the stream errored).
130
+ if (!signal?.aborted && !streamErrored) {
131
+ await this.drainPlayback(signal);
132
+ }
133
+ this.scheduler?.close();
134
+ this.scheduler = null;
135
+ }
136
+ this.setState("done", handlers);
137
+ return { text, frames, elapsedMs: elapsedMs || now() - startMs, metrics: this.metrics, sourceVideo };
138
+ }
139
+ /**
140
+ * After the stream ends, keep the audio clock + render loop running until the
141
+ * buffered audio has fully played out and the video queue has drained, so the
142
+ * tail of the turn actually renders instead of freezing on the last frame.
143
+ */
144
+ async drainPlayback(signal) {
145
+ const scheduler = this.scheduler;
146
+ if (!scheduler)
147
+ return;
148
+ const deadline = now() + 12_000; // hard cap so we never hang
149
+ while (now() < deadline) {
150
+ if (signal?.aborted)
151
+ return;
152
+ // Finish decoding anything still pending (JPEG pool or WebCodecs queue).
153
+ if (this.pendingJpeg.length || this.decoding || (this.videoDecoder?.decodeQueueSize ?? 0) > 0) {
154
+ await sleep(30);
155
+ continue;
156
+ }
157
+ const mediaTime = scheduler.mediaTimeSeconds;
158
+ const lastPts = this.queue.length ? this.queue[this.queue.length - 1].pts : null;
159
+ const audioRemainingMs = scheduler.queuedMs;
160
+ const videoDrained = lastPts === null || (mediaTime !== null && mediaTime >= lastPts);
161
+ if (videoDrained && audioRemainingMs <= 30)
162
+ return; // everything played out
163
+ await sleep(30);
164
+ }
165
+ }
166
+ /** Stop playback: cancel the clock, close audio, clear the queue and canvas. */
167
+ stop() {
168
+ if (this.rafHandle !== null) {
169
+ cancelAnimationFrame(this.rafHandle);
170
+ this.rafHandle = null;
171
+ }
172
+ this.queue.forEach(closeQueuedFrame);
173
+ this.queue = [];
174
+ this.pendingJpeg = [];
175
+ if (this.videoDecoder && this.videoDecoder.state !== "closed") {
176
+ try {
177
+ this.videoDecoder.close();
178
+ }
179
+ catch {
180
+ // already closing
181
+ }
182
+ }
183
+ this.videoDecoder = null;
184
+ this.scheduler?.close();
185
+ this.scheduler = null;
186
+ this.renderer?.reset();
187
+ clearCanvas(this.canvas);
188
+ }
189
+ /** Fully release resources, including the persistent AudioContext. Call on
190
+ * unmount; after this, unlock() must be called again before the next turn. */
191
+ dispose() {
192
+ this.stop();
193
+ this.audioTap = null; // bound to the context being closed below
194
+ const context = this.audioContext;
195
+ this.audioContext = null;
196
+ if (context && context.state !== "closed")
197
+ void context.close();
198
+ }
199
+ async handleEvent(event, startMs, priorText, handlers) {
200
+ const header = event.header;
201
+ switch (header.type) {
202
+ case "start":
203
+ this.setState("speaking", handlers);
204
+ return { text: priorText };
205
+ case "text_delta": {
206
+ const delta = new TextDecoder().decode(event.payload);
207
+ const text = priorText + delta;
208
+ handlers.onText?.(delta, text);
209
+ return { text };
210
+ }
211
+ case "text_done": {
212
+ const decoded = new TextDecoder().decode(event.payload);
213
+ const text = decoded || priorText;
214
+ if (decoded)
215
+ handlers.onText?.("", text);
216
+ return { text };
217
+ }
218
+ case "audio": {
219
+ const scheduled = await this.scheduler?.schedule(event.payload);
220
+ this.metricsState.firstAudioMs ??= Math.round(now() - startMs);
221
+ if (scheduled) {
222
+ this.metricsState.audioQueueMs = scheduled.queuedMs;
223
+ if (scheduled.underrunMs > 0)
224
+ this.metricsState.audioUnderrunMs += scheduled.underrunMs;
225
+ }
226
+ this.setState("speaking", handlers);
227
+ handlers.onMetrics?.(this.metrics);
228
+ return { text: priorText };
229
+ }
230
+ case "video":
231
+ this.queueVideo(header, event.payload);
232
+ this.metricsState.firstVideoMs ??= Math.round(now() - startMs);
233
+ handlers.onMetrics?.(this.metrics);
234
+ return { text: priorText };
235
+ case "done":
236
+ return {
237
+ text: priorText,
238
+ frames: header.frames,
239
+ elapsedMs: header.elapsedMs,
240
+ done: true,
241
+ sourceVideo: header.sourceVideo,
242
+ };
243
+ case "error":
244
+ throw new Error(header.message);
245
+ default:
246
+ return { text: priorText };
247
+ }
248
+ }
249
+ queueVideo(header, payload) {
250
+ this.metricsState.width = header.width;
251
+ this.metricsState.height = header.height;
252
+ if (header.pixelFormat === "h264") {
253
+ this.queueH264(header, payload);
254
+ return;
255
+ }
256
+ if (header.pixelFormat === "jpeg") {
257
+ // Enqueue COPIES of each frame's bytes and decode them off the event loop.
258
+ // Two reasons this must be a copy, not a subarray view:
259
+ // 1. the stream's read buffer is reused, so a deferred decode of a view
260
+ // could read bytes that a later chunk has already overwritten;
261
+ // 2. decoding inline here (await per frame) would block the stream's
262
+ // for-await loop, starving the audio scheduler and stalling playback.
263
+ let offset = 0;
264
+ for (let i = 0; i < header.frames; i += 1) {
265
+ const size = header.frameSizes?.[i] ?? 0;
266
+ this.pendingJpeg.push({
267
+ pts: (header.startFrame + i) / header.fps,
268
+ bytes: payload.slice(offset, offset + size),
269
+ width: header.width,
270
+ height: header.height,
271
+ });
272
+ offset += size;
273
+ }
274
+ this.metricsState.frames += header.frames;
275
+ void this.pumpJpegDecode();
276
+ return;
277
+ }
278
+ const frameBytes = header.frameBytes ?? 0;
279
+ const frames = [];
280
+ for (let i = 0; i < header.frames; i += 1) {
281
+ const start = i * frameBytes;
282
+ // i420 frames are drawn straight from the queue; copy so a reused stream
283
+ // buffer can't corrupt a not-yet-drawn frame.
284
+ frames.push({ kind: "i420", pts: (header.startFrame + i) / header.fps, frame: payload.slice(start, start + frameBytes), width: header.width, height: header.height });
285
+ }
286
+ this.queue.push(...frames);
287
+ this.metricsState.frames += frames.length;
288
+ if (frames.length)
289
+ this.scheduler?.startPlayout();
290
+ }
291
+ /**
292
+ * Decodes the pending-JPEG queue into bitmaps with bounded concurrency, off
293
+ * the stream loop. Concurrency matters: serial `await createImageBitmap` for
294
+ * tall (e.g. 832px) frames can exceed the per-frame realtime budget, so decode
295
+ * falls permanently behind the audio clock and the render queue starves —
296
+ * which is exactly the "audio plays but video freezes" stall. Decoding a few
297
+ * frames in parallel keeps throughput ahead of realtime.
298
+ *
299
+ * Frames are inserted into the render queue in pts order regardless of which
300
+ * decode finishes first, and we never drop here — the audio-clocked render
301
+ * loop is the single place that drops late frames.
302
+ */
303
+ async pumpJpegDecode() {
304
+ if (this.decoding)
305
+ return;
306
+ this.decoding = true;
307
+ const CONCURRENCY = 4;
308
+ try {
309
+ while (this.pendingJpeg.length) {
310
+ const batch = this.pendingJpeg.splice(0, CONCURRENCY);
311
+ const decoded = await Promise.all(batch.map(async (frame) => {
312
+ try {
313
+ const bitmap = await decodeJpegFrame(frame.bytes);
314
+ return { kind: "bitmap", pts: frame.pts, bitmap, width: frame.width, height: frame.height };
315
+ }
316
+ catch {
317
+ return null;
318
+ }
319
+ }));
320
+ for (const frame of decoded) {
321
+ if (frame)
322
+ this.insertOrdered(frame);
323
+ else
324
+ this.metricsState.droppedFrames += 1;
325
+ }
326
+ }
327
+ }
328
+ finally {
329
+ this.decoding = false;
330
+ }
331
+ }
332
+ /**
333
+ * Feed h264 access units to a WebCodecs decoder. Each server chunk is a
334
+ * self-contained Annex-B stream (AUD + SPS/PPS + IDR + P-frames), so any
335
+ * chunk can start decode and dropped chunks never corrupt later ones.
336
+ * Decoded VideoFrames land in the same pts-ordered queue the render clock
337
+ * already drains; hardware decode happens off the event loop.
338
+ */
339
+ queueH264(header, payload) {
340
+ if (!supportsH264Playback()) {
341
+ this.metricsState.droppedFrames += header.frames;
342
+ return;
343
+ }
344
+ if (!this.videoDecoder || this.videoDecoder.state === "closed") {
345
+ this.videoDecoder = new VideoDecoder({
346
+ output: (frame) => {
347
+ this.insertOrdered({
348
+ kind: "videoframe",
349
+ pts: frame.timestamp / 1_000_000,
350
+ frame,
351
+ width: header.width,
352
+ height: header.height,
353
+ });
354
+ },
355
+ error: () => {
356
+ // A decoder fault drops the rest of this turn's h264 frames; audio
357
+ // keeps playing and the next turn reconfigures a fresh decoder.
358
+ this.videoDecoder = null;
359
+ },
360
+ });
361
+ // Annex-B is implied when no description is attached. The stream is
362
+ // baseline profile level 3.1 (see avtr1_modal/realtime/h264.py).
363
+ this.videoDecoder.configure({ codec: "avc1.42001f", optimizeForLatency: true });
364
+ }
365
+ let offset = 0;
366
+ for (let i = 0; i < header.frames; i += 1) {
367
+ const size = header.frameSizes?.[i] ?? 0;
368
+ const bytes = payload.slice(offset, offset + size);
369
+ offset += size;
370
+ try {
371
+ this.videoDecoder.decode(new EncodedVideoChunk({
372
+ // Chunk-leading AUs carry the IDR; the rest are P-frames.
373
+ type: i === 0 ? "key" : "delta",
374
+ timestamp: Math.round(((header.startFrame + i) / header.fps) * 1_000_000),
375
+ data: bytes,
376
+ }));
377
+ }
378
+ catch {
379
+ this.metricsState.droppedFrames += 1;
380
+ }
381
+ }
382
+ this.metricsState.frames += header.frames;
383
+ }
384
+ /** Insert a decoded frame into the render queue keeping it sorted by pts.
385
+ * A frame in the queue means there is something to lip-sync against, so
386
+ * this is also where adaptive playout releases held audio. */
387
+ insertOrdered(frame) {
388
+ this.scheduler?.startPlayout();
389
+ const queue = this.queue;
390
+ if (queue.length === 0 || queue[queue.length - 1].pts <= frame.pts) {
391
+ queue.push(frame);
392
+ return;
393
+ }
394
+ let lo = 0;
395
+ let hi = queue.length;
396
+ while (lo < hi) {
397
+ const mid = (lo + hi) >> 1;
398
+ if (queue[mid].pts <= frame.pts)
399
+ lo = mid + 1;
400
+ else
401
+ hi = mid;
402
+ }
403
+ queue.splice(lo, 0, frame);
404
+ }
405
+ startVideoClock() {
406
+ if (this.rafHandle !== null)
407
+ cancelAnimationFrame(this.rafHandle);
408
+ const tick = () => {
409
+ const mediaTime = this.scheduler?.mediaTimeSeconds;
410
+ const canvas = this.canvas;
411
+ if (mediaTime !== null && mediaTime !== undefined && canvas) {
412
+ let drawable = null;
413
+ let dropped = 0;
414
+ while (this.queue.length && this.queue[0].pts <= mediaTime + 0.03) {
415
+ if (drawable) {
416
+ closeQueuedFrame(drawable);
417
+ dropped += 1;
418
+ }
419
+ drawable = this.queue.shift() ?? null;
420
+ }
421
+ while (this.queue.length > 36 && this.queue[0].pts < mediaTime - 0.08) {
422
+ closeQueuedFrame(this.queue.shift() ?? null);
423
+ dropped += 1;
424
+ }
425
+ if (dropped)
426
+ this.metricsState.droppedFrames += dropped;
427
+ if (drawable) {
428
+ this.metricsState.firstFrameDrawnMs ??= Math.round(now() - this.playStartedAt);
429
+ this.draw(canvas, drawable);
430
+ }
431
+ }
432
+ this.rafHandle = requestAnimationFrame(tick);
433
+ };
434
+ this.rafHandle = requestAnimationFrame(tick);
435
+ }
436
+ draw(canvas, frame) {
437
+ if (frame.kind === "bitmap") {
438
+ drawBitmapFrame(canvas, frame);
439
+ return;
440
+ }
441
+ if (frame.kind === "videoframe") {
442
+ drawVideoFrame(canvas, frame);
443
+ return;
444
+ }
445
+ this.renderer ??= new I420CanvasRenderer();
446
+ this.renderer.draw(canvas, frame.frame, frame.width, frame.height, Math.round(frame.pts * 1_000_000));
447
+ }
448
+ setState(state, handlers) {
449
+ if (this.state === state)
450
+ return;
451
+ this.state = state;
452
+ handlers.onState?.(state);
453
+ }
454
+ }
455
/**
 * Build a fresh metrics record for a new turn: counters at zero and every
 * "not measured yet" field set to null. A new object is returned each call.
 */
function emptyMetrics() {
  const unset = null;
  const zeroed = {
    frames: 0,
    droppedFrames: 0,
    firstAudioMs: unset,
    firstVideoMs: unset,
    firstFrameDrawnMs: unset,
    audioQueueMs: 0,
    audioUnderrunMs: 0,
    width: unset,
    height: unset,
  };
  return zeroed;
}
468
+ function closeQueuedFrame(frame) {
469
+ if (frame?.kind === "bitmap")
470
+ frame.bitmap.close();
471
+ if (frame?.kind === "videoframe")
472
+ frame.frame.close();
473
+ }
474
+ function drawVideoFrame(canvas, frame) {
475
+ if (canvas.width !== frame.width)
476
+ canvas.width = frame.width;
477
+ if (canvas.height !== frame.height)
478
+ canvas.height = frame.height;
479
+ const context = canvas.getContext("2d", { alpha: false });
480
+ context?.drawImage(frame.frame, 0, 0, frame.width, frame.height);
481
+ frame.frame.close();
482
+ }
483
+ function clearCanvas(canvas) {
484
+ if (!canvas)
485
+ return;
486
+ const context = canvas.getContext("2d");
487
+ context?.clearRect(0, 0, canvas.width, canvas.height);
488
+ // eslint-disable-next-line no-self-assign -- reset draw state cheaply
489
+ canvas.width = canvas.width;
490
+ }
491
+ function drawBitmapFrame(canvas, frame) {
492
+ if (canvas.width !== frame.width)
493
+ canvas.width = frame.width;
494
+ if (canvas.height !== frame.height)
495
+ canvas.height = frame.height;
496
+ const context = canvas.getContext("2d", { alpha: false });
497
+ context?.drawImage(frame.bitmap, 0, 0, frame.width, frame.height);
498
+ frame.bitmap.close();
499
+ }
500
+ async function decodeJpegFrame(bytes) {
501
+ if (typeof createImageBitmap !== "function") {
502
+ throw new Error("This browser does not support realtime JPEG frame decoding.");
503
+ }
504
+ // `.slice()` does a single fast buffer copy (the Blob must own the bytes,
505
+ // since the underlying stream buffer is reused); avoid element-wise copies.
506
+ return createImageBitmap(new Blob([bytes.slice()], { type: "image/jpeg" }));
507
+ }
508
/**
 * Current timestamp in milliseconds: `performance.now()` where the
 * `performance` global exists, otherwise `Date.now()`.
 */
function now() {
  if (typeof performance === "undefined") {
    return Date.now();
  }
  return performance.now();
}
511
+ function sleep(ms) {
512
+ return new Promise((resolve) => setTimeout(resolve, ms));
513
+ }
514
+ //# sourceMappingURL=player.js.map