avbridge 2.12.1 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -38,7 +38,21 @@ export class VideoRenderer {
38
38
  private framesPainted = 0;
39
39
  private framesDroppedLate = 0;
40
40
  private framesDroppedOverflow = 0;
41
+ /** True once the head frame has been painted as a pre-roll poster
42
+ * since the last flush. Used to ensure pre-roll paints exactly one
43
+ * frame (held static) during the post-seek discard window. */
41
44
  private prerolled = false;
45
+ /** PTS (µs) of the most recently painted frame. Used as the calibration
46
+ * reference on the first post-flush snap: the pre-roll path paints one
47
+ * frame *before* PTS-based playback starts, so the queue head's PTS at
48
+ * first PTS-based paint is the *next* frame, off by one frameDur from
49
+ * the actually-displayed frame. Calibrating against the painted frame
50
+ * instead of the queue head removes that one-frame offset and yields
51
+ * calib ≈ 0 instead of +frameDur. */
52
+ private lastPaintedPtsUs = 0;
53
+ private hasLastPaintedPts = false;
54
+ /** Audio-clock reading (ms) at the previous paint, for overlay Δaud. */
55
+ private lastPaintAudMs = 0;
42
56
  /** Wall-clock time of the last paint, in ms (performance.now()). */
43
57
  private lastPaintWall = 0;
44
58
  /** Minimum ms between paints — paces video at roughly source fps. */
@@ -163,13 +177,17 @@ export class VideoRenderer {
163
177
  }
164
178
 
165
179
  /**
166
- * Soft cap for decoder backpressure. The decoder pump throttles when
167
- * `queueDepth() >= queueHighWater`. Set high enough that normal decode
168
- * bursts don't trigger the renderer's overflow-drop loop (which runs at
169
- * every paint), but low enough that the decoder doesn't run unboundedly
170
- * ahead. The hard cap in `enqueue()` is 64.
180
+ * Cap the decoder may fill the queue up to. Used by the decoder's
181
+ * enqueue-side discard logic (it closes new frames instead of pushing
182
+ * them when this is reached). Sized so a long post-seek catch-up
183
+ * fits — the decoder produces frames at PTS T_kf onwards rapidly
184
+ * while the demuxer is chewing through pre-target audio; if the
185
+ * queue can hold the whole post-seek burst, the renderer plays
186
+ * smoothly from pre-roll without a frozen-video gap when audio.start
187
+ * fires. At ~340 KB per SD frame the cap is ~85 MB peak; at HD it's
188
+ * larger but still bounded.
171
189
  */
172
- readonly queueHighWater = 30;
190
+ readonly queueHighWater = 256;
173
191
 
174
192
  enqueue(frame: VideoFrame): void {
175
193
  if (this.destroyed) {
@@ -181,10 +199,12 @@ export class VideoRenderer {
181
199
  if (this.queue.length === 1 && this.framesPainted === 0) {
182
200
  this.resolveFirstFrame();
183
201
  }
184
- // Hard cap. Should rarely trigger because the decoder backs off at
185
- // queueHighWater (30) and the drift correction trims gently. This is
186
- // the last-resort defense against runaway producers.
187
- while (this.queue.length > 60) {
202
+ // Hard cap. The decoder's enqueue-side discard at `queueHighWater`
203
+ // is the primary defense; this `+8` margin is just safety for a
204
+ // racy producer. Drops the OLDEST frames, which during catch-up
205
+ // would mean losing the frames closest to the seek target — so the
206
+ // decoder should be tuned to never reach this.
207
+ while (this.queue.length > this.queueHighWater + 8) {
188
208
  this.queue.shift()?.close();
189
209
  this.framesDroppedOverflow++;
190
210
  }
@@ -283,14 +303,27 @@ export class VideoRenderer {
283
303
 
284
304
  const playing = this.clock.isPlaying();
285
305
 
286
- // Pre-roll: paint the very first frame as a poster while audio buffers.
306
+ // Pre-roll: paint the head frame ONCE as a poster while audio buffers.
307
+ //
308
+ // Safety invariant (load-bearing): with the decoder.ts content-clock
309
+ // fix (POSTMORTEMS 2026-06-01), pre-target frames are discarded at
310
+ // the decoder/enqueue boundary, so queue[0] here is guaranteed to be
311
+ // a near-target frame — never the keyframe-to-target preroll sequence
312
+ // that previously caused the post-seek fast-forward when painted.
313
+ //
314
+ // Paint at most ONE frame and hold it (gate via `prerolled`). Do NOT
315
+ // shift the queue: when audio unfreezes and `playing` becomes true,
316
+ // the regular PTS loop below will paint this same frame again and
317
+ // shift it out. That second paint is a no-op visually (same pixels)
318
+ // so there's no flicker.
319
+ //
320
+ // If the queue is empty (decoder still grinding through the post-seek
321
+ // discard window), just return — last pre-flush frame stays on canvas
322
+ // as the freeze poster, which is the safe fallback.
287
323
  if (!playing) {
288
- if (!this.prerolled) {
289
- const head = this.queue.shift()!;
290
- this.paint(head);
291
- head.close();
324
+ if (!this.prerolled && this.queue.length > 0) {
292
325
  this.prerolled = true;
293
- this.lastPaintWall = performance.now();
326
+ this.paint(this.queue[0]);
294
327
  }
295
328
  return;
296
329
  }
@@ -312,16 +345,81 @@ export class VideoRenderer {
312
345
  // plus a small rate drift (~7ms/s). We snap the offset on first paint
313
346
  // and re-snap every 10 seconds. Between snaps, max drift is ~70ms
314
347
  // (under 2 frames at 24fps, below lip-sync perception threshold).
348
+ //
349
+ // Two cases for the *first* snap after flush:
350
+ // Two candidate anchors for the *first* snap after flush:
351
+ // periodic 10s re-snap) drifts with the audio clock — including
352
+ // decode-stall lag accumulated between `audio.start()` and the
353
+ // first frame's arrival. On a slow seek where the first frame
354
+ // lands 1–2s after audio resumed, this captures the lag as a
355
+ // permanent offset and the video stays that far behind audio.
356
+ // - For the *first* snap post-flush we instead use the audio's
357
+ // **anchor time** (`mediaTimeOfAnchor`, == the seek target / 0
358
+ // on cold start). That gives `headTs − seekTarget` ≈ keyframe
359
+ // offset (usually < 100ms), independent of decode delay.
315
360
  const wallNow = performance.now();
316
- if (!this.ptsCalibrated || wallNow - this.lastCalibrationWall > 10_000) {
317
- this.ptsCalibrationUs = headTs - rawAudioNowUs;
361
+ // First snap after flush/cold-start anchors against the audio's
362
+ // *master-clock reference* (= `mediaTimeOfAnchor`, == the rebased
363
+ // audio first-chunk PTS), NOT `clock.now()`. `clock.now()` includes
364
+ // wall-clock-drifted elapsed time between `audio.start()` and the
365
+ // first paint — on a slow seek where the first frame lands 1-2 s
366
+ // after audio resumed, that decode delay gets baked into the
367
+ // calibration as a permanent video-lag offset. See POSTMORTEMS.md
368
+ // (2026-04-13). The periodic re-snap continues to use `rawAudioNow`
369
+ // as the original design intended — a stateless independent snap
370
+ // every 10 s bounds drift to ~70 ms at the documented ~7 ms/s rate,
371
+ // below the lip-sync perception threshold. Do *not* introduce a
372
+ // smoothed / EMA / bounded-delta variant here: the measured offset
373
+ // includes the current calibration, which produces a feedback loop
374
+ // (postmortem 2026-04-13, hypothesis 3).
375
+ if (!this.ptsCalibrated) {
376
+ const anchorUs = (this.clock.anchorTime?.() ?? this.clock.now()) * 1_000_000;
377
+ // Reference frame for calibration: prefer the pre-rolled frame's
378
+ // PTS over the queue head, since the pre-rolled frame is what the
379
+ // user is *actually looking at* the moment audio starts. The queue
380
+ // head at this point is the NEXT frame (PTS == prerolled + frameDur),
381
+ // and calibrating against it bakes that one-frame offset into the
382
+ // calibration permanently. With the painted-frame reference, calib
383
+ // ≈ 0 when video keyframe lands at the seek target.
384
+ const referencePtsUs = this.hasLastPaintedPts ? this.lastPaintedPtsUs : headTs;
385
+ this.ptsCalibrationUs = referencePtsUs - anchorUs;
318
386
  this.ptsCalibrated = true;
319
387
  this.lastCalibrationWall = wallNow;
388
+ if (isDebug()) {
389
+ // eslint-disable-next-line no-console
390
+ console.log(
391
+ `[avbridge:renderer] CALIB-FIRST audioAnchor=${(anchorUs / 1000).toFixed(1)}ms ` +
392
+ `prerolledPTS=${this.hasLastPaintedPts ? (this.lastPaintedPtsUs / 1000).toFixed(1) : "n/a"}ms ` +
393
+ `queueHeadPTS=${(headTs / 1000).toFixed(1)}ms ` +
394
+ `rawAudioNow=${(rawAudioNowUs / 1000).toFixed(1)}ms ` +
395
+ `→ calib=${(this.ptsCalibrationUs / 1000).toFixed(1)}ms`,
396
+ );
397
+ }
398
+ } else if (wallNow - this.lastCalibrationWall > 10_000) {
399
+ const oldCalib = this.ptsCalibrationUs;
400
+ this.ptsCalibrationUs = headTs - rawAudioNowUs;
401
+ this.lastCalibrationWall = wallNow;
402
+ if (isDebug()) {
403
+ // eslint-disable-next-line no-console
404
+ console.log(
405
+ `[avbridge:renderer] CALIB-RESNAP ` +
406
+ `headPTS=${(headTs / 1000).toFixed(1)}ms rawAudioNow=${(rawAudioNowUs / 1000).toFixed(1)}ms ` +
407
+ `calib ${(oldCalib / 1000).toFixed(1)}ms → ${(this.ptsCalibrationUs / 1000).toFixed(1)}ms ` +
408
+ `(Δ=${((this.ptsCalibrationUs - oldCalib) / 1000).toFixed(1)}ms after 10s)`,
409
+ );
410
+ }
320
411
  }
321
412
 
322
413
  const audioNowUs = rawAudioNowUs + this.ptsCalibrationUs;
323
- const frameDurationUs = this.paintIntervalMs * 1000;
324
- const deadlineUs = audioNowUs + frameDurationUs;
414
+ // Paint the frame whose PTS is at or just before audioNow. A frame
415
+ // at PTS P should be the displayed frame from the moment audio
416
+ // reaches P, *not* from P − frameDur. The previous code used
417
+ // `deadline = audioNow + frameDur`, which painted frames up to one
418
+ // source-frame ahead of audio — a steady ~40 ms video-leads-audio
419
+ // offset that the user perceived as "fast-forward then normal."
420
+ // With `deadline = audioNow`, paints land exactly at the frame's
421
+ // start of display interval; lip sync matches.
422
+ const deadlineUs = audioNowUs;
325
423
 
326
424
  let bestIdx = -1;
327
425
  for (let i = 0; i < this.queue.length; i++) {
@@ -353,29 +451,27 @@ export class VideoRenderer {
353
451
  return;
354
452
  }
355
453
 
356
- // Only drop frames that are more than 2 frame-durations behind.
357
- // Diagnostic escape hatch: `globalThis.AVBRIDGE_RELAX_DROP = true`
358
- // pushes the threshold so far back that frames are effectively
359
- // never dropped as late. The display will run behind the audio
360
- // clock but won't stutter from drop bursts. Useful for isolating
361
- // "is the problem decode throughput or drop policy?".
454
+ // Audio-sync skip: when `bestIdx > 0` there are multiple frames in
455
+ // the queue whose PTS ≤ deadline. Drop everything before `bestIdx`
456
+ // and paint the latest paintable frame. See POSTMORTEMS.md
457
+ // 2026-05-31 coda for the rationale.
362
458
  const _relaxDrop =
363
459
  (globalThis as { AVBRIDGE_RELAX_DROP?: boolean }).AVBRIDGE_RELAX_DROP === true;
364
- const dropThresholdUs = _relaxDrop
365
- ? audioNowUs - 60 * 1_000_000 /* 60 s */
366
- : audioNowUs - frameDurationUs * 2;
367
460
  let dropped = 0;
368
- while (bestIdx > 0) {
369
- const ts = this.queue[0].timestamp ?? 0;
370
- if (ts < dropThresholdUs) {
461
+ const initialBestIdx = bestIdx;
462
+ if (!_relaxDrop) {
463
+ while (bestIdx > 0) {
371
464
  this.queue.shift()?.close();
372
465
  this.framesDroppedLate++;
373
466
  bestIdx--;
374
467
  dropped++;
375
- } else {
376
- break;
377
468
  }
378
469
  }
470
+ const paintTs = this.queue[0]?.timestamp ?? 0;
471
+ if (isDebug()) {
472
+ // eslint-disable-next-line no-console
473
+ console.log(`[TRACE] PAINT bestIdx_initial=${initialBestIdx} dropped=${dropped} paintPts=${(paintTs / 1000).toFixed(1)}ms audioNow=${(audioNowUs / 1000).toFixed(1)}ms deadline=${(deadlineUs / 1000).toFixed(1)}ms queueLen=${this.queue.length} wall=${performance.now().toFixed(0)}`);
474
+ }
379
475
 
380
476
  this.ticksPainted++;
381
477
 
@@ -423,6 +519,51 @@ export class VideoRenderer {
423
519
  }
424
520
  try {
425
521
  this.ctx.drawImage(frame, 0, 0, this.canvas.width, this.canvas.height);
522
+
523
+ // Debug overlay (gated on AVBRIDGE_DEBUG). Draws frame info on top
524
+ // of the painted frame so the user can SEE what's actually
525
+ // displayed and at what rate. Three time domains:
526
+ // pts — source content time (from frame.timestamp)
527
+ // aud — audio media clock (clock.now() × 1000)
528
+ // wall — performance.now() (monotonic browser clock)
529
+ // Plus the per-paint deltas. If `Δpts > Δwall` sustained across
530
+ // multiple frames, that's real fast-forward; if it alternates
531
+ // 33/50ms on a 25fps source, that's 3:2 pulldown judder. (See
532
+ // POSTMORTEMS 2026-06-01 for why this overlay was load-bearing
533
+ // when diagnosing the post-seek fast-forward.)
534
+ if (isDebug()) {
535
+ const wallNow = performance.now();
536
+ const audNowMs = this.clock.now() * 1000;
537
+ const ptsMs = (frame.timestamp ?? 0) / 1000;
538
+ const dWall = this.lastPaintWall > 0 ? wallNow - this.lastPaintWall : 0;
539
+ const dAud = this.lastPaintAudMs > 0 ? audNowMs - this.lastPaintAudMs : 0;
540
+ const dPts = this.hasLastPaintedPts ? ptsMs - this.lastPaintedPtsUs / 1000 : 0;
541
+ this.ctx.save();
542
+ this.ctx.font = "bold 18px monospace";
543
+ const lines = [
544
+ `#${this.framesPainted + 1} pts=${ptsMs.toFixed(0)} aud=${audNowMs.toFixed(0)} wall=${wallNow.toFixed(0)}`,
545
+ `Δpts=${dPts.toFixed(0)} Δaud=${dAud.toFixed(0)} Δwall=${dWall.toFixed(0)}`,
546
+ ];
547
+ const lineHeight = 22;
548
+ const padTop = 6;
549
+ const stripH = padTop + lineHeight * lines.length;
550
+ this.ctx.fillStyle = "rgba(0,0,0,0.7)";
551
+ this.ctx.fillRect(0, 0, this.canvas.width, stripH);
552
+ this.ctx.fillStyle = "#0f0";
553
+ for (let i = 0; i < lines.length; i++) {
554
+ this.ctx.fillText(lines[i], 8, padTop + lineHeight * (i + 1) - 4);
555
+ }
556
+ this.ctx.restore();
557
+ }
558
+
559
+ // Record the just-painted frame's PTS so the next paint's overlay
560
+ // Δpts and the next CALIB-FIRST snap have a reference. Must run
561
+ // unconditionally — `hasLastPaintedPts`/`lastPaintedPtsUs` are read
562
+ // by the calibration path in tick() too, not just the overlay.
563
+ this.lastPaintedPtsUs = frame.timestamp ?? 0;
564
+ this.hasLastPaintedPts = true;
565
+ this.lastPaintAudMs = this.clock.now() * 1000;
566
+
426
567
  this.framesPainted++;
427
568
  } catch (err) {
428
569
  // Log only once so a structurally broken frame format doesn't spam
@@ -439,6 +580,7 @@ export class VideoRenderer {
439
580
  const count = this.queue.length;
440
581
  while (this.queue.length > 0) this.queue.shift()?.close();
441
582
  this.prerolled = false;
583
+ this.hasLastPaintedPts = false; // calibration ref doesn't carry across seek
442
584
  this.ptsCalibrated = false; // recalibrate at new seek position
443
585
  this.hasEverEnqueuedSinceFlush = false; // so waitForBuffer() waits for post-flush frames
444
586
  if (isDebug() && count > 0) {
@@ -22,7 +22,6 @@ import { dbg } from "../../util/debug.js";
22
22
  import { pickLibavVariant } from "../fallback/variant-routing.js";
23
23
  import {
24
24
  sanitizePacketTimestamp,
25
- sanitizeFrameTimestamp,
26
25
  libavFrameToInterleavedFloat32,
27
26
  packetPtsSec,
28
27
  } from "../../util/libav-demux.js";
@@ -248,8 +247,9 @@ export async function startHybridDecoder(opts: StartHybridDecoderOptions): Promi
248
247
  let videoChunksFed = 0;
249
248
  let bufferedUntilSec = 0;
250
249
 
250
+ // Synthetic video timestamp for packets with AV_NOPTS_VALUE (audio
251
+ // uses the packet PTS directly — see decodeAudioBatch).
251
252
  let syntheticVideoUs = 0;
252
- let syntheticAudioUs = 0;
253
253
 
254
254
  const videoTrackInfo = opts.context.videoTracks.find((t) => t.id === videoStream?.index);
255
255
  const videoFps = videoTrackInfo?.fps && videoTrackInfo.fps > 0 ? videoTrackInfo.fps : 30;
@@ -300,7 +300,7 @@ export async function startHybridDecoder(opts: StartHybridDecoderOptions): Promi
300
300
  // 10-50 ms. Processing audio first ensures the audio scheduler is
301
301
  // fed before video decode starts, reducing perceived stutter.
302
302
  if (audioDec && audioPackets && audioPackets.length > 0) {
303
- await decodeAudioBatch(audioPackets, myToken);
303
+ await decodeAudioBatch(audioPackets, myToken, /*flush*/ false, audioTimeBase);
304
304
  }
305
305
  if (myToken !== pumpToken || destroyed) return;
306
306
 
@@ -363,9 +363,23 @@ export async function startHybridDecoder(opts: StartHybridDecoderOptions): Promi
363
363
  }
364
364
  }
365
365
 
366
- async function decodeAudioBatch(pkts: LibavPacket[], myToken: number, flush = false) {
366
+ async function decodeAudioBatch(
367
+ pkts: LibavPacket[],
368
+ myToken: number,
369
+ flush = false,
370
+ tb?: [number, number],
371
+ ) {
367
372
  if (!audioDec || destroyed || myToken !== pumpToken) return;
368
373
 
374
+ // Capture packet-level PTS before decode (same rationale as fallback
375
+ // decoder — see POSTMORTEMS.md 2026-05-31: libav's reported
376
+ // `frame.pts` is unreliable for some container/codec combinations;
377
+ // the demuxer's packet PTS is reliable). For mp3/aac the packet→frame
378
+ // mapping is 1:1, so the PTS array aligns with `allFrames`.
379
+ const pktPtsSec: (number | null)[] = pkts.map((p) =>
380
+ tb ? packetPtsSec(p, tb) : null,
381
+ );
382
+
369
383
  // For heavy codecs (DTS, AC3), decode in small sub-batches and yield
370
384
  // between them so the event loop can run rAF for video painting.
371
385
  // Each ff_decode_multi call is a blocking WASM invocation.
@@ -409,22 +423,13 @@ export async function startHybridDecoder(opts: StartHybridDecoderOptions): Promi
409
423
  if (myToken !== pumpToken || destroyed) return;
410
424
  const frames = allFrames;
411
425
 
412
- for (const f of frames) {
426
+ for (let i = 0; i < frames.length; i++) {
413
427
  if (myToken !== pumpToken || destroyed) return;
414
- sanitizeFrameTimestamp(
415
- f,
416
- () => {
417
- const ts = syntheticAudioUs;
418
- const samples = f.nb_samples ?? 1024;
419
- const sampleRate = f.sample_rate ?? 44100;
420
- syntheticAudioUs += Math.round((samples * 1_000_000) / sampleRate);
421
- return ts;
422
- },
423
- audioTimeBase,
424
- );
428
+ const f = frames[i];
425
429
  const samples = libavFrameToInterleavedFloat32(f);
426
430
  if (samples) {
427
- opts.audio.schedule(samples.data, samples.channels, samples.sampleRate);
431
+ const pts = pktPtsSec[i] ?? null;
432
+ opts.audio.schedule(samples.data, samples.channels, samples.sampleRate, pts);
428
433
  audioFramesDecoded++;
429
434
  }
430
435
  }
@@ -522,7 +527,6 @@ export async function startHybridDecoder(opts: StartHybridDecoderOptions): Promi
522
527
  await flushBSF();
523
528
 
524
529
  syntheticVideoUs = Math.round(timeSec * 1_000_000);
525
- syntheticAudioUs = Math.round(timeSec * 1_000_000);
526
530
 
527
531
  pumpRunning = pumpLoop(newToken).catch((err) =>
528
532
  console.error("[avbridge] hybrid pump failed (post-setAudioTrack):", err),
@@ -566,7 +570,6 @@ export async function startHybridDecoder(opts: StartHybridDecoderOptions): Promi
566
570
  await flushBSF();
567
571
 
568
572
  syntheticVideoUs = Math.round(timeSec * 1_000_000);
569
- syntheticAudioUs = Math.round(timeSec * 1_000_000);
570
573
 
571
574
  pumpRunning = pumpLoop(newToken).catch((err) =>
572
575
  console.error("[avbridge] hybrid pump failed (post-seek):", err),
@@ -129,6 +129,13 @@ export async function createRemuxPipeline(
129
129
  }
130
130
 
131
131
  let mimePromise: Promise<string> | null = null;
132
+ // Capture the active pump token at the moment this output was created.
133
+ // A subsequent seek bumps `pumpToken`, and any in-flight write from this
134
+ // (now-stale) output must drop its chunk instead of appending to the
135
+ // SourceBuffer — otherwise stale fragments land at their original
136
+ // timestamps, the deferred seek applies against the wrong buffered
137
+ // range, and the video snaps to the end of the stale range.
138
+ const myToken = pumpToken;
132
139
 
133
140
  const writable = new WritableStream<{
134
141
  type: "write";
@@ -136,11 +143,13 @@ export async function createRemuxPipeline(
136
143
  position: number;
137
144
  }>({
138
145
  write: async (chunk) => {
139
- if (destroyed) return;
146
+ if (destroyed || pumpToken !== myToken) return;
140
147
  if (!sink) {
141
148
  const mime = await (mimePromise ??= output.getMimeType());
149
+ if (destroyed || pumpToken !== myToken) return;
142
150
  sink = new MseSink({ mime, video });
143
151
  await sink.ready();
152
+ if (destroyed || pumpToken !== myToken) return;
144
153
  // Apply deferred seek + autoPlay for the initial start.
145
154
  if (pendingStartTime > 0) {
146
155
  sink.invalidate(pendingStartTime);
@@ -148,10 +157,10 @@ export async function createRemuxPipeline(
148
157
  sink.setPlayOnSeek(pendingAutoPlay);
149
158
  }
150
159
  // Backpressure: wait for the SourceBuffer append queue to drain.
151
- while (sink && !destroyed && (sink.queueLength() > 10 || sink.bufferedAhead() > 60 || sink.totalBuffered() > 120)) {
160
+ while (sink && !destroyed && pumpToken === myToken && (sink.queueLength() > 10 || sink.bufferedAhead() > 60 || sink.totalBuffered() > 120)) {
152
161
  await new Promise((r) => setTimeout(r, 500));
153
162
  }
154
- if (destroyed) return;
163
+ if (destroyed || pumpToken !== myToken) return;
155
164
  sink.append(chunk.data);
156
165
  stats.bytesWritten += chunk.data.byteLength;
157
166
  stats.fragments++;