@invintusmedia/tomp4 1.2.1 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,493 @@
1
+ /**
2
+ * fMP4 Fragment Muxer
3
+ *
4
+ * Creates CMAF-compliant fMP4 init segments and media fragments
5
+ * from parsed TS data (NAL units + AAC frames). Used by the HLS
6
+ * clipper to produce frame-accurate fMP4/CMAF output.
7
+ *
8
+ * @module muxers/fmp4
9
+ */
10
+
11
+ import { createBox } from '../fmp4/utils.js';
12
+ import { parseSPS } from './mp4.js';
13
+
14
+ // ── helpers ───────────────────────────────────────────────
15
+
16
/**
 * Wrap payloads in an ISO BMFF "full box": a 1-byte version plus a
 * 24-bit flags field, followed by the payloads, inside a createBox()
 * container.
 *
 * @param {string} type - Four-character box type code
 * @param {number} version - Full-box version byte
 * @param {number} flags - 24-bit flags value
 * @param {...Uint8Array} payloads - Payload chunks appended in order
 * @returns {Uint8Array}
 */
function createFullBox(type, version, flags, ...payloads) {
  const verFlags = new Uint8Array([
    version,
    (flags >> 16) & 0xFF,
    (flags >> 8) & 0xFF,
    flags & 0xFF,
  ]);
  return createBox(type, verFlags, ...payloads);
}
24
+
25
/**
 * Encode a string as a Uint8Array of per-character code units
 * (intended for ASCII box types and brand codes).
 *
 * @param {string} s
 * @returns {Uint8Array}
 */
function strToBytes(s) {
  return Uint8Array.from(s, (ch) => ch.charCodeAt(0));
}
28
+
29
+ // ── init segment ──────────────────────────────────────────
30
+
31
/**
 * Create a CMAF ftyp box.
 *
 * Layout: major_brand (4 bytes) + minor_version (4 bytes, big-endian)
 * + compatible_brands (4 bytes each).
 *
 * @returns {Uint8Array}
 */
export function createCmafFtyp() {
  const data = new Uint8Array(16);
  // major brand: isom
  data.set(strToBytes('isom'), 0);
  // minor version: 0x200 — big-endian bytes 4..7 are 00 00 02 00, so the
  // 0x02 belongs in byte 6. (Previous code wrote byte 7, producing 0x2.)
  data[6] = 0x02;
  // compatible brands: isom, iso6
  data.set(strToBytes('isom'), 8);
  data.set(strToBytes('iso6'), 12);
  return createBox('ftyp', data);
}
46
+
47
/**
 * Build an avcC (AVCDecoderConfigurationRecord) box from SPS and PPS
 * NAL units. Profile/compatibility/level bytes are copied from the SPS.
 *
 * @param {Uint8Array} sps - H.264 SPS NAL unit (including NAL header byte)
 * @param {Uint8Array} pps - H.264 PPS NAL unit
 * @returns {Uint8Array}
 */
function buildAvcC(sps, pps) {
  const body = new Uint8Array(11 + sps.length + pps.length);
  const dv = new DataView(body.buffer);
  let pos = 0;
  body[pos++] = 1;        // configurationVersion
  body[pos++] = sps[1];   // AVCProfileIndication
  body[pos++] = sps[2];   // profile_compatibility
  body[pos++] = sps[3];   // AVCLevelIndication
  body[pos++] = 0xFF;     // lengthSizeMinusOne = 3 (4-byte NAL lengths)
  body[pos++] = 0xE1;     // numOfSequenceParameterSets = 1
  dv.setUint16(pos, sps.length);
  pos += 2;
  body.set(sps, pos);
  pos += sps.length;
  body[pos++] = 1;        // numOfPictureParameterSets = 1
  dv.setUint16(pos, pps.length);
  pos += 2;
  body.set(pps, pos);
  return createBox('avcC', body);
}
64
+
65
/**
 * Build an esds box for AAC-LC audio.
 *
 * The descriptor chain is a fixed 31-byte template (ES_Descriptor →
 * DecoderConfigDescriptor → DecoderSpecificInfo → SLConfigDescriptor);
 * only the two AudioSpecificConfig bytes vary with the input.
 *
 * @param {number} sampleRate - Sampling rate in Hz (unknown rates fall back to 44100's index)
 * @param {number} channels - Channel count
 * @returns {Uint8Array}
 */
function buildEsds(sampleRate, channels) {
  // ISO/IEC 14496-3 sampling-frequency index table (position = index)
  const RATES = [
    96000, 88200, 64000, 48000, 44100, 32000,
    24000, 22050, 16000, 12000, 11025, 8000, 7350,
  ];
  let freqIndex = RATES.indexOf(sampleRate);
  if (freqIndex === -1) freqIndex = 4; // default: 44100

  // AudioSpecificConfig: AOT=2 (AAC LC) | freq index | channel config
  const asc = ((2 << 11) | (freqIndex << 7) | (channels << 3)) & 0xFFFF;

  const payload = new Uint8Array([
    0x00, 0x00, 0x00, 0x00,             // full-box version + flags
    0x03, 0x19, 0x00, 0x02, 0x00,       // ES_Descriptor (len 0x19), ES_ID=2
    0x04, 0x11, 0x40, 0x15,             // DecoderConfig (len 0x11), AAC, AudioStream
    0x00, 0x00, 0x00,                   // bufferSizeDB (24-bit)
    0x00, 0x01, 0xF4, 0x00,             // maxBitrate = 128000
    0x00, 0x01, 0xF4, 0x00,             // avgBitrate = 128000
    0x05, 0x02,                         // DecoderSpecificInfo, 2 bytes
    (asc >> 8) & 0xFF, asc & 0xFF,      // AudioSpecificConfig
    0x06, 0x01, 0x02,                   // SLConfigDescriptor
  ]);
  return createBox('esds', payload);
}
89
+
90
/**
 * Create a CMAF init segment (ftyp + moov) from codec parameters.
 *
 * The moov contains empty sample tables plus an mvex box; all real
 * sample data arrives later via moof/mdat fragments (see createFragment).
 *
 * @param {object} codecInfo
 * @param {Uint8Array} codecInfo.sps - H.264 SPS NAL unit
 * @param {Uint8Array} codecInfo.pps - H.264 PPS NAL unit
 * @param {number} [codecInfo.audioSampleRate=48000]
 * @param {number} [codecInfo.audioChannels=2]
 * @param {boolean} [codecInfo.hasAudio=true]
 * @param {number} [codecInfo.videoTimescale=90000]
 * @param {number} [codecInfo.audioTimescale] - defaults to audioSampleRate
 * @returns {Uint8Array} ftyp bytes immediately followed by moov bytes
 */
export function createInitSegment(codecInfo) {
  const {
    sps, pps,
    audioSampleRate = 48000,
    audioChannels = 2,
    hasAudio = true,
    videoTimescale = 90000,
  } = codecInfo;
  // `||` (not `??`): an explicit audioTimescale of 0 also falls back
  const audioTimescale = codecInfo.audioTimescale || audioSampleRate;
  // Video dimensions are decoded from the SPS itself
  const { width, height } = parseSPS(sps);

  const VIDEO_TRACK_ID = 1;
  const AUDIO_TRACK_ID = 2;

  // ── mvhd ── version-0 payload is 96 bytes; duration stays 0 (fragmented)
  const mvhdData = new Uint8Array(96);
  const mvhdView = new DataView(mvhdData.buffer);
  mvhdView.setUint32(8, 1000); // timescale
  mvhdView.setUint32(16, 0x00010000); // rate = 1.0 (16.16 fixed point)
  mvhdView.setUint16(20, 0x0100); // volume = 1.0 (8.8 fixed point)
  mvhdView.setUint32(32, 0x00010000); // matrix: identity diagonal
  mvhdView.setUint32(48, 0x00010000);
  mvhdView.setUint32(64, 0x40000000);
  mvhdView.setUint32(92, hasAudio ? 3 : 2); // next_track_ID
  const mvhd = createFullBox('mvhd', 0, 0, mvhdData);

  // ── video trak ── avc1 sample entry with embedded avcC record
  const videoTrak = buildInitTrak(VIDEO_TRACK_ID, 'vide', videoTimescale, width, height, () => {
    const avcC = buildAvcC(sps, pps);
    // 78 bytes = VisualSampleEntry fixed fields; avcC is appended after
    const avc1Data = new Uint8Array(78 + avcC.byteLength);
    const v = new DataView(avc1Data.buffer);
    // data_reference_index=1 @6; width @24; height @26
    v.setUint16(6, 1); v.setUint16(24, width); v.setUint16(26, height);
    // horiz/vert resolution = 72 dpi (16.16 fixed point)
    v.setUint32(28, 0x00480000); v.setUint32(32, 0x00480000);
    // frame_count=1 @40; depth=24 @74; pre_defined=-1 @76
    v.setUint16(40, 1); v.setUint16(74, 0x0018); v.setInt16(76, -1);
    avc1Data.set(avcC, 78);
    return createBox('avc1', avc1Data);
  });

  // ── audio trak ── mp4a sample entry with embedded esds
  let audioTrak = null;
  if (hasAudio) {
    audioTrak = buildInitTrak(AUDIO_TRACK_ID, 'soun', audioTimescale, 0, 0, () => {
      const esds = buildEsds(audioSampleRate, audioChannels);
      // 28 bytes = AudioSampleEntry fixed fields; esds appended after
      const mp4aData = new Uint8Array(28 + esds.byteLength);
      const v = new DataView(mp4aData.buffer);
      // data_reference_index=1 @6; channelcount @16; samplesize=16 bits @18
      v.setUint16(6, 1); v.setUint16(16, audioChannels); v.setUint16(18, 16);
      v.setUint32(24, audioTimescale << 16); // samplerate as 16.16 fixed point
      mp4aData.set(esds, 28);
      return createBox('mp4a', mp4aData);
    });
  }

  // ── mvex (track extends for fragmented mode) ──
  const mvexParts = [buildTrex(VIDEO_TRACK_ID)];
  if (hasAudio) mvexParts.push(buildTrex(AUDIO_TRACK_ID));
  const mvex = createBox('mvex', ...mvexParts);

  // ── assemble moov: mvhd, trak(s), then mvex last ──
  const moovParts = [mvhd, videoTrak];
  if (audioTrak) moovParts.push(audioTrak);
  moovParts.push(mvex);
  const moov = createBox('moov', ...moovParts);

  // Concatenate ftyp + moov into the final init segment
  const ftyp = createCmafFtyp();
  const result = new Uint8Array(ftyp.byteLength + moov.byteLength);
  result.set(ftyp, 0);
  result.set(moov, ftyp.byteLength);
  return result;
}
172
+
173
/**
 * Build a trak box for the init segment (empty sample tables).
 *
 * @param {number} trackId - Track ID written into tkhd
 * @param {string} handlerType - 'vide' or 'soun' (selects vmhd vs smhd)
 * @param {number} timescale - Media timescale written into mdhd
 * @param {number} width - Track width in pixels (0 for audio)
 * @param {number} height - Track height in pixels (0 for audio)
 * @param {Function} buildSampleEntry - Returns the stsd sample entry box
 * @returns {Uint8Array} Complete trak box
 */
function buildInitTrak(trackId, handlerType, timescale, width, height, buildSampleEntry) {
  // tkhd — version-0 payload is 80 bytes; flags=3 (enabled | in movie)
  const tkhdData = new Uint8Array(80);
  const tkhdView = new DataView(tkhdData.buffer);
  tkhdView.setUint32(8, trackId);
  tkhdView.setUint32(36, 0x00010000); // matrix: identity diagonal
  tkhdView.setUint32(52, 0x00010000);
  tkhdView.setUint32(68, 0x40000000);
  if (width && height) {
    // width/height as 16.16 fixed point at offsets 72/76
    tkhdView.setUint32(72, width << 16);
    tkhdView.setUint32(76, height << 16);
  }
  if (handlerType === 'soun') tkhdView.setUint16(32, 0x0100); // volume = 1.0
  const tkhd = createFullBox('tkhd', 0, 3, tkhdData);

  // mdhd — timescale at offset 8; duration left 0 (fragmented)
  const mdhdData = new Uint8Array(20);
  new DataView(mdhdData.buffer).setUint32(8, timescale);
  mdhdData[16] = 0x55; mdhdData[17] = 0xC4; // packed language: und
  const mdhd = createFullBox('mdhd', 0, 0, mdhdData);

  // hdlr — handler_type at offset 4; trailing byte is the empty NUL name
  const hdlrData = new Uint8Array(21);
  hdlrData.set(strToBytes(handlerType), 4);
  const hdlr = createFullBox('hdlr', 0, 0, hdlrData);

  // media header: vmhd (flags=1 per spec) for video, smhd for audio
  const xmhd = handlerType === 'vide'
    ? createFullBox('vmhd', 0, 1, new Uint8Array(8))
    : createFullBox('smhd', 0, 0, new Uint8Array(4));

  // dinf — single self-contained 'url ' entry (flags=1: data in this file)
  const urlBox = createFullBox('url ', 0, 1, new Uint8Array(0));
  const dref = createFullBox('dref', 0, 0, new Uint8Array([0, 0, 0, 1]), urlBox);
  const dinf = createBox('dinf', dref);

  // stbl — stsd holds the one sample entry; all other tables are empty
  // because samples are described per-fragment in trun boxes
  const sampleEntry = buildSampleEntry();
  const stsdHeader = new Uint8Array(4);
  new DataView(stsdHeader.buffer).setUint32(0, 1); // entry_count = 1
  const stsd = createFullBox('stsd', 0, 0, stsdHeader, sampleEntry);

  const emptyStts = createFullBox('stts', 0, 0, new Uint8Array(4));
  const emptyStsc = createFullBox('stsc', 0, 0, new Uint8Array(4));
  const emptyStsz = createFullBox('stsz', 0, 0, new Uint8Array(8));
  const emptyStco = createFullBox('stco', 0, 0, new Uint8Array(4));
  const stbl = createBox('stbl', stsd, emptyStts, emptyStsc, emptyStsz, emptyStco);

  // Assemble the hierarchy: trak > mdia > minf > (xmhd, dinf, stbl)
  const minf = createBox('minf', xmhd, dinf, stbl);
  const mdia = createBox('mdia', mdhd, hdlr, minf);
  return createBox('trak', tkhd, mdia);
}
228
+
229
/**
 * Build a trex (track extends) box for mvex.
 *
 * Only track_ID and default_sample_description_index are set; the
 * default duration/size/flags fields remain zero because every fragment
 * carries per-sample values in its trun.
 *
 * @param {number} trackId
 * @returns {Uint8Array}
 */
function buildTrex(trackId) {
  const payload = new Uint8Array(20);
  const dv = new DataView(payload.buffer);
  dv.setUint32(0, trackId);  // track_ID
  dv.setUint32(4, 1);        // default_sample_description_index
  return createFullBox('trex', 0, 0, payload);
}
239
+
240
+ // ── media fragments ───────────────────────────────────────
241
+
242
/**
 * Create an fMP4 media fragment (moof + mdat) from video and audio samples.
 *
 * Video samples: array of { nalUnits: Uint8Array[], pts: number, dts: number }
 * (pts/dts in 90kHz ticks, same as TSParser output)
 *
 * Audio samples: array of { data: Uint8Array, pts: number }
 * (pts in 90kHz ticks)
 *
 * @param {object} opts
 * @param {Array} opts.videoSamples - Video access units
 * @param {Array} [opts.audioSamples] - Audio access units
 * @param {number} opts.sequenceNumber - Fragment sequence (1-based)
 * @param {number} opts.videoTimescale - Video timescale (typically 90000)
 * @param {number} [opts.audioTimescale=48000] - Audio timescale
 * @param {number} [opts.videoBaseTime=0] - Video base decode time (in videoTimescale ticks)
 * @param {number} [opts.audioBaseTime=0] - Audio base decode time (in audioTimescale ticks)
 * @param {number} [opts.audioSampleDuration=1024] - AAC frame duration in audio timescale
 * @returns {Uint8Array} moof + mdat
 */
export function createFragment(opts) {
  const {
    videoSamples,
    audioSamples = [],
    sequenceNumber = 1,
    // NOTE(review): videoTimescale/audioTimescale are destructured but not
    // read in this function body — durations arrive pre-scaled by callers.
    videoTimescale = 90000,
    audioTimescale = 48000,
    videoBaseTime = 0,
    audioBaseTime = 0,
    audioSampleDuration = 1024,
  } = opts;

  const VIDEO_TRACK_ID = 1;
  const AUDIO_TRACK_ID = 2;

  // ── build video sample data (AVCC format) + metadata ──
  // Each access unit becomes one sample: every NAL unit is converted from
  // Annex-B style to a 4-byte length prefix followed by the NAL bytes.
  const videoChunks = [];
  const videoMeta = [];
  for (let i = 0; i < videoSamples.length; i++) {
    const au = videoSamples[i];
    let sampleSize = 0;
    const parts = [];
    for (const nalUnit of au.nalUnits) {
      const prefixed = new Uint8Array(4 + nalUnit.length);
      new DataView(prefixed.buffer).setUint32(0, nalUnit.length);
      prefixed.set(nalUnit, 4);
      parts.push(prefixed);
      sampleSize += prefixed.length;
    }
    videoChunks.push(parts);

    // Detect keyframe (IDR NAL type 5 in the low 5 bits of the NAL header)
    let isKeyframe = false;
    for (const nalUnit of au.nalUnits) {
      if ((nalUnit[0] & 0x1F) === 5) { isKeyframe = true; break; }
    }

    // Duration = DTS delta to the next sample; the last sample reuses the
    // previous duration, or 3003 (≈ one frame at 29.97fps in 90kHz) if alone
    const duration = i < videoSamples.length - 1
      ? videoSamples[i + 1].dts - au.dts
      : (videoMeta.length > 0 ? videoMeta[videoMeta.length - 1].duration : 3003);
    const compositionTimeOffset = au.pts - au.dts;

    videoMeta.push({
      size: sampleSize,
      duration,
      // 0x02000000: depends-on-nothing (sync); 0x01010000: depends + non-sync
      flags: isKeyframe ? 0x02000000 : 0x01010000,
      compositionTimeOffset,
    });
  }

  // ── build audio sample data + metadata ──
  // AAC frames are copied as-is; only their sizes are needed for the trun
  const audioChunks = [];
  const audioMeta = [];
  for (const frame of audioSamples) {
    audioChunks.push(frame.data);
    audioMeta.push({ size: frame.data.length });
  }

  // ── compute total mdat content sizes ──
  let videoDataSize = 0;
  for (const parts of videoChunks) for (const p of parts) videoDataSize += p.length;
  let audioDataSize = 0;
  for (const d of audioChunks) audioDataSize += d.length;

  // ── build trafs ──
  // We need to know the moof size to set trun data_offset. Two-pass:
  // 1. Build trafs with placeholder data_offset
  // 2. Measure moof size
  // 3. Patch data_offsets

  const videoTraf = buildTraf(VIDEO_TRACK_ID, videoBaseTime, videoMeta, true);
  const audioTraf = audioMeta.length > 0
    ? buildTraf(AUDIO_TRACK_ID, audioBaseTime, audioMeta, false, audioSampleDuration)
    : null;

  // mfhd — fragment sequence number
  const mfhdData = new Uint8Array(4);
  new DataView(mfhdData.buffer).setUint32(0, sequenceNumber);
  const mfhd = createFullBox('mfhd', 0, 0, mfhdData);

  // Assemble moof (with placeholder offsets)
  const moofParts = [mfhd, videoTraf];
  if (audioTraf) moofParts.push(audioTraf);
  const moof = createBox('moof', ...moofParts);

  // ── build mdat ── video bytes first, then audio bytes
  const mdatContentSize = videoDataSize + audioDataSize;
  const mdatHeaderSize = 8;
  const mdatTotal = mdatHeaderSize + mdatContentSize;
  const mdat = new Uint8Array(mdatTotal);
  new DataView(mdat.buffer).setUint32(0, mdatTotal);
  mdat[4] = 'm'.charCodeAt(0); mdat[5] = 'd'.charCodeAt(0);
  mdat[6] = 'a'.charCodeAt(0); mdat[7] = 't'.charCodeAt(0);

  let writeOffset = mdatHeaderSize;
  for (const parts of videoChunks) {
    for (const p of parts) { mdat.set(p, writeOffset); writeOffset += p.length; }
  }
  for (const d of audioChunks) { mdat.set(d, writeOffset); writeOffset += d.length; }

  // ── patch trun data_offsets in moof ──
  // data_offset = byte distance from moof start to the track's data in mdat
  // (valid because tfhd uses default-base-is-moof — see buildTraf)
  const videoDataOffset = moof.byteLength + mdatHeaderSize;
  const audioDataOffset = videoDataOffset + videoDataSize;
  patchTrunDataOffset(moof, VIDEO_TRACK_ID, videoDataOffset);
  if (audioTraf) patchTrunDataOffset(moof, AUDIO_TRACK_ID, audioDataOffset);

  // ── combine moof + mdat into the final fragment ──
  const result = new Uint8Array(moof.byteLength + mdat.byteLength);
  result.set(moof, 0);
  result.set(mdat, moof.byteLength);
  return result;
}
375
+
376
/**
 * Build a traf box for one track: tfhd + tfdt + trun.
 *
 * @param {number} trackId - Track ID written into tfhd
 * @param {number} baseDecodeTime - Base media decode time (track timescale ticks)
 * @param {Array} sampleMeta - Per-sample metadata for the trun
 * @param {boolean} isVideo - Selects video vs audio trun layout
 * @param {number} [defaultDuration=0] - Per-sample duration for audio truns
 * @returns {Uint8Array}
 */
function buildTraf(trackId, baseDecodeTime, sampleMeta, isVideo, defaultDuration = 0) {
  // tfhd: flags 0x020000 (default-base-is-moof) so trun data_offsets are
  // measured from the start of the moof; payload is just the track_ID
  const tfhdPayload = new Uint8Array(4);
  new DataView(tfhdPayload.buffer).setUint32(0, trackId);
  const tfhd = createFullBox('tfhd', 0, 0x020000, tfhdPayload);

  // tfdt version 1: 64-bit baseMediaDecodeTime split into two 32-bit words
  const tfdtPayload = new Uint8Array(8);
  const dv = new DataView(tfdtPayload.buffer);
  dv.setUint32(0, (baseDecodeTime / 0x100000000) >>> 0); // high 32 bits
  dv.setUint32(4, baseDecodeTime >>> 0);                 // low 32 bits
  const tfdt = createFullBox('tfdt', 1, 0, tfdtPayload);

  // trun layout differs per media type (see buildVideoTrun/buildAudioTrun)
  const trun = isVideo
    ? buildVideoTrun(sampleMeta)
    : buildAudioTrun(sampleMeta, defaultDuration);

  return createBox('traf', tfhd, tfdt, trun);
}
401
+
402
/**
 * Build a video trun with per-sample duration, size, flags, and
 * composition time offset.
 *
 * @param {Array<{duration:number,size:number,flags:number,compositionTimeOffset:number}>} samples
 * @returns {Uint8Array}
 */
function buildVideoTrun(samples) {
  // data-offset-present | duration | size | flags | composition-time-offset
  const TRUN_FLAGS = 0x000001 | 0x000100 | 0x000200 | 0x000400 | 0x000800;
  const HEADER = 8;      // sample_count(4) + data_offset(4)
  const PER_SAMPLE = 16; // duration(4) + size(4) + flags(4) + CTO(4)

  const payload = new Uint8Array(HEADER + samples.length * PER_SAMPLE);
  const view = new DataView(payload.buffer);
  view.setUint32(0, samples.length); // sample_count
  view.setInt32(4, 0);               // data_offset placeholder, patched later

  samples.forEach((sample, i) => {
    const base = HEADER + i * PER_SAMPLE;
    view.setUint32(base, sample.duration);
    view.setUint32(base + 4, sample.size);
    view.setUint32(base + 8, sample.flags);
    view.setInt32(base + 12, sample.compositionTimeOffset);
  });

  return createFullBox('trun', 0, TRUN_FLAGS, payload);
}
426
+
427
/**
 * Build an audio trun with per-sample size; every sample is written with
 * the same duration (the caller-supplied AAC frame duration).
 *
 * @param {Array<{size:number}>} samples
 * @param {number} defaultDuration - Duration written for every sample
 * @returns {Uint8Array}
 */
function buildAudioTrun(samples, defaultDuration) {
  // data-offset-present | duration | size
  const TRUN_FLAGS = 0x000001 | 0x000100 | 0x000200;
  const HEADER = 8;     // sample_count(4) + data_offset(4)
  const PER_SAMPLE = 8; // duration(4) + size(4)

  const payload = new Uint8Array(HEADER + samples.length * PER_SAMPLE);
  const view = new DataView(payload.buffer);
  view.setUint32(0, samples.length); // sample_count
  view.setInt32(4, 0);               // data_offset placeholder, patched later

  samples.forEach((sample, i) => {
    const base = HEADER + i * PER_SAMPLE;
    view.setUint32(base, defaultDuration);
    view.setUint32(base + 4, sample.size);
  });

  return createFullBox('trun', 0, TRUN_FLAGS, payload);
}
449
+
450
/**
 * Patch the data_offset in a trun box within a moof.
 *
 * Walks the moof's child boxes looking for a traf whose tfhd carries the
 * target track ID, then writes dataOffset into that traf's trun. Does
 * nothing if no matching traf/trun is found.
 *
 * @param {Uint8Array} moof - Complete moof box (modified in place)
 * @param {number} targetTrackId - Track ID to match against tfhd
 * @param {number} dataOffset - Offset from moof start to the track's mdat data
 */
function patchTrunDataOffset(moof, targetTrackId, dataOffset) {
  const view = new DataView(moof.buffer, moof.byteOffset, moof.byteLength);
  const typeAt = (p) =>
    String.fromCharCode(moof[p + 4], moof[p + 5], moof[p + 6], moof[p + 7]);

  let pos = 8; // skip the moof box header itself
  while (pos + 8 < moof.byteLength) {
    const boxSize = view.getUint32(pos);
    if (boxSize < 8) break; // malformed size — stop scanning

    if (typeAt(pos) === 'traf') {
      const trafEnd = pos + boxSize;
      let cursor = pos + 8;
      let trackMatches = false;

      while (cursor + 8 < trafEnd) {
        const childSize = view.getUint32(cursor);
        if (childSize < 8) break;
        const childType = typeAt(cursor);

        if (childType === 'tfhd') {
          // track_ID follows box header (8) + version/flags (4)
          trackMatches = view.getUint32(cursor + 12) === targetTrackId;
        } else if (childType === 'trun' && trackMatches) {
          // data_offset follows header (8) + version/flags (4) + sample_count (4)
          view.setInt32(cursor + 16, dataOffset);
          return;
        }

        cursor += childSize;
      }
    }

    pos += boxSize;
  }
}
492
+
493
+ export default { createInitSegment, createFragment, createCmafFtyp };
package/src/muxers/mp4.js CHANGED
@@ -513,19 +513,26 @@ export class MP4Muxer {
513
513
  if (this.parser.audioPts.length === 0) return null;
514
514
 
515
515
  const firstAudioPts = this.parser.audioPts[0];
516
+ const audioDuration = this.audioSampleSizes.length * this.audioSampleDuration;
516
517
 
517
- // When clipping with preroll, audio is normalized to start at PTS 0
518
- // (matching video playback start after edit list), so no edit list needed
519
- if (firstAudioPts === 0) return null;
518
+ // Determine media_time: when clipping with preroll, audio shares the
519
+ // same timeline as video (both normalized from keyframe), so the audio
520
+ // edit list must skip the same preroll to stay in sync.
521
+ let mediaTime;
522
+ if (this.preroll > 0) {
523
+ mediaTime = Math.round(this.preroll * this.audioTimescale / 90000);
524
+ } else if (firstAudioPts !== 0) {
525
+ mediaTime = Math.round(firstAudioPts * this.audioTimescale / 90000);
526
+ } else {
527
+ return null; // No offset, no preroll — no edit list needed
528
+ }
520
529
 
521
- // For non-clipped content, handle any timestamp offset
522
- const mediaTime = Math.round(firstAudioPts * this.audioTimescale / 90000);
523
- const audioDuration = this.audioSampleSizes.length * this.audioSampleDuration;
530
+ const playbackDuration = Math.max(0, audioDuration - mediaTime);
524
531
 
525
532
  const elstData = new Uint8Array(16);
526
533
  const view = new DataView(elstData.buffer);
527
534
  view.setUint32(0, 1);
528
- view.setUint32(4, Math.round(audioDuration * this.videoTimescale / this.audioTimescale));
535
+ view.setUint32(4, Math.round(playbackDuration * this.videoTimescale / this.audioTimescale));
529
536
  view.setInt32(8, mediaTime);
530
537
  view.setUint16(12, 1);
531
538
  view.setUint16(14, 0);
package/src/ts-to-mp4.js CHANGED
@@ -90,24 +90,23 @@ function clipAccessUnits(videoAUs, audioAUs, startTime, endTime) {
90
90
  // This is the time the decoder needs to process but player shouldn't display
91
91
  const prerollPts = Math.max(0, startPts - keyframePts);
92
92
 
93
- // Clip audio to the REQUESTED time range (not from keyframe)
94
- // Audio doesn't need keyframe pre-roll
95
- const audioStartPts = startPts;
96
- const audioEndPts = Math.min(endPts, lastFramePts + 90000); // Include audio slightly past last video
93
+ // Clip audio from KEYFRAME time (same as video) so A/V stays in sync
94
+ // even on players that ignore edit lists. The edit list will skip the
95
+ // audio preroll on compliant players, just like it does for video.
96
+ const audioStartPts = keyframePts;
97
+ const audioEndPts = Math.min(endPts, lastFramePts + 90000);
97
98
  const clippedAudio = audioAUs.filter(au => au.pts >= audioStartPts && au.pts < audioEndPts);
98
99
 
99
- // Normalize video timestamps so keyframe starts at 0
100
+ // Normalize both video and audio to the same base (keyframe PTS)
101
+ // so they share a common timeline regardless of edit list support
100
102
  const offset = keyframePts;
101
103
  for (const au of clippedVideo) {
102
104
  au.pts -= offset;
103
105
  au.dts -= offset;
104
106
  }
105
107
 
106
- // Normalize audio timestamps so it starts at 0 (matching video playback start after preroll)
107
- // Audio doesn't have preroll, so it should start at PTS 0 to sync with video after edit list
108
- const audioOffset = audioStartPts; // Use requested start, not keyframe
109
108
  for (const au of clippedAudio) {
110
- au.pts -= audioOffset;
109
+ au.pts -= offset;
111
110
  }
112
111
 
113
112
  return {