@invintusmedia/tomp4 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/tomp4.js +56 -18
- package/package.json +4 -2
- package/src/fmp4/converter.js +46 -7
- package/src/hls-clip.js +459 -0
- package/src/index.d.ts +413 -0
- package/src/index.js +18 -2
- package/src/mp4-clip.js +132 -0
- package/src/muxers/fmp4.js +493 -0
- package/src/muxers/mp4.js +14 -7
- package/src/ts-to-mp4.js +8 -9
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* fMP4 Fragment Muxer
|
|
3
|
+
*
|
|
4
|
+
* Creates CMAF-compliant fMP4 init segments and media fragments
|
|
5
|
+
* from parsed TS data (NAL units + AAC frames). Used by the HLS
|
|
6
|
+
* clipper to produce frame-accurate fMP4/CMAF output.
|
|
7
|
+
*
|
|
8
|
+
* @module muxers/fmp4
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { createBox } from '../fmp4/utils.js';
|
|
12
|
+
import { parseSPS } from './mp4.js';
|
|
13
|
+
|
|
14
|
+
// ── helpers ───────────────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
/**
 * Wrap payloads in an ISO BMFF FullBox: a plain box whose payload is
 * prefixed with a 1-byte version and a 24-bit flags field (big-endian).
 *
 * @param {string} type - Four-character box type.
 * @param {number} version - FullBox version byte.
 * @param {number} flags - 24-bit flags value.
 * @param {...Uint8Array} payloads - Payload segments appended after the header.
 * @returns {Uint8Array}
 */
function createFullBox(type, version, flags, ...payloads) {
  const versionAndFlags = Uint8Array.of(
    version,
    (flags >>> 16) & 0xFF,
    (flags >>> 8) & 0xFF,
    flags & 0xFF,
  );
  return createBox(type, versionAndFlags, ...payloads);
}
|
|
24
|
+
|
|
25
|
+
/**
 * Encode a string as one byte per code point (used for 4CC box types
 * and brand identifiers, which are always ASCII).
 *
 * @param {string} s
 * @returns {Uint8Array}
 */
function strToBytes(s) {
  return Uint8Array.from(s, (ch) => ch.charCodeAt(0));
}
|
|
28
|
+
|
|
29
|
+
// ── init segment ──────────────────────────────────────────
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* Create a CMAF ftyp box.
|
|
33
|
+
* @returns {Uint8Array}
|
|
34
|
+
*/
|
|
35
|
+
export function createCmafFtyp() {
  const data = new Uint8Array(16);
  // major brand: isom (bytes 0-3)
  data.set(strToBytes('isom'), 0);
  // minor version: 0x00000200 (bytes 4-7, big-endian).
  // Fix: the previous code set data[7] = 0x02, which encodes 0x00000002
  // rather than the intended 0x200 stated in the comment.
  data[6] = 0x02;
  // compatible brands: isom, iso6 (bytes 8-15)
  data.set(strToBytes('isom'), 8);
  data.set(strToBytes('iso6'), 12);
  return createBox('ftyp', data);
}
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* Build an avcC box from SPS and PPS NAL units.
|
|
49
|
+
*/
|
|
50
|
+
function buildAvcC(sps, pps) {
  // AVCDecoderConfigurationRecord: 11 fixed bytes + one SPS + one PPS.
  const body = new Uint8Array(11 + sps.length + pps.length);
  let at = 0;
  body[at++] = 1;        // configurationVersion
  body[at++] = sps[1];   // AVCProfileIndication (copied from the SPS NAL)
  body[at++] = sps[2];   // profile_compatibility
  body[at++] = sps[3];   // AVCLevelIndication
  body[at++] = 0xFF;     // lengthSizeMinusOne = 3 (4-byte NAL lengths)
  body[at++] = 0xE1;     // numOfSequenceParameterSets = 1
  body[at++] = (sps.length >> 8) & 0xFF;
  body[at++] = sps.length & 0xFF;
  body.set(sps, at);
  at += sps.length;
  body[at++] = 1;        // numOfPictureParameterSets = 1
  body[at++] = (pps.length >> 8) & 0xFF;
  body[at++] = pps.length & 0xFF;
  body.set(pps, at);
  return createBox('avcC', body);
}
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Build an esds box for AAC audio.
|
|
67
|
+
*/
|
|
68
|
+
function buildEsds(sampleRate, channels) {
  // AAC sampling_frequency_index lookup; unknown rates fall back to 44.1 kHz.
  const SAMPLE_RATE_INDEX = {
    96000: 0, 88200: 1, 64000: 2, 48000: 3, 44100: 4, 32000: 5,
    24000: 6, 22050: 7, 16000: 8, 12000: 9, 11025: 10, 8000: 11, 7350: 12,
  };
  const freqIndex = SAMPLE_RATE_INDEX[sampleRate] ?? 4;
  // AudioSpecificConfig: object type 2 (AAC-LC), frequency index, channel config.
  const audioSpecificConfig =
    ((2 << 11) | (freqIndex << 7) | (channels << 3)) & 0xFFFF;

  const payload = new Uint8Array([
    // FullBox version + flags, written by hand (esds is a full box)
    0x00, 0x00, 0x00, 0x00,
    // ES_Descriptor (tag 0x03, length 0x19): ES_ID = 2, no flags
    0x03, 0x19, 0x00, 0x02, 0x00,
    // DecoderConfigDescriptor (tag 0x04, length 0x11): objectType 0x40, streamType byte 0x15
    0x04, 0x11, 0x40, 0x15,
    // bufferSizeDB (24-bit)
    0x00, 0x00, 0x00,
    // maxBitrate, then avgBitrate (both 0x0001F400 = 128000 bps)
    0x00, 0x01, 0xF4, 0x00,
    0x00, 0x01, 0xF4, 0x00,
    // DecoderSpecificInfo (tag 0x05, length 2): the AudioSpecificConfig
    0x05, 0x02,
    audioSpecificConfig >> 8, audioSpecificConfig & 0xFF,
    // SLConfigDescriptor (tag 0x06, length 1): predefined = 2
    0x06, 0x01, 0x02,
  ]);
  return createBox('esds', payload);
}
|
|
89
|
+
|
|
90
|
+
/**
|
|
91
|
+
* Create a CMAF init segment (ftyp + moov) from codec parameters.
|
|
92
|
+
*
|
|
93
|
+
* @param {object} codecInfo
|
|
94
|
+
* @param {Uint8Array} codecInfo.sps - H.264 SPS NAL unit
|
|
95
|
+
* @param {Uint8Array} codecInfo.pps - H.264 PPS NAL unit
|
|
96
|
+
* @param {number} [codecInfo.audioSampleRate=48000]
|
|
97
|
+
* @param {number} [codecInfo.audioChannels=2]
|
|
98
|
+
* @param {boolean} [codecInfo.hasAudio=true]
|
|
99
|
+
* @param {number} [codecInfo.videoTimescale=90000]
|
|
100
|
+
* @param {number} [codecInfo.audioTimescale] - defaults to audioSampleRate
|
|
101
|
+
* @returns {Uint8Array}
|
|
102
|
+
*/
|
|
103
|
+
export function createInitSegment(codecInfo) {
  // Defaults match typical broadcast TS content (90 kHz video, 48 kHz stereo AAC).
  const {
    sps, pps,
    audioSampleRate = 48000,
    audioChannels = 2,
    hasAudio = true,
    videoTimescale = 90000,
  } = codecInfo;
  // Audio timescale defaults to the sample rate (1 tick == 1 PCM sample).
  const audioTimescale = codecInfo.audioTimescale || audioSampleRate;
  // Pixel dimensions are decoded from the SPS NAL unit.
  const { width, height } = parseSPS(sps);

  // Fixed track IDs; media fragments must use the same IDs.
  const VIDEO_TRACK_ID = 1;
  const AUDIO_TRACK_ID = 2;

  // ── mvhd ── (version-0 payload is 96 bytes; fields left at 0 include
  // creation/modification time and duration, which fragments supersede)
  const mvhdData = new Uint8Array(96);
  const mvhdView = new DataView(mvhdData.buffer);
  mvhdView.setUint32(8, 1000); // timescale
  mvhdView.setUint32(16, 0x00010000); // rate (1.0, 16.16 fixed-point)
  mvhdView.setUint16(20, 0x0100); // volume (1.0, 8.8 fixed-point)
  mvhdView.setUint32(32, 0x00010000); // matrix: identity diagonal entries
  mvhdView.setUint32(48, 0x00010000);
  mvhdView.setUint32(64, 0x40000000);
  mvhdView.setUint32(92, hasAudio ? 3 : 2); // next_track_ID
  const mvhd = createFullBox('mvhd', 0, 0, mvhdData);

  // ── video trak ──
  const videoTrak = buildInitTrak(VIDEO_TRACK_ID, 'vide', videoTimescale, width, height, () => {
    const avcC = buildAvcC(sps, pps);
    // avc1 sample entry: 78 fixed bytes followed by the avcC box.
    const avc1Data = new Uint8Array(78 + avcC.byteLength);
    const v = new DataView(avc1Data.buffer);
    v.setUint16(6, 1); v.setUint16(24, width); v.setUint16(26, height); // data_reference_index, width, height
    v.setUint32(28, 0x00480000); v.setUint32(32, 0x00480000); // horiz/vert resolution 72 dpi (16.16)
    v.setUint16(40, 1); v.setUint16(74, 0x0018); v.setInt16(76, -1); // frame_count=1, depth=24, pre_defined=-1
    avc1Data.set(avcC, 78);
    return createBox('avc1', avc1Data);
  });

  // ── audio trak ── (only when the stream carries audio)
  let audioTrak = null;
  if (hasAudio) {
    audioTrak = buildInitTrak(AUDIO_TRACK_ID, 'soun', audioTimescale, 0, 0, () => {
      const esds = buildEsds(audioSampleRate, audioChannels);
      // mp4a sample entry: 28 fixed bytes followed by the esds box.
      const mp4aData = new Uint8Array(28 + esds.byteLength);
      const v = new DataView(mp4aData.buffer);
      v.setUint16(6, 1); v.setUint16(16, audioChannels); v.setUint16(18, 16); // data_reference_index, channelcount, samplesize
      v.setUint32(24, audioTimescale << 16); // samplerate as 16.16 fixed-point
      mp4aData.set(esds, 28);
      return createBox('mp4a', mp4aData);
    });
  }

  // ── mvex (track extends for fragmented mode) ──
  const mvexParts = [buildTrex(VIDEO_TRACK_ID)];
  if (hasAudio) mvexParts.push(buildTrex(AUDIO_TRACK_ID));
  const mvex = createBox('mvex', ...mvexParts);

  // ── assemble moov ──
  const moovParts = [mvhd, videoTrak];
  if (audioTrak) moovParts.push(audioTrak);
  moovParts.push(mvex);
  const moov = createBox('moov', ...moovParts);

  // Init segment = ftyp immediately followed by moov.
  const ftyp = createCmafFtyp();
  const result = new Uint8Array(ftyp.byteLength + moov.byteLength);
  result.set(ftyp, 0);
  result.set(moov, ftyp.byteLength);
  return result;
}
|
|
172
|
+
|
|
173
|
+
/**
|
|
174
|
+
* Build a trak box for the init segment (empty sample tables).
|
|
175
|
+
*/
|
|
176
|
+
function buildInitTrak(trackId, handlerType, timescale, width, height, buildSampleEntry) {
  // tkhd — version-0 payload is 80 bytes; flags=3 (track_enabled | track_in_movie)
  const tkhdData = new Uint8Array(80);
  const tkhdView = new DataView(tkhdData.buffer);
  tkhdView.setUint32(8, trackId); // track_ID (after creation/modification times)
  tkhdView.setUint32(36, 0x00010000); // matrix: identity diagonal entries
  tkhdView.setUint32(52, 0x00010000);
  tkhdView.setUint32(68, 0x40000000);
  if (width && height) {
    // Track width/height as 16.16 fixed-point (set for video only).
    tkhdView.setUint32(72, width << 16);
    tkhdView.setUint32(76, height << 16);
  }
  if (handlerType === 'soun') tkhdView.setUint16(32, 0x0100); // volume = 1.0 for audio
  const tkhd = createFullBox('tkhd', 0, 3, tkhdData);

  // mdhd — media timescale + language
  const mdhdData = new Uint8Array(20);
  new DataView(mdhdData.buffer).setUint32(8, timescale);
  mdhdData[16] = 0x55; mdhdData[17] = 0xC4; // language: und
  const mdhd = createFullBox('mdhd', 0, 0, mdhdData);

  // hdlr — handler_type at offset 4; trailing zero byte is the empty name string
  const hdlrData = new Uint8Array(21);
  hdlrData.set(strToBytes(handlerType), 4);
  const hdlr = createFullBox('hdlr', 0, 0, hdlrData);

  // xmhd (vmhd or smhd) — media-specific header; vmhd uses flags=1
  const xmhd = handlerType === 'vide'
    ? createFullBox('vmhd', 0, 1, new Uint8Array(8))
    : createFullBox('smhd', 0, 0, new Uint8Array(4));

  // dinf — single data reference; url flags=1 means media is self-contained
  const urlBox = createFullBox('url ', 0, 1, new Uint8Array(0));
  const dref = createFullBox('dref', 0, 0, new Uint8Array([0, 0, 0, 1]), urlBox);
  const dinf = createBox('dinf', dref);

  // stbl (empty sample tables for init segment — actual samples arrive in fragments)
  const sampleEntry = buildSampleEntry();
  const stsdHeader = new Uint8Array(4);
  new DataView(stsdHeader.buffer).setUint32(0, 1); // entry_count = 1
  const stsd = createFullBox('stsd', 0, 0, stsdHeader, sampleEntry);

  const emptyStts = createFullBox('stts', 0, 0, new Uint8Array(4));
  const emptyStsc = createFullBox('stsc', 0, 0, new Uint8Array(4));
  const emptyStsz = createFullBox('stsz', 0, 0, new Uint8Array(8));
  const emptyStco = createFullBox('stco', 0, 0, new Uint8Array(4));
  const stbl = createBox('stbl', stsd, emptyStts, emptyStsc, emptyStsz, emptyStco);

  // Assemble the box hierarchy: trak > mdia > minf > stbl.
  const minf = createBox('minf', xmhd, dinf, stbl);
  const mdia = createBox('mdia', mdhd, hdlr, minf);
  return createBox('trak', tkhd, mdia);
}
|
|
228
|
+
|
|
229
|
+
/**
|
|
230
|
+
* Build a trex (track extends) box for mvex.
|
|
231
|
+
*/
|
|
232
|
+
/**
 * Build a trex (track extends) box for mvex. All default sample values
 * (duration, size, flags) are left at 0; fragments carry them per sample.
 */
function buildTrex(trackId) {
  const payload = new Uint8Array(20);
  // track_ID, big-endian
  payload[0] = (trackId >>> 24) & 0xFF;
  payload[1] = (trackId >>> 16) & 0xFF;
  payload[2] = (trackId >>> 8) & 0xFF;
  payload[3] = trackId & 0xFF;
  // default_sample_description_index = 1
  payload[7] = 1;
  return createFullBox('trex', 0, 0, payload);
}
|
|
239
|
+
|
|
240
|
+
// ── media fragments ───────────────────────────────────────
|
|
241
|
+
|
|
242
|
+
/**
|
|
243
|
+
* Create an fMP4 media fragment (moof + mdat) from video and audio samples.
|
|
244
|
+
*
|
|
245
|
+
* Video samples: array of { nalUnits: Uint8Array[], pts: number, dts: number }
|
|
246
|
+
* (pts/dts in 90kHz ticks, same as TSParser output)
|
|
247
|
+
*
|
|
248
|
+
* Audio samples: array of { data: Uint8Array, pts: number }
|
|
249
|
+
* (pts in 90kHz ticks)
|
|
250
|
+
*
|
|
251
|
+
* @param {object} opts
|
|
252
|
+
* @param {Array} opts.videoSamples - Video access units
|
|
253
|
+
* @param {Array} [opts.audioSamples] - Audio access units
|
|
254
|
+
* @param {number} opts.sequenceNumber - Fragment sequence (1-based)
|
|
255
|
+
* @param {number} opts.videoTimescale - Video timescale (typically 90000)
|
|
256
|
+
* @param {number} [opts.audioTimescale=48000] - Audio timescale
|
|
257
|
+
* @param {number} [opts.videoBaseTime=0] - Video base decode time (in videoTimescale ticks)
|
|
258
|
+
* @param {number} [opts.audioBaseTime=0] - Audio base decode time (in audioTimescale ticks)
|
|
259
|
+
* @param {number} [opts.audioSampleDuration=1024] - AAC frame duration in audio timescale
|
|
260
|
+
* @returns {Uint8Array} moof + mdat
|
|
261
|
+
*/
|
|
262
|
+
export function createFragment(opts) {
  const {
    videoSamples,
    audioSamples = [],
    sequenceNumber = 1,
    videoTimescale = 90000,
    audioTimescale = 48000,
    videoBaseTime = 0,
    audioBaseTime = 0,
    audioSampleDuration = 1024,
  } = opts;

  // Must match the track IDs used by createInitSegment().
  const VIDEO_TRACK_ID = 1;
  const AUDIO_TRACK_ID = 2;

  // ── build video sample data (AVCC format) + metadata ──
  // Each access unit's NAL units get a 4-byte big-endian length prefix
  // (AVCC), replacing the Annex-B start codes from the TS parser.
  const videoChunks = [];
  const videoMeta = [];
  for (let i = 0; i < videoSamples.length; i++) {
    const au = videoSamples[i];
    let sampleSize = 0;
    const parts = [];
    for (const nalUnit of au.nalUnits) {
      const prefixed = new Uint8Array(4 + nalUnit.length);
      new DataView(prefixed.buffer).setUint32(0, nalUnit.length);
      prefixed.set(nalUnit, 4);
      parts.push(prefixed);
      sampleSize += prefixed.length;
    }
    videoChunks.push(parts);

    // Detect keyframe (IDR NAL type 5)
    let isKeyframe = false;
    for (const nalUnit of au.nalUnits) {
      if ((nalUnit[0] & 0x1F) === 5) { isKeyframe = true; break; }
    }

    // Per-sample duration = DTS delta to the next sample. The last sample
    // reuses the previous sample's duration; 3003 (≈29.97 fps at 90 kHz)
    // is the fallback when there is only one sample.
    const duration = i < videoSamples.length - 1
      ? videoSamples[i + 1].dts - au.dts
      : (videoMeta.length > 0 ? videoMeta[videoMeta.length - 1].duration : 3003);
    const compositionTimeOffset = au.pts - au.dts;

    videoMeta.push({
      size: sampleSize,
      duration,
      // sample_flags: 0x02000000 marks a sync sample; 0x01010000 marks a
      // dependent, non-sync sample.
      flags: isKeyframe ? 0x02000000 : 0x01010000,
      compositionTimeOffset,
    });
  }

  // ── build audio sample data + metadata ──
  const audioChunks = [];
  const audioMeta = [];
  for (const frame of audioSamples) {
    audioChunks.push(frame.data);
    audioMeta.push({ size: frame.data.length });
  }

  // ── compute total mdat content sizes ──
  let videoDataSize = 0;
  for (const parts of videoChunks) for (const p of parts) videoDataSize += p.length;
  let audioDataSize = 0;
  for (const d of audioChunks) audioDataSize += d.length;

  // ── build trafs ──
  // We need to know the moof size to set trun data_offset. Two-pass:
  // 1. Build trafs with placeholder data_offset
  // 2. Measure moof size
  // 3. Patch data_offsets

  const videoTraf = buildTraf(VIDEO_TRACK_ID, videoBaseTime, videoMeta, true);
  const audioTraf = audioMeta.length > 0
    ? buildTraf(AUDIO_TRACK_ID, audioBaseTime, audioMeta, false, audioSampleDuration)
    : null;

  // mfhd — carries the 1-based fragment sequence number
  const mfhdData = new Uint8Array(4);
  new DataView(mfhdData.buffer).setUint32(0, sequenceNumber);
  const mfhd = createFullBox('mfhd', 0, 0, mfhdData);

  // Assemble moof (with placeholder offsets)
  const moofParts = [mfhd, videoTraf];
  if (audioTraf) moofParts.push(audioTraf);
  const moof = createBox('moof', ...moofParts);

  // ── build mdat ── (video payload first, then audio, matching the
  // data_offsets computed below)
  const mdatContentSize = videoDataSize + audioDataSize;
  const mdatHeaderSize = 8;
  const mdatTotal = mdatHeaderSize + mdatContentSize;
  const mdat = new Uint8Array(mdatTotal);
  new DataView(mdat.buffer).setUint32(0, mdatTotal);
  mdat[4] = 'm'.charCodeAt(0); mdat[5] = 'd'.charCodeAt(0);
  mdat[6] = 'a'.charCodeAt(0); mdat[7] = 't'.charCodeAt(0);

  let writeOffset = mdatHeaderSize;
  for (const parts of videoChunks) {
    for (const p of parts) { mdat.set(p, writeOffset); writeOffset += p.length; }
  }
  for (const d of audioChunks) { mdat.set(d, writeOffset); writeOffset += d.length; }

  // ── patch trun data_offsets in moof ──
  // data_offset = byte distance from moof start to the track's data in mdat
  const videoDataOffset = moof.byteLength + mdatHeaderSize;
  const audioDataOffset = videoDataOffset + videoDataSize;
  patchTrunDataOffset(moof, VIDEO_TRACK_ID, videoDataOffset);
  if (audioTraf) patchTrunDataOffset(moof, AUDIO_TRACK_ID, audioDataOffset);

  // ── combine ── moof immediately followed by mdat
  const result = new Uint8Array(moof.byteLength + mdat.byteLength);
  result.set(moof, 0);
  result.set(mdat, moof.byteLength);
  return result;
}
|
|
375
|
+
|
|
376
|
+
/**
|
|
377
|
+
* Build a traf box for one track.
|
|
378
|
+
*/
|
|
379
|
+
/**
 * Build a traf (track fragment) box: tfhd + tfdt + trun.
 *
 * @param {number} trackId
 * @param {number} baseDecodeTime - Base media decode time in track timescale ticks.
 * @param {Array} sampleMeta - Per-sample metadata for the trun.
 * @param {boolean} isVideo - Selects the video or audio trun layout.
 * @param {number} [defaultDuration=0] - Per-sample duration for audio truns.
 * @returns {Uint8Array}
 */
function buildTraf(trackId, baseDecodeTime, sampleMeta, isVideo, defaultDuration = 0) {
  // tfhd: only the track_id, with the default-base-is-moof flag set —
  // no per-track defaults in the header.
  const tfhdPayload = new Uint8Array(4);
  new DataView(tfhdPayload.buffer).setUint32(0, trackId);
  const tfhd = createFullBox('tfhd', 0, 0x020000, tfhdPayload);

  // tfdt (version 1, 64-bit base time to survive large timestamps),
  // written as two big-endian 32-bit words.
  const tfdtPayload = new Uint8Array(8);
  const tfdtView = new DataView(tfdtPayload.buffer);
  tfdtView.setUint32(0, (baseDecodeTime / 0x100000000) >>> 0); // high word
  tfdtView.setUint32(4, baseDecodeTime >>> 0);                 // low word
  const tfdt = createFullBox('tfdt', 1, 0, tfdtPayload);

  // trun layout differs between tracks: video carries per-sample
  // duration/size/flags/CTO, audio carries size with a shared duration.
  const trun = isVideo
    ? buildVideoTrun(sampleMeta)
    : buildAudioTrun(sampleMeta, defaultDuration);

  return createBox('traf', tfhd, tfdt, trun);
}
|
|
401
|
+
|
|
402
|
+
/**
|
|
403
|
+
* Build a video trun with per-sample duration, size, flags, CTO.
|
|
404
|
+
*/
|
|
405
|
+
/**
 * Build a video trun with per-sample duration, size, flags, and
 * composition time offset.
 */
function buildVideoTrun(samples) {
  // 0x000F01 = data-offset-present | sample-duration | sample-size |
  //            sample-flags | sample-composition-time-offset present
  const TRUN_FLAGS = 0x000001 | 0x000100 | 0x000200 | 0x000400 | 0x000800;
  const HEADER = 8;      // sample_count(4) + data_offset(4)
  const PER_SAMPLE = 16; // duration(4) + size(4) + flags(4) + CTO(4)

  const body = new Uint8Array(HEADER + samples.length * PER_SAMPLE);
  const dv = new DataView(body.buffer);
  dv.setUint32(0, samples.length);
  dv.setInt32(4, 0); // data_offset — patched once moof size is known

  samples.forEach((sample, i) => {
    const base = HEADER + i * PER_SAMPLE;
    dv.setUint32(base, sample.duration);
    dv.setUint32(base + 4, sample.size);
    dv.setUint32(base + 8, sample.flags);
    dv.setInt32(base + 12, sample.compositionTimeOffset);
  });

  return createFullBox('trun', 0, TRUN_FLAGS, body);
}
|
|
426
|
+
|
|
427
|
+
/**
|
|
428
|
+
* Build an audio trun with per-sample size (duration via default).
|
|
429
|
+
*/
|
|
430
|
+
/**
 * Build an audio trun with per-sample size; every sample is written with
 * the same explicit duration (typically one AAC frame).
 */
function buildAudioTrun(samples, defaultDuration) {
  // 0x000301 = data-offset-present | sample-duration | sample-size present
  const TRUN_FLAGS = 0x000001 | 0x000100 | 0x000200;
  const HEADER = 8;     // sample_count(4) + data_offset(4)
  const PER_SAMPLE = 8; // duration(4) + size(4)

  const body = new Uint8Array(HEADER + samples.length * PER_SAMPLE);
  const dv = new DataView(body.buffer);
  dv.setUint32(0, samples.length);
  dv.setInt32(4, 0); // data_offset — patched once moof size is known

  samples.forEach((sample, i) => {
    const base = HEADER + i * PER_SAMPLE;
    dv.setUint32(base, defaultDuration);
    dv.setUint32(base + 4, sample.size);
  });

  return createFullBox('trun', 0, TRUN_FLAGS, body);
}
|
|
449
|
+
|
|
450
|
+
/**
|
|
451
|
+
* Patch the data_offset in a trun box within a moof.
|
|
452
|
+
* Scans the moof for a traf with the given trackId, then patches its trun.
|
|
453
|
+
*/
|
|
454
|
+
/**
 * Patch the data_offset of the trun belonging to one track inside a moof.
 * Walks the moof's child boxes, finds the traf whose tfhd matches
 * targetTrackId, and writes dataOffset into that traf's trun. Mutates
 * the moof bytes in place; does nothing if no matching trun is found.
 */
function patchTrunDataOffset(moof, targetTrackId, dataOffset) {
  const view = new DataView(moof.buffer, moof.byteOffset, moof.byteLength);
  const typeAt = (p) =>
    String.fromCharCode(moof[p + 4], moof[p + 5], moof[p + 6], moof[p + 7]);

  let pos = 8; // first child box after the moof header
  while (pos + 8 < moof.byteLength) {
    const boxSize = view.getUint32(pos);
    if (boxSize < 8) break; // malformed box — stop scanning

    if (typeAt(pos) === 'traf') {
      const trafEnd = pos + boxSize;
      let matchesTrack = false;
      let inner = pos + 8;
      while (inner + 8 < trafEnd) {
        const innerSize = view.getUint32(inner);
        if (innerSize < 8) break;
        const innerType = typeAt(inner);
        if (innerType === 'tfhd') {
          // track_ID sits after the tfhd box header (8) + version/flags (4).
          matchesTrack = view.getUint32(inner + 12) === targetTrackId;
        } else if (innerType === 'trun' && matchesTrack) {
          // data_offset sits after box header (8) + version/flags (4) +
          // sample_count (4) = inner + 16.
          view.setInt32(inner + 16, dataOffset);
          return;
        }
        inner += innerSize;
      }
    }

    pos += boxSize;
  }
}
|
|
492
|
+
|
|
493
|
+
export default { createInitSegment, createFragment, createCmafFtyp };
|
package/src/muxers/mp4.js
CHANGED
|
@@ -513,19 +513,26 @@ export class MP4Muxer {
|
|
|
513
513
|
if (this.parser.audioPts.length === 0) return null;
|
|
514
514
|
|
|
515
515
|
const firstAudioPts = this.parser.audioPts[0];
|
|
516
|
+
const audioDuration = this.audioSampleSizes.length * this.audioSampleDuration;
|
|
516
517
|
|
|
517
|
-
//
|
|
518
|
-
//
|
|
519
|
-
|
|
518
|
+
// Determine media_time: when clipping with preroll, audio shares the
|
|
519
|
+
// same timeline as video (both normalized from keyframe), so the audio
|
|
520
|
+
// edit list must skip the same preroll to stay in sync.
|
|
521
|
+
let mediaTime;
|
|
522
|
+
if (this.preroll > 0) {
|
|
523
|
+
mediaTime = Math.round(this.preroll * this.audioTimescale / 90000);
|
|
524
|
+
} else if (firstAudioPts !== 0) {
|
|
525
|
+
mediaTime = Math.round(firstAudioPts * this.audioTimescale / 90000);
|
|
526
|
+
} else {
|
|
527
|
+
return null; // No offset, no preroll — no edit list needed
|
|
528
|
+
}
|
|
520
529
|
|
|
521
|
-
|
|
522
|
-
const mediaTime = Math.round(firstAudioPts * this.audioTimescale / 90000);
|
|
523
|
-
const audioDuration = this.audioSampleSizes.length * this.audioSampleDuration;
|
|
530
|
+
const playbackDuration = Math.max(0, audioDuration - mediaTime);
|
|
524
531
|
|
|
525
532
|
const elstData = new Uint8Array(16);
|
|
526
533
|
const view = new DataView(elstData.buffer);
|
|
527
534
|
view.setUint32(0, 1);
|
|
528
|
-
view.setUint32(4, Math.round(
|
|
535
|
+
view.setUint32(4, Math.round(playbackDuration * this.videoTimescale / this.audioTimescale));
|
|
529
536
|
view.setInt32(8, mediaTime);
|
|
530
537
|
view.setUint16(12, 1);
|
|
531
538
|
view.setUint16(14, 0);
|
package/src/ts-to-mp4.js
CHANGED
|
@@ -90,24 +90,23 @@ function clipAccessUnits(videoAUs, audioAUs, startTime, endTime) {
|
|
|
90
90
|
// This is the time the decoder needs to process but player shouldn't display
|
|
91
91
|
const prerollPts = Math.max(0, startPts - keyframePts);
|
|
92
92
|
|
|
93
|
-
// Clip audio
|
|
94
|
-
//
|
|
95
|
-
|
|
96
|
-
const
|
|
93
|
+
// Clip audio from KEYFRAME time (same as video) so A/V stays in sync
|
|
94
|
+
// even on players that ignore edit lists. The edit list will skip the
|
|
95
|
+
// audio preroll on compliant players, just like it does for video.
|
|
96
|
+
const audioStartPts = keyframePts;
|
|
97
|
+
const audioEndPts = Math.min(endPts, lastFramePts + 90000);
|
|
97
98
|
const clippedAudio = audioAUs.filter(au => au.pts >= audioStartPts && au.pts < audioEndPts);
|
|
98
99
|
|
|
99
|
-
// Normalize video
|
|
100
|
+
// Normalize both video and audio to the same base (keyframe PTS)
|
|
101
|
+
// so they share a common timeline regardless of edit list support
|
|
100
102
|
const offset = keyframePts;
|
|
101
103
|
for (const au of clippedVideo) {
|
|
102
104
|
au.pts -= offset;
|
|
103
105
|
au.dts -= offset;
|
|
104
106
|
}
|
|
105
107
|
|
|
106
|
-
// Normalize audio timestamps so it starts at 0 (matching video playback start after preroll)
|
|
107
|
-
// Audio doesn't have preroll, so it should start at PTS 0 to sync with video after edit list
|
|
108
|
-
const audioOffset = audioStartPts; // Use requested start, not keyframe
|
|
109
108
|
for (const au of clippedAudio) {
|
|
110
|
-
au.pts -=
|
|
109
|
+
au.pts -= offset;
|
|
111
110
|
}
|
|
112
111
|
|
|
113
112
|
return {
|