audio-video-sync 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,475 @@
1
+ import { FFmpeg } from '@ffmpeg/ffmpeg';
2
+ import { fetchFile } from '@ffmpeg/util';
3
+
4
/**
 * Audio extraction module.
 * Uses FFmpeg.wasm to extract PCM audio data from video files.
 */
// Frozen so the shared defaults cannot be mutated accidentally; extractAudio
// copies them via object spread before applying caller overrides.
const DEFAULT_OPTIONS = Object.freeze({
    sampleRate: 16000, // Hz — low rate keeps correlation fast; plenty for alignment
    mono: true,        // mix down to a single channel
    maxDuration: 60    // seconds of audio to extract at most
});
13
/**
 * Extract PCM audio data from a video file.
 *
 * @param ffmpeg Loaded FFmpeg instance
 * @param videoFile Video file (File or Blob)
 * @param options Extraction options (sampleRate, mono, maxDuration)
 * @returns {Promise<{samples: Float32Array, sampleRate: number, duration: number, channels: number}>}
 */
async function extractAudio(ffmpeg, videoFile, options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    const inputName = 'input_video';
    const outputName = 'output_audio.pcm';
    // Write the input into FFmpeg's virtual file system.
    await ffmpeg.writeFile(inputName, await fetchFile(videoFile));
    let pcmData;
    try {
        // Build the FFmpeg command line.
        const args = [
            '-i', inputName,
            '-vn', // drop the video stream
            '-ac', opts.mono ? '1' : '2', // channel count
            '-ar', opts.sampleRate.toString(), // sample rate
            '-f', 's16le', // 16-bit little-endian PCM
            '-t', opts.maxDuration.toString(), // cap the duration
            outputName
        ];
        await ffmpeg.exec(args);
        // Read the decoded PCM back out.
        const data = await ffmpeg.readFile(outputName);
        pcmData = data instanceof Uint8Array ? data : new Uint8Array(data);
    }
    finally {
        // Always clean up temp files — previously a failed exec/readFile leaked
        // them in the wasm file system, growing memory across repeated calls.
        // The output file may not exist when exec failed, so ignore delete errors.
        await ffmpeg.deleteFile(inputName).catch(() => {});
        await ffmpeg.deleteFile(outputName).catch(() => {});
    }
    // Convert 16-bit PCM to normalized Float32Array samples.
    const samples = pcm16ToFloat32(pcmData);
    return {
        samples,
        sampleRate: opts.sampleRate,
        duration: samples.length / opts.sampleRate,
        channels: opts.mono ? 1 : 2
    };
}
53
/**
 * Convert 16-bit signed little-endian PCM bytes to a Float32Array
 * normalized to the range [-1, 1).
 *
 * @param {Uint8Array} pcmData Raw s16le PCM bytes
 * @returns {Float32Array} Normalized samples
 */
function pcm16ToFloat32(pcmData) {
    // Floor guards against a trailing odd byte (truncated stream), which would
    // otherwise make the length fractional and make new Float32Array(...) throw.
    const numSamples = Math.floor(pcmData.length / 2);
    const float32 = new Float32Array(numSamples);
    const view = new DataView(pcmData.buffer, pcmData.byteOffset, pcmData.byteLength);
    for (let i = 0; i < numSamples; i++) {
        // Read a 16-bit signed integer (little-endian) and scale by 1/32768.
        const int16 = view.getInt16(i * 2, true);
        float32[i] = int16 / 32768;
    }
    return float32;
}
68
/**
 * Flatten an AudioBuffer (Web Audio API) into a mono Float32Array.
 * Multi-channel buffers are averaged down to a single channel.
 */
function audioBufferToFloat32(audioBuffer) {
    // Mono buffers can be handed back directly without copying.
    if (audioBuffer.numberOfChannels === 1) {
        return audioBuffer.getChannelData(0);
    }
    const frameCount = audioBuffer.length;
    const channelCount = audioBuffer.numberOfChannels;
    const mono = new Float32Array(frameCount);
    for (let channel = 0; channel < channelCount; channel++) {
        const data = audioBuffer.getChannelData(channel);
        for (let frame = 0; frame < frameCount; frame++) {
            mono[frame] += data[frame] / channelCount;
        }
    }
    return mono;
}
87
/**
 * Reduce the sample rate of a signal by nearest-neighbor decimation.
 * Returns the input array unchanged when the rates already match.
 */
function downsample(samples, fromRate, toRate) {
    if (fromRate === toRate) {
        return samples;
    }
    const step = fromRate / toRate;
    const outLength = Math.floor(samples.length / step);
    const out = new Float32Array(outLength);
    for (let i = 0; i < outLength; i++) {
        // Pick the nearest source sample below the ideal position.
        out[i] = samples[Math.floor(i * step)];
    }
    return out;
}
103
/**
 * Simple audio preprocessing: remove the DC offset (mean) and normalize
 * the peak amplitude to 1. Silent input comes back as all zeros.
 */
function preprocessAudio(samples) {
    const n = samples.length;
    const out = new Float32Array(n);
    // Mean of the signal = DC offset.
    let total = 0;
    for (const s of samples) {
        total += s;
    }
    const mean = total / n;
    // Center the signal and track the peak magnitude in a single pass.
    let peak = 0;
    for (let i = 0; i < n; i++) {
        out[i] = samples[i] - mean;
        const magnitude = Math.abs(out[i]);
        if (magnitude > peak) {
            peak = magnitude;
        }
    }
    // Scale so the loudest sample has magnitude 1 (skip for silence).
    if (peak > 0) {
        for (let i = 0; i < n; i++) {
            out[i] /= peak;
        }
    }
    return out;
}
128
+
129
/**
 * FFT (Fast Fourier Transform) implementation.
 * Used for frequency-domain analysis and cross-correlation of audio signals.
 */
/**
 * Copy an array into a zero-padded Float32Array of the given length.
 */
function padToPowerOfTwo(arr, targetLength) {
    const out = new Float32Array(targetLength); // zero-initialized by construction
    out.set(arr);
    return out;
}
141
/**
 * Smallest power of two that is >= n.
 */
function nextPowerOfTwo(n) {
    const exponent = Math.ceil(Math.log2(n));
    return 1 << exponent;
}
147
/**
 * Cooley-Tukey radix-2 FFT of a real-valued input.
 *
 * @param {Float32Array|number[]} input Real samples; length must be a power of two
 * @returns {Array<[number, number]>} Complex spectrum as [re, im] pairs
 * @throws {Error} If the input length is not a power of two
 */
function fft(input) {
    const size = input.length;
    if (size === 1) {
        return [[input[0], 0]];
    }
    if (size & (size - 1)) {
        throw new Error('FFT 输入长度必须是 2 的幂次');
    }
    const half = size / 2;
    // Split into even- and odd-indexed subsequences.
    const evens = new Float32Array(half);
    const odds = new Float32Array(half);
    for (let i = 0; i < half; i++) {
        evens[i] = input[2 * i];
        odds[i] = input[2 * i + 1];
    }
    const evenSpec = fft(evens);
    const oddSpec = fft(odds);
    // Butterfly combine using twiddle factors e^{-2*pi*i*k/size}.
    const spectrum = new Array(size);
    for (let k = 0; k < half; k++) {
        const angle = (-2 * Math.PI * k) / size;
        const wRe = Math.cos(angle);
        const wIm = Math.sin(angle);
        const [oRe, oIm] = oddSpec[k];
        const [eRe, eIm] = evenSpec[k];
        // t = twiddle * oddSpec[k] (complex multiplication)
        const tRe = wRe * oRe - wIm * oIm;
        const tIm = wRe * oIm + wIm * oRe;
        spectrum[k] = [eRe + tRe, eIm + tIm];
        spectrum[k + half] = [eRe - tRe, eIm - tIm];
    }
    return spectrum;
}
181
/**
 * Inverse FFT via the conjugation identity:
 * IFFT(x) = conj(FFT(conj(x))) / n.
 *
 * @param {Array<[number, number]>} input Complex spectrum as [re, im] pairs
 * @returns {Array<[number, number]>} Complex time-domain signal
 */
function ifft(input) {
    const n = input.length;
    // Conjugate the spectrum, run a forward complex FFT, then conjugate and
    // scale by 1/n. (A dead `realPart` buffer that was filled but never used
    // has been removed.)
    const conjugated = input.map(([re, im]) => [re, -im]);
    const transformed = fftComplex(conjugated);
    return transformed.map(([re, im]) => [re / n, -im / n]);
}
198
/**
 * Cooley-Tukey radix-2 FFT for complex-valued input.
 *
 * @param {Array<[number, number]>} input Complex samples as [re, im] pairs;
 *   length must be a power of two
 * @returns {Array<[number, number]>} Complex spectrum
 * @throws {Error} If the input length is not a power of two
 */
function fftComplex(input) {
    const size = input.length;
    if (size === 1) {
        return [input[0]];
    }
    if (size & (size - 1)) {
        throw new Error('FFT 输入长度必须是 2 的幂次');
    }
    const half = size / 2;
    // Partition into even- and odd-indexed subsequences.
    const evens = [];
    const odds = [];
    for (let i = 0; i < half; i++) {
        evens.push(input[2 * i]);
        odds.push(input[2 * i + 1]);
    }
    const evenSpec = fftComplex(evens);
    const oddSpec = fftComplex(odds);
    // Butterfly combine using twiddle factors e^{-2*pi*i*k/size}.
    const spectrum = new Array(size);
    for (let k = 0; k < half; k++) {
        const angle = (-2 * Math.PI * k) / size;
        const wRe = Math.cos(angle);
        const wIm = Math.sin(angle);
        const [oRe, oIm] = oddSpec[k];
        const [eRe, eIm] = evenSpec[k];
        const tRe = wRe * oRe - wIm * oIm;
        const tIm = wRe * oIm + wIm * oRe;
        spectrum[k] = [eRe + tRe, eIm + tIm];
        spectrum[k + half] = [eRe - tRe, eIm - tIm];
    }
    return spectrum;
}
230
/**
 * Cross-correlation of two signals, accelerated with the FFT:
 * corr(a, b) = IFFT(FFT(a) * conj(FFT(b))).
 *
 * @param signalA Reference signal
 * @param signalB Signal to align
 * @returns {Float32Array} Circular cross-correlation values (real part)
 */
function crossCorrelate(signalA, signalB) {
    // FFT length: next power of two covering the full linear correlation.
    const fftSize = nextPowerOfTwo(signalA.length + signalB.length - 1);
    // Zero-pad both signals to the common FFT length and transform.
    const specA = fft(padToPowerOfTwo(signalA, fftSize));
    const specB = fft(padToPowerOfTwo(signalB, fftSize));
    // Pointwise product FFT(A) * conj(FFT(B)):
    // a * conj(b) = (a.re*b.re + a.im*b.im) + i(a.im*b.re - a.re*b.im)
    const product = new Array(fftSize);
    for (let i = 0; i < fftSize; i++) {
        const [aRe, aIm] = specA[i];
        const [bRe, bIm] = specB[i];
        product[i] = [aRe * bRe + aIm * bIm, aIm * bRe - aRe * bIm];
    }
    // Back to the time domain; keep only the real part.
    const timeDomain = ifft(product);
    const correlation = new Float32Array(fftSize);
    for (let i = 0; i < fftSize; i++) {
        correlation[i] = timeDomain[i][0];
    }
    return correlation;
}
265
/**
 * Locate the peak of a circular cross-correlation and convert its index
 * to a signed sample offset.
 *
 * @param correlation Cross-correlation values
 * @param signalBLength Original length of signal B (kept for API compatibility)
 * @returns Offset in samples (positive: B lags A; negative: B leads A)
 */
function findPeakOffset(correlation, signalBLength) {
    // Argmax over the correlation array (first occurrence wins on ties).
    let bestIndex = 0;
    let bestValue = -Infinity;
    correlation.forEach((value, index) => {
        if (value > bestValue) {
            bestValue = value;
            bestIndex = index;
        }
    });
    // Indices in the upper half wrap around to negative offsets, because the
    // FFT-based correlation is circular.
    const total = correlation.length;
    return bestIndex > total / 2 ? bestIndex - total : bestIndex;
}
290
/**
 * Confidence of a correlation peak: the peak magnitude normalized by the
 * geometric mean of the two signals' energies (a normalized correlation
 * coefficient). Returns 0 for zero-energy (silent) input.
 */
function calculateConfidence(signalA, signalB, correlation, peakIndex) {
    // Negative offsets index from the end of the circular correlation.
    const index = peakIndex < 0 ? correlation.length + peakIndex : peakIndex;
    const peakValue = correlation[index];
    // Total energy of each signal.
    const energyOf = (signal) => {
        let energy = 0;
        for (const s of signal) {
            energy += s * s;
        }
        return energy;
    };
    const normFactor = Math.sqrt(energyOf(signalA) * energyOf(signalB));
    // Guard against division by zero for silent signals.
    if (normFactor === 0) {
        return 0;
    }
    return Math.abs(peakValue) / normFactor;
}
310
+
311
/**
 * Main module for multi-camera audio/video synchronization.
 *
 * Uses audio cross-correlation to automatically align the timelines of
 * multiple videos.
 */
/**
 * Audio/video synchronizer.
 */
class AudioVideoSync {
    // ffmpeg: optional pre-created FFmpeg instance to reuse; when omitted a
    // fresh instance is constructed and loaded lazily by load().
    constructor(ffmpeg) {
        this.loaded = false;
        this.ffmpeg = ffmpeg || new FFmpeg();
    }
    /**
     * Load FFmpeg if it has not been loaded yet. Idempotent: subsequent calls
     * return immediately once this.loaded is set.
     */
    async load() {
        if (this.loaded)
            return;
        if (!this.ffmpeg.loaded) {
            await this.ffmpeg.load();
        }
        this.loaded = true;
    }
    /**
     * Synchronize multiple videos against a reference video.
     *
     * @param videos Array of video entries; each is read here for `.id`,
     *   `.file` and `.originalStartTime`
     * @param options Sync options: referenceIndex (default 0), minConfidence
     *   (default 0.3), onProgress callback; remaining keys are forwarded to
     *   extractAudio as extraction options
     * @returns Result object: { referenceId, results, sampleRate, success, error? }
     */
    async syncVideos(videos, options = {}) {
        const { referenceIndex = 0, minConfidence = 0.3, onProgress, ...extractOptions } = options;
        // Need at least two videos to have anything to align.
        if (videos.length < 2) {
            return {
                referenceId: videos[0]?.id || '0',
                results: [],
                sampleRate: extractOptions.sampleRate || 16000,
                success: false,
                error: '至少需要 2 个视频进行同步'
            };
        }
        // Reject an out-of-range reference index up front.
        if (referenceIndex < 0 || referenceIndex >= videos.length) {
            return {
                referenceId: '',
                results: [],
                sampleRate: extractOptions.sampleRate || 16000,
                success: false,
                error: '参考视频索引无效'
            };
        }
        try {
            await this.load();
            // Phase 1: extract audio from every video (sequentially — FFmpeg's
            // virtual FS uses fixed temp file names, so calls must not overlap).
            const audioDataList = [];
            for (let i = 0; i < videos.length; i++) {
                onProgress?.('extracting', (i + 1) / videos.length);
                const audio = await extractAudio(this.ffmpeg, videos[i].file, extractOptions);
                audioDataList.push(audio);
            }
            // Phase 2: preprocess (DC-offset removal + peak normalization).
            const processedAudio = audioDataList.map(a => preprocessAudio(a.samples));
            // The reference everything else is aligned to.
            const referenceAudio = processedAudio[referenceIndex];
            const referenceVideo = videos[referenceIndex];
            const sampleRate = audioDataList[referenceIndex].sampleRate;
            // Phase 3: compute each video's offset relative to the reference.
            const results = [];
            for (let i = 0; i < videos.length; i++) {
                onProgress?.('correlating', (i + 1) / videos.length);
                const video = videos[i];
                const videoId = video.id || i.toString();
                if (i === referenceIndex) {
                    // The reference video has zero offset and full confidence
                    // by definition.
                    results.push({
                        id: videoId,
                        offsetSeconds: 0,
                        offsetSamples: 0,
                        confidence: 1,
                        correctedStartTime: video.originalStartTime || null
                    });
                    continue;
                }
                const targetAudio = processedAudio[i];
                // Cross-correlate against the reference.
                const correlation = crossCorrelate(referenceAudio, targetAudio);
                // Convert the correlation peak to a signed sample offset.
                const offsetSamples = findPeakOffset(correlation, targetAudio.length);
                const offsetSeconds = offsetSamples / sampleRate;
                // Normalized correlation coefficient as a confidence score.
                const confidence = calculateConfidence(referenceAudio, targetAudio, correlation, offsetSamples);
                // Derive this video's corrected wall-clock start time from the
                // reference's, when the reference has one.
                let correctedStartTime = null;
                if (referenceVideo.originalStartTime) {
                    correctedStartTime = new Date(referenceVideo.originalStartTime.getTime() - offsetSeconds * 1000);
                }
                results.push({
                    id: videoId,
                    offsetSeconds,
                    offsetSamples,
                    confidence,
                    correctedStartTime
                });
            }
            // Overall success requires every entry to meet the confidence floor.
            const allSuccessful = results.every(r => r.confidence >= minConfidence);
            return {
                referenceId: referenceVideo.id || referenceIndex.toString(),
                results,
                sampleRate,
                success: allSuccessful,
                error: allSuccessful ? undefined : '部分视频同步置信度过低'
            };
        }
        catch (error) {
            // Any failure (FFmpeg load, extraction, correlation) is reported as
            // an unsuccessful result rather than thrown to the caller.
            return {
                referenceId: videos[referenceIndex]?.id || referenceIndex.toString(),
                results: [],
                sampleRate: extractOptions.sampleRate || 16000,
                success: false,
                error: error instanceof Error ? error.message : '同步过程发生错误'
            };
        }
    }
    /**
     * Compute the time offset between two videos.
     *
     * @param referenceVideo Reference video file (File or Blob)
     * @param targetVideo Target video file (File or Blob)
     * @param options Extraction options passed through to extractAudio
     * @returns { offsetSeconds, confidence } — positive offset means the
     *   target lags the reference (see findPeakOffset)
     */
    async calculateOffset(referenceVideo, targetVideo, options = {}) {
        await this.load();
        // Extract both audio tracks.
        const refAudio = await extractAudio(this.ffmpeg, referenceVideo, options);
        const targetAudio = await extractAudio(this.ffmpeg, targetVideo, options);
        // Preprocess (DC-offset removal + normalization).
        const refProcessed = preprocessAudio(refAudio.samples);
        const targetProcessed = preprocessAudio(targetAudio.samples);
        // Cross-correlate and locate the peak.
        const correlation = crossCorrelate(refProcessed, targetProcessed);
        const offsetSamples = findPeakOffset(correlation, targetProcessed.length);
        const offsetSeconds = offsetSamples / refAudio.sampleRate;
        // Normalized correlation coefficient as a confidence score.
        const confidence = calculateConfidence(refProcessed, targetProcessed, correlation, offsetSamples);
        return { offsetSeconds, confidence };
    }
}
461
/**
 * Convenience factory for an AudioVideoSync instance.
 *
 * @param ffmpeg Optional pre-created FFmpeg instance to reuse
 */
function createSync(ffmpeg) {
    const synchronizer = new AudioVideoSync(ffmpeg);
    return synchronizer;
}
467
/**
 * One-shot helper: synchronize a set of videos using a throwaway
 * AudioVideoSync instance.
 */
async function syncVideos(videos, options) {
    const synchronizer = new AudioVideoSync();
    return synchronizer.syncVideos(videos, options);
}
474
+
475
+ export { AudioVideoSync, audioBufferToFloat32, calculateConfidence, createSync, crossCorrelate, downsample, extractAudio, fft, findPeakOffset, ifft, nextPowerOfTwo, padToPowerOfTwo, preprocessAudio, syncVideos };