minicpmo-utils 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. cosyvoice/__init__.py +17 -0
  2. cosyvoice/bin/average_model.py +93 -0
  3. cosyvoice/bin/export_jit.py +103 -0
  4. cosyvoice/bin/export_onnx.py +120 -0
  5. cosyvoice/bin/inference_deprecated.py +126 -0
  6. cosyvoice/bin/train.py +195 -0
  7. cosyvoice/cli/__init__.py +0 -0
  8. cosyvoice/cli/cosyvoice.py +209 -0
  9. cosyvoice/cli/frontend.py +238 -0
  10. cosyvoice/cli/model.py +386 -0
  11. cosyvoice/dataset/__init__.py +0 -0
  12. cosyvoice/dataset/dataset.py +151 -0
  13. cosyvoice/dataset/processor.py +434 -0
  14. cosyvoice/flow/decoder.py +494 -0
  15. cosyvoice/flow/flow.py +281 -0
  16. cosyvoice/flow/flow_matching.py +227 -0
  17. cosyvoice/flow/length_regulator.py +70 -0
  18. cosyvoice/hifigan/discriminator.py +230 -0
  19. cosyvoice/hifigan/f0_predictor.py +58 -0
  20. cosyvoice/hifigan/generator.py +582 -0
  21. cosyvoice/hifigan/hifigan.py +67 -0
  22. cosyvoice/llm/llm.py +610 -0
  23. cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +58836 -0
  24. cosyvoice/tokenizer/tokenizer.py +279 -0
  25. cosyvoice/transformer/__init__.py +0 -0
  26. cosyvoice/transformer/activation.py +84 -0
  27. cosyvoice/transformer/attention.py +330 -0
  28. cosyvoice/transformer/convolution.py +145 -0
  29. cosyvoice/transformer/decoder.py +396 -0
  30. cosyvoice/transformer/decoder_layer.py +132 -0
  31. cosyvoice/transformer/embedding.py +302 -0
  32. cosyvoice/transformer/encoder.py +474 -0
  33. cosyvoice/transformer/encoder_layer.py +236 -0
  34. cosyvoice/transformer/label_smoothing_loss.py +96 -0
  35. cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  36. cosyvoice/transformer/subsampling.py +383 -0
  37. cosyvoice/transformer/upsample_encoder.py +320 -0
  38. cosyvoice/utils/__init__.py +0 -0
  39. cosyvoice/utils/class_utils.py +83 -0
  40. cosyvoice/utils/common.py +186 -0
  41. cosyvoice/utils/executor.py +176 -0
  42. cosyvoice/utils/file_utils.py +129 -0
  43. cosyvoice/utils/frontend_utils.py +136 -0
  44. cosyvoice/utils/losses.py +57 -0
  45. cosyvoice/utils/mask.py +265 -0
  46. cosyvoice/utils/scheduler.py +738 -0
  47. cosyvoice/utils/train_utils.py +367 -0
  48. cosyvoice/vllm/cosyvoice2.py +103 -0
  49. matcha/__init__.py +0 -0
  50. matcha/app.py +357 -0
  51. matcha/cli.py +418 -0
  52. matcha/hifigan/__init__.py +0 -0
  53. matcha/hifigan/config.py +28 -0
  54. matcha/hifigan/denoiser.py +64 -0
  55. matcha/hifigan/env.py +17 -0
  56. matcha/hifigan/meldataset.py +217 -0
  57. matcha/hifigan/models.py +368 -0
  58. matcha/hifigan/xutils.py +60 -0
  59. matcha/models/__init__.py +0 -0
  60. matcha/models/baselightningmodule.py +209 -0
  61. matcha/models/components/__init__.py +0 -0
  62. matcha/models/components/decoder.py +443 -0
  63. matcha/models/components/flow_matching.py +132 -0
  64. matcha/models/components/text_encoder.py +410 -0
  65. matcha/models/components/transformer.py +316 -0
  66. matcha/models/matcha_tts.py +239 -0
  67. matcha/onnx/__init__.py +0 -0
  68. matcha/onnx/export.py +181 -0
  69. matcha/onnx/infer.py +168 -0
  70. matcha/text/__init__.py +53 -0
  71. matcha/text/cleaners.py +116 -0
  72. matcha/text/numbers.py +71 -0
  73. matcha/text/symbols.py +17 -0
  74. matcha/train.py +122 -0
  75. matcha/utils/__init__.py +5 -0
  76. matcha/utils/audio.py +82 -0
  77. matcha/utils/generate_data_statistics.py +111 -0
  78. matcha/utils/instantiators.py +56 -0
  79. matcha/utils/logging_utils.py +53 -0
  80. matcha/utils/model.py +90 -0
  81. matcha/utils/monotonic_align/__init__.py +22 -0
  82. matcha/utils/monotonic_align/setup.py +7 -0
  83. matcha/utils/pylogger.py +21 -0
  84. matcha/utils/rich_utils.py +101 -0
  85. matcha/utils/utils.py +219 -0
  86. minicpmo/__init__.py +24 -0
  87. minicpmo/utils.py +636 -0
  88. minicpmo/version.py +2 -0
  89. minicpmo_utils-0.1.0.dist-info/METADATA +72 -0
  90. minicpmo_utils-0.1.0.dist-info/RECORD +148 -0
  91. minicpmo_utils-0.1.0.dist-info/WHEEL +5 -0
  92. minicpmo_utils-0.1.0.dist-info/top_level.txt +5 -0
  93. s3tokenizer/__init__.py +153 -0
  94. s3tokenizer/assets/BAC009S0764W0121.wav +0 -0
  95. s3tokenizer/assets/BAC009S0764W0122.wav +0 -0
  96. s3tokenizer/assets/mel_filters.npz +0 -0
  97. s3tokenizer/cli.py +183 -0
  98. s3tokenizer/model.py +546 -0
  99. s3tokenizer/model_v2.py +605 -0
  100. s3tokenizer/utils.py +390 -0
  101. stepaudio2/__init__.py +40 -0
  102. stepaudio2/cosyvoice2/__init__.py +1 -0
  103. stepaudio2/cosyvoice2/flow/__init__.py +0 -0
  104. stepaudio2/cosyvoice2/flow/decoder_dit.py +585 -0
  105. stepaudio2/cosyvoice2/flow/flow.py +230 -0
  106. stepaudio2/cosyvoice2/flow/flow_matching.py +205 -0
  107. stepaudio2/cosyvoice2/transformer/__init__.py +0 -0
  108. stepaudio2/cosyvoice2/transformer/attention.py +328 -0
  109. stepaudio2/cosyvoice2/transformer/embedding.py +119 -0
  110. stepaudio2/cosyvoice2/transformer/encoder_layer.py +163 -0
  111. stepaudio2/cosyvoice2/transformer/positionwise_feed_forward.py +56 -0
  112. stepaudio2/cosyvoice2/transformer/subsampling.py +79 -0
  113. stepaudio2/cosyvoice2/transformer/upsample_encoder_v2.py +483 -0
  114. stepaudio2/cosyvoice2/utils/__init__.py +1 -0
  115. stepaudio2/cosyvoice2/utils/class_utils.py +41 -0
  116. stepaudio2/cosyvoice2/utils/common.py +101 -0
  117. stepaudio2/cosyvoice2/utils/mask.py +49 -0
  118. stepaudio2/flashcosyvoice/__init__.py +0 -0
  119. stepaudio2/flashcosyvoice/cli.py +424 -0
  120. stepaudio2/flashcosyvoice/config.py +80 -0
  121. stepaudio2/flashcosyvoice/cosyvoice2.py +160 -0
  122. stepaudio2/flashcosyvoice/cosyvoice3.py +1 -0
  123. stepaudio2/flashcosyvoice/engine/__init__.py +0 -0
  124. stepaudio2/flashcosyvoice/engine/block_manager.py +114 -0
  125. stepaudio2/flashcosyvoice/engine/llm_engine.py +125 -0
  126. stepaudio2/flashcosyvoice/engine/model_runner.py +310 -0
  127. stepaudio2/flashcosyvoice/engine/scheduler.py +77 -0
  128. stepaudio2/flashcosyvoice/engine/sequence.py +90 -0
  129. stepaudio2/flashcosyvoice/modules/__init__.py +0 -0
  130. stepaudio2/flashcosyvoice/modules/flow.py +198 -0
  131. stepaudio2/flashcosyvoice/modules/flow_components/__init__.py +0 -0
  132. stepaudio2/flashcosyvoice/modules/flow_components/estimator.py +974 -0
  133. stepaudio2/flashcosyvoice/modules/flow_components/upsample_encoder.py +998 -0
  134. stepaudio2/flashcosyvoice/modules/hifigan.py +249 -0
  135. stepaudio2/flashcosyvoice/modules/hifigan_components/__init__.py +0 -0
  136. stepaudio2/flashcosyvoice/modules/hifigan_components/layers.py +433 -0
  137. stepaudio2/flashcosyvoice/modules/qwen2.py +92 -0
  138. stepaudio2/flashcosyvoice/modules/qwen2_components/__init__.py +0 -0
  139. stepaudio2/flashcosyvoice/modules/qwen2_components/layers.py +616 -0
  140. stepaudio2/flashcosyvoice/modules/sampler.py +231 -0
  141. stepaudio2/flashcosyvoice/utils/__init__.py +0 -0
  142. stepaudio2/flashcosyvoice/utils/audio.py +77 -0
  143. stepaudio2/flashcosyvoice/utils/context.py +28 -0
  144. stepaudio2/flashcosyvoice/utils/loader.py +116 -0
  145. stepaudio2/flashcosyvoice/utils/memory.py +19 -0
  146. stepaudio2/stepaudio2.py +204 -0
  147. stepaudio2/token2wav.py +248 -0
  148. stepaudio2/utils.py +91 -0
minicpmo/utils.py ADDED
@@ -0,0 +1,636 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # MiniCPM-o utils: 视频 / 音频处理等通用工具。
4
+ #
5
+ # 该模块设计为可以通过:
6
+ # from minicpmo.utils import ...
7
+ # 在外部项目中直接使用。
8
+
9
+ import base64
10
+ import logging
11
+ import math
12
+ import os
13
+ import subprocess
14
+ import tempfile
15
+ from io import BytesIO
16
+
17
+ import librosa
18
+ import numpy as np
19
+ from decord import cpu
20
+ from decord import VideoReader
21
+ from PIL import Image
22
+
23
logger = logging.getLogger(__name__)

# Upper bound on frames sampled from one video; overridable via env var.
MAX_NUM_FRAMES = int(os.getenv("MAX_NUM_FRAMES", 64))
# NOTE(review): not referenced in this module — presumably a Video-MME
# evaluation duration filter read by callers; confirm before removing.
VIDEO_MME_DURATION = os.getenv("VIDEO_MME_DURATION", "ALL")
27
+
28
+
29
def concat_images(images, bg_color=(255, 255, 255), cell_size=None, line_color=(0, 0, 0), line_width=6):
    """Concatenate images into one grid canvas.

    Layout rules: 4 images -> 2x2; 2 or 3 images -> whichever 1xN / Nx1
    orientation yields the canvas closest to square; any other count -> 1xN.
    Separator lines are drawn only at inner joints (no outer border).

    Args:
        images: list of PIL.Image.Image, raw bytes, or base64 strings
            (a ``data:...;base64,`` prefix is stripped if present).
        bg_color: RGB fill used when letterboxing a cell.
        cell_size: optional (width, height) per grid cell; defaults to the
            max width/height over all inputs.
        line_color: RGB color of the separator lines.
        line_width: separator thickness in pixels.

    Returns:
        A single RGB PIL.Image.Image.

    Raises:
        ValueError: if ``images`` is empty.
        TypeError: on an unsupported element type.
    """
    _converted_images = []
    for im in images:
        if isinstance(im, Image.Image):
            _converted_images.append(im)
        elif isinstance(im, (bytes, bytearray)):
            _converted_images.append(Image.open(BytesIO(im)).convert("RGB"))
        elif isinstance(im, str):
            # Accept both bare base64 and data-URI style strings.
            b64 = im.split(",")[-1] if ";base64," in im else im
            img_bytes = base64.b64decode(b64)
            _converted_images.append(Image.open(BytesIO(img_bytes)).convert("RGB"))
        else:
            raise TypeError(f"Unsupported image type: {type(im)}")
    images = _converted_images

    n = len(images)
    if n == 0:
        raise ValueError("images is empty")

    # Cell size is needed both for layout selection and for rendering, so
    # compute it exactly once (it was previously duplicated per branch).
    if cell_size is None:
        cell_w = max(im.width for im in images)
        cell_h = max(im.height for im in images)
    else:
        cell_w, cell_h = cell_size

    def canvas_ratio(r, c):
        # Aspect ratio (W/H) of an r x c grid including separator lines.
        W = c * cell_w + (c - 1) * line_width
        H = r * cell_h + (r - 1) * line_width
        return W / max(1, H)

    if n == 4:
        rows, cols = 2, 2
    elif n == 3:
        # Pick 1x3 vs 3x1 so the final canvas is closest to square.
        candidates = [(1, 3), (3, 1)]
        ratios = [abs(canvas_ratio(r, c) - 1.0) for (r, c) in candidates]
        rows, cols = candidates[int(np.argmin(ratios))]
    elif n == 1:
        rows, cols = 1, 1
    elif n == 2:
        # Pick 1x2 vs 2x1 so the final canvas is closest to square; on a
        # tie, fall back to the images' average aspect ratio.
        candidates = [(1, 2), (2, 1)]
        ratios = [abs(canvas_ratio(r, c) - 1.0) for (r, c) in candidates]
        if ratios[0] == ratios[1]:
            avg_ar = np.mean([im.width / max(1, im.height) for im in images])
            rows, cols = (1, 2) if avg_ar >= 1.0 else (2, 1)
        else:
            rows, cols = candidates[int(np.argmin(ratios))]
    else:
        rows, cols = 1, n

    def letterbox(im, tw, th):
        # Scale to fit (tw, th) preserving aspect ratio; pad with bg_color.
        im = im.convert("RGB")
        w, h = im.size
        s = min(tw / w, th / h)
        nw, nh = max(1, int(round(w * s))), max(1, int(round(h * s)))
        try:
            im_r = im.resize((nw, nh), Image.Resampling.BICUBIC)
        except AttributeError:
            # Pillow < 9.1 has no Image.Resampling enum.
            im_r = im.resize((nw, nh), Image.BICUBIC)
        canvas = Image.new("RGB", (tw, th), bg_color)
        canvas.paste(im_r, ((tw - nw) // 2, (th - nh) // 2))
        return canvas

    # Fill the whole canvas with line_color, then paste cells over it so
    # only the inner joints remain visible as separator lines.
    W = cols * cell_w + (cols - 1) * line_width
    H = rows * cell_h + (rows - 1) * line_width
    canvas = Image.new("RGB", (W, H), line_color)

    for i, im in enumerate(images[: rows * cols]):
        r, c = divmod(i, cols)
        cell = letterbox(im, cell_w, cell_h)
        x = c * (cell_w + line_width)
        y = r * (cell_h + line_width)
        canvas.paste(cell, (x, y))

    return canvas
130
+
131
+
132
def uniform_sample(l, n):
    """Evenly sample at most ``n`` items from sequence ``l``.

    If ``l`` already has ``n`` or fewer items it is returned unchanged;
    otherwise a new list of ``n`` evenly spaced elements is returned.
    """
    if n >= len(l):
        return l
    positions = np.linspace(0, len(l) - 1, n, dtype=int)
    return [l[p] for p in positions]
137
+
138
+
139
def get_audio_segments(
    timestamps, duration, video_path, audio_path=None, sr=16000, adjust_length=False, use_ffmpeg=False
):
    """Split an audio track into one segment per timestamp.

    Audio is loaded from ``audio_path`` if given, otherwise extracted from
    ``video_path`` (via the ffmpeg CLI when ``use_ffmpeg`` is True, else via
    librosa with a moviepy fallback), always resampled to ``sr`` mono.

    Args:
        timestamps: segment start times in seconds; the last segment runs
            until ``duration``.
        duration: end time (seconds) of the final segment.
        video_path: video to extract audio from when ``audio_path`` is None.
        audio_path: optional standalone audio file to load instead.
        sr: target sample rate.
        adjust_length: if True, pad/trim the audio to exactly
            ``len(timestamps)`` seconds and cut fixed 1-second windows
            (assumes timestamps are consecutive whole seconds).
        use_ffmpeg: extract audio with the ffmpeg CLI instead of librosa.

    Returns:
        List of 1-D numpy arrays, one per timestamp.
    """
    # NOTE: ``subprocess`` is already imported at module level; the previous
    # redundant local import was removed.
    import warnings

    if audio_path is None:
        if use_ffmpeg:
            # Create (and immediately close) a named temp file, then let
            # ffmpeg overwrite it; delete=False so we control cleanup.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
                temp_audio_path = temp_audio_file.name
            try:
                cmd = ["ffmpeg", "-y", "-i", video_path, "-vn", "-ac", "1", "-ar", str(sr), temp_audio_path]
                subprocess.run(cmd, check=True, capture_output=True)
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", message="PySoundFile failed")
                    audio_np, sr = librosa.load(temp_audio_path, sr=sr, mono=True)
            finally:
                if os.path.exists(temp_audio_path):
                    os.remove(temp_audio_path)
        else:
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", message="PySoundFile failed")
                    audio_np, sr = librosa.load(video_path, sr=sr, mono=True)
            except Exception:
                # librosa cannot decode some containers; fall back to moviepy.
                try:
                    from moviepy import VideoFileClip  # moviepy >= 2.0
                except ImportError:
                    from moviepy.editor import VideoFileClip  # moviepy < 2.0

                video_clip = VideoFileClip(video_path)
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
                    temp_audio_file_path = temp_audio_file.name
                    video_clip.audio.write_audiofile(temp_audio_file_path, codec="pcm_s16le", fps=sr)
                    with warnings.catch_warnings():
                        warnings.filterwarnings("ignore", message="PySoundFile failed")
                        audio_np, sr = librosa.load(temp_audio_file_path, sr=sr, mono=True)
    else:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="PySoundFile failed")
            audio_np, sr = librosa.load(audio_path, sr=sr, mono=True)

    if adjust_length:
        # Force exactly one second of audio per timestamp, then cut
        # fixed-size 1-second windows.
        num_frames = len(timestamps)
        target_length = num_frames * sr
        current_length = len(audio_np)
        if current_length < target_length:
            padding = np.zeros(target_length - current_length, dtype=audio_np.dtype)
            audio_np = np.concatenate([audio_np, padding])
        elif current_length > target_length:
            audio_np = audio_np[:target_length]

        audio_segments = []
        for i in range(len(timestamps)):
            start_sample = i * sr
            end_sample = (i + 1) * sr
            audio_segments.append(audio_np[start_sample:end_sample])
    else:
        audio_segments = []
        for i in range(len(timestamps)):
            start_time = timestamps[i]
            if i < len(timestamps) - 1:
                end_time = timestamps[i + 1]
            else:
                end_time = duration

            start_sample = int(start_time * sr)
            end_sample = int(end_time * sr)
            segment = audio_np[start_sample:end_sample]

            # Pad a too-short tail segment up to 1600 samples (0.1 s @ 16 kHz).
            if i == len(timestamps) - 1 and len(segment) < 1600:
                segment = np.concatenate([segment, np.zeros(1600 - len(segment), dtype=segment.dtype)])
            audio_segments.append(segment)

    return audio_segments
218
+
219
+
220
def get_video_duration(video_path: str) -> float:
    """Return the duration of *video_path* in seconds, as reported by ffprobe.

    Raises subprocess.CalledProcessError if ffprobe exits non-zero.
    """
    probe = subprocess.run(
        [
            "ffprobe",
            "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            video_path,
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    return float(probe.stdout.strip())
233
+
234
+
235
def get_video_frame_audio_segments(
    video_path, audio_path=None, last_vad_timestamp=None, stack_frames=1, use_ffmpeg=False, adjust_audio_length=False
):
    """Extract video frames and their matching audio segments in one pass.

    Two backends are supported and selected by ``use_ffmpeg``: the ffmpeg
    CLI (frames dumped to a temp directory) or decord's VideoReader. Both
    are wrapped behind the same four local closures so the main logic below
    the if/else is backend-agnostic.

    Args:
        video_path: path of the video to process.
        audio_path: optional standalone audio file passed through to
            get_audio_segments.
        last_vad_timestamp: if set, truncates the effective duration
            (presumably the end of detected speech — TODO confirm with caller).
        stack_frames: when > 1, additionally extract ``stack_frames`` frames
            per second and concatenate the sub-second ones into one image
            per second.
        use_ffmpeg: choose the ffmpeg CLI backend instead of decord.
        adjust_audio_length: forwarded as ``adjust_length`` to
            get_audio_segments.

    Returns:
        Tuple of:
        - video_segments: List[PIL.Image]
        - audio_segments: List[np.ndarray]
        - stacked_video_segments: List[PIL.Image | None] or None
          (None unless ``stack_frames > 1``).
    """

    if use_ffmpeg:
        _duration = get_video_duration(video_path)
        # Temp dir holds the dumped JPEG frames; cleaned up in _cleanup().
        _temp_dir = tempfile.TemporaryDirectory()
        _temp_dir_path = _temp_dir.name

        def _get_duration_and_fps():
            # ffmpeg backend never needs the fps, hence None.
            return _duration, None

        def _extract_frames_by_timestamps(timestamps, is_long_video):
            # Dump frames at a fixed rate, then (for long videos) subsample
            # down to MAX_NUM_FRAMES.
            frames_dir = os.path.join(_temp_dir_path, "frames_1fps")
            os.makedirs(frames_dir, exist_ok=True)

            if is_long_video:
                # 10 fps gives a 0.1 s granularity before subsampling.
                fps_to_extract = 10
            else:
                fps_to_extract = 1

            frame_cmd = [
                "ffmpeg",
                "-y",
                "-i",
                video_path,
                "-vf",
                f"fps={fps_to_extract}",
                os.path.join(frames_dir, "frame_%06d.jpg"),
            ]
            subprocess.run(frame_cmd, capture_output=True, check=True)

            frame_files = sorted([f for f in os.listdir(frames_dir) if f.endswith(".jpg")])

            if is_long_video:
                total_frames = len(frame_files)
                sampled_indices = uniform_sample(list(range(total_frames)), MAX_NUM_FRAMES)
                # Frame index i was captured at roughly i / fps seconds.
                new_timestamps = [round(i / fps_to_extract, 1) for i in sampled_indices]
                frames = []
                for idx in sampled_indices:
                    frame_path = os.path.join(frames_dir, frame_files[idx])
                    frames.append(Image.open(frame_path).convert("RGB"))
                return frames, new_timestamps
            else:
                # 1 fps: frame i corresponds to second i.
                new_timestamps = list(range(len(frame_files)))
                frames = []
                for f in frame_files:
                    frame_path = os.path.join(frames_dir, f)
                    frames.append(Image.open(frame_path).convert("RGB"))
                return frames, new_timestamps

        def _extract_stack_frames(all_frame_timestamps, duration, num_seconds):
            # Dump frames at stack_frames fps and keep only the sub-second
            # ones (whole-second frames are covered by the 1 fps pass).
            stack_frames_dir = os.path.join(_temp_dir_path, "frames_stack")
            os.makedirs(stack_frames_dir, exist_ok=True)

            frame_cmd = [
                "ffmpeg",
                "-y",
                "-i",
                video_path,
                "-vf",
                f"fps={stack_frames}",
                os.path.join(stack_frames_dir, "frame_%06d.jpg"),
            ]
            subprocess.run(frame_cmd, capture_output=True, check=True)

            stack_frame_files = sorted([f for f in os.listdir(stack_frames_dir) if f.endswith(".jpg")])

            new_timestamps = []
            valid_frame_indices = []
            for i, f in enumerate(stack_frame_files):
                # i % stack_frames == 0 lands on a whole second — skip it.
                # NOTE(review): assumes frame i sits at i / stack_frames
                # seconds; ffmpeg's fps filter timing is approximate.
                if i % stack_frames != 0:
                    ts = i / stack_frames
                    if ts < duration:
                        new_timestamps.append(ts)
                        valid_frame_indices.append(i)

            # Cap total sub-second frames, preserving (index, ts) pairing.
            max_stack_frames_count = MAX_NUM_FRAMES * (stack_frames - 1)
            if len(valid_frame_indices) > max_stack_frames_count:
                sampled = uniform_sample(list(zip(valid_frame_indices, new_timestamps)), max_stack_frames_count)
                valid_frame_indices = [x[0] for x in sampled]
                new_timestamps = [x[1] for x in sampled]

            frames = []
            for idx in valid_frame_indices:
                frame_path = os.path.join(stack_frames_dir, stack_frame_files[idx])
                frames.append(Image.open(frame_path).convert("RGB"))

            return frames, new_timestamps

        def _cleanup():
            _temp_dir.cleanup()

    else:
        # decord backend: random access by frame index, no temp files.
        _vr = VideoReader(str(video_path), ctx=cpu(0))
        _avg_fps = _vr.get_avg_fps()
        _duration = len(_vr) / _avg_fps

        def _get_duration_and_fps():
            return _duration, _avg_fps

        def _extract_frames_by_timestamps(timestamps, is_long_video):
            if is_long_video:
                # Clamp to the last frame to avoid indexing past the end.
                frame_idx = [min(int(ts * _avg_fps), len(_vr) - 1) for ts in timestamps]
                frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
                new_timestamps = uniform_sample(timestamps, MAX_NUM_FRAMES)
            else:
                num_seconds = len(timestamps)
                frame_idx = [int(i * _avg_fps) for i in range(num_seconds)]
                new_timestamps = timestamps

            video = _vr.get_batch(frame_idx).asnumpy()
            frames = [Image.fromarray(v.astype("uint8")).convert("RGB") for v in video]
            return frames, new_timestamps

        def _extract_stack_frames(all_frame_timestamps, duration, num_seconds):
            stack_frame_idx = [min(int(ts * _avg_fps), len(_vr) - 1) for ts in all_frame_timestamps]

            # Cap total sub-second frames to MAX_NUM_FRAMES * (stack_frames - 1).
            max_stack_frames_count = MAX_NUM_FRAMES * (stack_frames - 1)
            if len(stack_frame_idx) > max_stack_frames_count:
                stack_frame_idx = uniform_sample(stack_frame_idx, max_stack_frames_count)
                new_timestamps = uniform_sample(all_frame_timestamps, max_stack_frames_count)
            else:
                new_timestamps = all_frame_timestamps

            stack_video = _vr.get_batch(stack_frame_idx).asnumpy()
            frames = [Image.fromarray(v.astype("uint8")).convert("RGB") for v in stack_video]
            return frames, new_timestamps

        def _cleanup():
            # Nothing to clean: decord holds no temp files.
            pass

    try:
        duration, avg_fps = _get_duration_and_fps()
        if last_vad_timestamp is not None:
            duration = last_vad_timestamp

        num_seconds = math.ceil(duration)
        second_timestamps = list(range(num_seconds))

        # Long videos are sampled on a 0.1 s grid, then subsampled to
        # MAX_NUM_FRAMES inside _extract_frames_by_timestamps.
        is_long_video = duration > MAX_NUM_FRAMES
        if is_long_video:
            timestamps = [round(i * 0.1, 1) for i in range(int(duration / 0.1))]
        else:
            timestamps = second_timestamps

        video_segments, timestamps = _extract_frames_by_timestamps(timestamps, is_long_video)

        stacked_video_segments = None
        if stack_frames > 1:
            # Sub-second timestamps: stack_frames - 1 per second, strictly
            # inside the duration.
            all_frame_timestamps = []
            for sec in range(num_seconds):
                for i in range(1, stack_frames):
                    ts = sec + i / stack_frames
                    if ts < duration:
                        all_frame_timestamps.append(ts)

            all_frames, all_frame_timestamps = _extract_stack_frames(all_frame_timestamps, duration, num_seconds)

            # Group extracted frames by the second they belong to and
            # concatenate each group into a single image.
            stacked_video_segments = []
            frame_cursor = 0
            for sec in range(num_seconds):
                frames_this_second = []
                while frame_cursor < len(all_frame_timestamps) and all_frame_timestamps[frame_cursor] < sec + 1:
                    frames_this_second.append(all_frames[frame_cursor])
                    frame_cursor += 1

                if len(frames_this_second) > 0:
                    stacked_frame = concat_images(frames_this_second)
                    stacked_video_segments.append(stacked_frame)
                else:
                    stacked_video_segments.append(None)

        audio_segments = get_audio_segments(
            timestamps, duration, video_path, audio_path, adjust_length=adjust_audio_length, use_ffmpeg=use_ffmpeg
        )

        return video_segments, audio_segments, stacked_video_segments

    finally:
        _cleanup()
422
+
423
+
424
def format_srt_time(seconds: float) -> str:
    """Convert seconds to the SRT timestamp format ``HH:MM:SS,mmm``.

    Rounds to the nearest millisecond once, then decomposes, so the fields
    stay mutually consistent. The previous per-field truncation lost
    milliseconds to float error (e.g. 3.67 -> "00:00:03,669").
    """
    total_ms = int(round(seconds * 1000))
    secs, millis = divmod(total_ms, 1000)
    minutes, secs = divmod(secs, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
431
+
432
+
433
def generate_srt_from_results(results_log: list, video_duration: float, output_srt_path: str) -> int:
    """Write an SRT subtitle file from inference results.

    Each spoken (non-listen) chunk becomes a one-second subtitle starting
    at ``chunk_idx + 1`` seconds, clamped to ``video_duration``. Control
    tokens are stripped from the text first.

    Returns:
        The number of subtitles written.
    """
    special_tokens = ["<|tts_pad|>", "<|turn_eos|>", "<|chunk_eos|>", "<|listen|>", "<|speak|>"]

    # Phase 1: collect (start, end, text) entries that survive filtering.
    entries = []
    for result in results_log:
        chunk_idx = result["chunk_idx"]
        raw_text = result.get("text", "")

        if not raw_text or result.get("is_listen", True):
            continue

        cleaned = raw_text
        for token in special_tokens:
            cleaned = cleaned.replace(token, "")
        cleaned = cleaned.strip()
        if not cleaned:
            continue

        start_time = chunk_idx + 1
        if start_time >= video_duration:
            continue
        end_time = min(chunk_idx + 2, video_duration)
        entries.append((start_time, end_time, cleaned))

    # Phase 2: render the SRT blocks.
    srt_lines = []
    for index, (start_time, end_time, text) in enumerate(entries, start=1):
        srt_lines.append(f"{index}")
        srt_lines.append(f"{format_srt_time(start_time)} --> {format_srt_time(end_time)}")
        srt_lines.append(text)
        srt_lines.append("")

    with open(output_srt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(srt_lines))

    return len(entries)
480
+
481
+
482
def generate_ai_audio_file(
    timed_output_audio: list,
    video_duration: float,
    output_sample_rate: int,
) -> str:
    """Mix timed AI audio chunks onto a silent track and write a WAV file.

    Args:
        timed_output_audio: list of ``(chunk_idx, audio)`` pairs; each chunk
            is placed at ``chunk_idx + 1`` seconds on the output track.
        video_duration: minimum track length in seconds.
        output_sample_rate: sample rate of both the chunks and the output.

    Returns:
        Path to a temporary 16-bit PCM WAV file; the caller is responsible
        for deleting it.
    """
    import soundfile as sf

    # The track must cover both the full video and the tail of the last chunk.
    max_end_time = 0
    for chunk_idx, audio in timed_output_audio:
        start_time = chunk_idx + 1
        duration = len(audio) / output_sample_rate
        end_time = start_time + duration
        max_end_time = max(max_end_time, end_time)

    total_duration = max(video_duration, max_end_time)
    total_samples = int(total_duration * output_sample_rate)
    ai_audio_track = np.zeros(total_samples, dtype=np.float32)

    for chunk_idx, audio in timed_output_audio:
        start_time = chunk_idx + 1
        start_sample = int(start_time * output_sample_rate)
        end_sample = start_sample + len(audio)

        if end_sample <= len(ai_audio_track):
            ai_audio_track[start_sample:end_sample] += audio
        else:
            # Chunk overruns the track end: mix in as much as fits.
            available_len = len(ai_audio_track) - start_sample
            if available_len > 0:
                ai_audio_track[start_sample:] += audio[:available_len]

    ai_audio_track = np.clip(ai_audio_track, -1.0, 1.0)

    ai_audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
    sf.write(
        ai_audio_path,
        # Scale by 32767, not 32768: after clipping, +1.0 * 32768 overflows
        # int16 and wraps to -32768, producing a click at full scale.
        (ai_audio_track * 32767).astype(np.int16),
        output_sample_rate,
        subtype="PCM_16",
    )

    return ai_audio_path
523
+
524
+
525
def generate_duplex_video(
    video_path: str,
    output_video_path: str,
    results_log: list,
    timed_output_audio: list,
    output_sample_rate: int = 24000,
):
    """Compose, via ffmpeg, a video with the AI reply audio mixed in and
    subtitles burned in.

    Builds an SRT file from ``results_log``, a WAV file from
    ``timed_output_audio``, probes the source video for an audio stream,
    then assembles the matching ffmpeg filter graph. Both intermediate
    files are removed afterwards.

    Args:
        video_path: source video.
        output_video_path: destination path for the rendered video.
        results_log: inference results consumed by generate_srt_from_results.
        timed_output_audio: (chunk_idx, audio) pairs consumed by
            generate_ai_audio_file.
        output_sample_rate: sample rate of the AI audio chunks.

    Returns:
        ``output_video_path``.

    Raises:
        subprocess.CalledProcessError: if the final ffmpeg invocation fails.
    """
    import soundfile as sf

    try:
        video_duration = get_video_duration(video_path)
    except Exception as e:
        # Best-effort fallback so subtitle clamping still works.
        video_duration = 60.0
        logger.warning(f" ffprobe duration failed: {e}, using 60s default")

    output_dir = os.path.dirname(output_video_path)
    srt_path = os.path.join(output_dir, "subtitles.srt")
    subtitle_count = generate_srt_from_results(results_log, video_duration, srt_path)

    ai_audio_path = None
    if timed_output_audio:
        ai_audio_path = generate_ai_audio_file(timed_output_audio, video_duration, output_sample_rate)

    # Probe for an audio stream in the source; determines the filter graph.
    has_original_audio = False
    try:
        probe_audio_cmd = [
            "ffprobe",
            "-v",
            "error",
            "-select_streams",
            "a:0",
            "-show_entries",
            "stream=codec_type",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            video_path,
        ]
        result = subprocess.run(probe_audio_cmd, capture_output=True, text=True)
        has_original_audio = result.stdout.strip() == "audio"
    except Exception:
        pass

    has_subtitles = subtitle_count > 0 and os.path.exists(srt_path)

    if has_subtitles:
        # Escape the SRT path for use inside a quoted ffmpeg filter argument.
        srt_path_escaped = srt_path.replace("\\", "\\\\").replace("'", "'\\''").replace(":", "\\:")
        subtitle_filter = (
            f"subtitles='{srt_path_escaped}':"
            f"force_style='FontSize=28,"
            f"PrimaryColour=&H00FFFFFF,"
            f"OutlineColour=&H00000000,"
            f"BorderStyle=3,"
            f"Outline=2,"
            f"Shadow=1,"
            f"MarginV=30,"
            f"Alignment=2'"
        )

    cmd = ["ffmpeg", "-y", "-i", video_path]

    if ai_audio_path:
        # Input 1 is the generated AI audio track.
        cmd.extend(["-i", ai_audio_path])

        if has_original_audio:
            if has_subtitles:
                # Burn subtitles and mix original audio with the AI track.
                filter_complex = f"[0:v]{subtitle_filter}[vout];[0:a][1:a]amix=inputs=2:duration=longest[aout]"
                cmd.extend(["-filter_complex", filter_complex, "-map", "[vout]", "-map", "[aout]"])
            else:
                filter_complex = f"[0:a][1:a]amix=inputs=2:duration=longest[aout]"
                cmd.extend(["-filter_complex", filter_complex, "-map", "0:v", "-map", "[aout]"])
        else:
            if has_subtitles:
                # No source audio: AI track becomes the only audio stream.
                filter_complex = f"[0:v]{subtitle_filter}[vout]"
                cmd.extend(["-filter_complex", filter_complex, "-map", "[vout]", "-map", "1:a"])
            else:
                cmd.extend(["-map", "0:v", "-map", "1:a"])
    else:
        # No AI audio: only burn subtitles (if any) and keep source audio.
        if has_subtitles:
            cmd.extend(["-vf", subtitle_filter])
        if has_original_audio:
            cmd.extend(["-c:a", "copy"])

    cmd.extend(["-c:v", "libx264", "-c:a", "aac", "-preset", "medium", "-crf", "23", output_video_path])

    try:
        _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        # Re-raise after the finally-block removes the temp files.
        raise
    finally:
        if os.path.exists(srt_path):
            os.remove(srt_path)
        if ai_audio_path and os.path.exists(ai_audio_path):
            os.remove(ai_audio_path)

    return output_video_path
623
+
624
+
625
# Public API of minicpmo.utils, importable as ``from minicpmo.utils import ...``.
__all__ = [
    "concat_images",
    "uniform_sample",
    "get_audio_segments",
    "get_video_duration",
    "get_video_frame_audio_segments",
    "format_srt_time",
    "generate_srt_from_results",
    "generate_ai_audio_file",
    "generate_duplex_video",
]
636
+
minicpmo/version.py ADDED
@@ -0,0 +1,2 @@
1
# Package version string; matches the wheel metadata (minicpmo-utils 0.1.0).
__version__ = "0.1.0"
2
+