groq-whisper-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+ tmp.*
12
+ tmp/
13
+ presentations/
14
+
15
+ # Playwright
16
+ node_modules/
17
+ /test-results/
18
+ /playwright-report/
19
+ /blob-report/
20
+ /playwright/.cache/
21
+ /playwright/.auth/
22
+ work_product/
23
+ .env
24
+ TODO
25
+ archive/
26
+ docs/
27
+ key/
28
+ backup_archives.ps1
29
+ .codex/
30
+ .opencode/
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.4
2
+ Name: groq-whisper-cli
3
+ Version: 0.1.0
4
+ Summary: Groq Whisper 多 Key 并发语音转写 CLI 工具
5
+ Author: Your Name
6
+ License-Expression: MIT
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Environment :: Console
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
11
+ Requires-Python: >=3.12
12
+ Requires-Dist: httpx>=0.27
13
+ Requires-Dist: numpy>=2.0
14
+ Requires-Dist: onnxruntime>=1.19
15
+ Requires-Dist: silero-vad>=5.1
16
+ Description-Content-Type: text/markdown
17
+
18
+ # groq-whisper-cli
19
+
20
+ Groq Whisper 多 Key 并发语音转写 CLI 工具。
21
+
22
+ ## 功能
23
+
24
+ - silero-vad 本地 VAD 智能切片
25
+ - 多 API Key 配额感知负载均衡
26
+ - 429 自动退避 + Key 切换
27
+ - 异步并发协程池
28
+ - 输出 SRT 字幕 + TXT 文本
29
+
30
+ ## 安装
31
+
32
+ ```bash
33
+ # 直接通过 uvx 运行(无需安装)
34
+ uvx groq-whisper-cli audio.mp3
35
+
36
+ # 或安装到本地
37
+ uv tool install groq-whisper-cli
38
+ groq-whisper-cli audio.mp3
39
+ ```
40
+
41
+ ## 前置要求
42
+
43
+ - Python 3.12+
44
+ - ffmpeg(用于音频提取)
45
+
46
+ ## API Key 配置
47
+
48
+ 按优先级查找:
49
+
50
+ 1. `--key-file` / `-k` 参数指定文件(每行一个 key)
51
+ 2. 当前目录下的 `key.txt`
52
+ 3. 环境变量 `GROQ_API_KEY`(多个 key 用逗号分隔)
53
+
54
+ ```bash
55
+ # 方式一:环境变量
56
+ export GROQ_API_KEY="gsk_xxx,gsk_yyy"
57
+
58
+ # 方式二:key 文件
59
+ echo "gsk_xxx" > key.txt
60
+ echo "gsk_yyy" >> key.txt
61
+
62
+ # 方式三:命令行参数
63
+ groq-whisper-cli audio.mp3 -k /path/to/key.txt
64
+ ```
65
+
66
+ ## 用法
67
+
68
+ ```bash
69
+ # 基本用法
70
+ groq-whisper-cli audio.mp3
71
+
72
+ # 指定语言和输出目录
73
+ groq-whisper-cli video.mp4 -l en -o ./output
74
+
75
+ # 多文件处理
76
+ groq-whisper-cli *.mp4 -w 8
77
+
78
+ # 完整参数
79
+ groq-whisper-cli audio.mp3 \
80
+ -k key.txt \
81
+ -o ./transcript \
82
+ -w 4 \
83
+ --chunk-sec 120 \
84
+ -l zh \
85
+ --retries 8
86
+ ```
87
+
88
+ ## 参数
89
+
90
+ | 参数 | 缩写 | 默认值 | 说明 |
91
+ |------|------|--------|------|
92
+ | `input` | - | 必填 | 输入文件路径,支持通配符 |
93
+ | `--key-file` | `-k` | - | API Key 文件路径 |
94
+ | `--out` | `-o` | `./transcript` | 输出目录 |
95
+ | `--workers` | `-w` | `4` | 并发 worker 数 |
96
+ | `--chunk-sec` | - | `120` | VAD 切片最大时长(秒) |
97
+ | `--lang` | `-l` | `zh` | ISO-639-1 语言代码 |
98
+ | `--retries` | - | `8` | 每个分片最大重试次数 |
99
+
100
+ ## 作为库使用
101
+
102
+ ```python
103
+ import asyncio
+
+ from groq_whisper import GroqKeyManager, AudioSplitter, GroqTranscriber
104
+
105
+ key_manager = GroqKeyManager(["gsk_xxx", "gsk_yyy"])
106
+ splitter = AudioSplitter(max_chunk_sec=120)
107
+ chunks = splitter.split("audio.mp3")
108
+
109
+ transcriber = GroqTranscriber(key_manager=key_manager, language="zh")
110
+ results = asyncio.run(transcriber.transcribe_all(chunks))
111
+ ```
112
+
113
+ ## License
114
+
115
+ MIT
@@ -0,0 +1,98 @@
1
+ # groq-whisper-cli
2
+
3
+ Groq Whisper 多 Key 并发语音转写 CLI 工具。
4
+
5
+ ## 功能
6
+
7
+ - silero-vad 本地 VAD 智能切片
8
+ - 多 API Key 配额感知负载均衡
9
+ - 429 自动退避 + Key 切换
10
+ - 异步并发协程池
11
+ - 输出 SRT 字幕 + TXT 文本
12
+
13
+ ## 安装
14
+
15
+ ```bash
16
+ # 直接通过 uvx 运行(无需安装)
17
+ uvx groq-whisper-cli audio.mp3
18
+
19
+ # 或安装到本地
20
+ uv tool install groq-whisper-cli
21
+ groq-whisper-cli audio.mp3
22
+ ```
23
+
24
+ ## 前置要求
25
+
26
+ - Python 3.12+
27
+ - ffmpeg(用于音频提取)
28
+
29
+ ## API Key 配置
30
+
31
+ 按优先级查找:
32
+
33
+ 1. `--key-file` / `-k` 参数指定文件(每行一个 key)
34
+ 2. 当前目录下的 `key.txt`
35
+ 3. 环境变量 `GROQ_API_KEY`(多个 key 用逗号分隔)
36
+
37
+ ```bash
38
+ # 方式一:环境变量
39
+ export GROQ_API_KEY="gsk_xxx,gsk_yyy"
40
+
41
+ # 方式二:key 文件
42
+ echo "gsk_xxx" > key.txt
43
+ echo "gsk_yyy" >> key.txt
44
+
45
+ # 方式三:命令行参数
46
+ groq-whisper-cli audio.mp3 -k /path/to/key.txt
47
+ ```
48
+
49
+ ## 用法
50
+
51
+ ```bash
52
+ # 基本用法
53
+ groq-whisper-cli audio.mp3
54
+
55
+ # 指定语言和输出目录
56
+ groq-whisper-cli video.mp4 -l en -o ./output
57
+
58
+ # 多文件处理
59
+ groq-whisper-cli *.mp4 -w 8
60
+
61
+ # 完整参数
62
+ groq-whisper-cli audio.mp3 \
63
+ -k key.txt \
64
+ -o ./transcript \
65
+ -w 4 \
66
+ --chunk-sec 120 \
67
+ -l zh \
68
+ --retries 8
69
+ ```
70
+
71
+ ## 参数
72
+
73
+ | 参数 | 缩写 | 默认值 | 说明 |
74
+ |------|------|--------|------|
75
+ | `input` | - | 必填 | 输入文件路径,支持通配符 |
76
+ | `--key-file` | `-k` | - | API Key 文件路径 |
77
+ | `--out` | `-o` | `./transcript` | 输出目录 |
78
+ | `--workers` | `-w` | `4` | 并发 worker 数 |
79
+ | `--chunk-sec` | - | `120` | VAD 切片最大时长(秒) |
80
+ | `--lang` | `-l` | `zh` | ISO-639-1 语言代码 |
81
+ | `--retries` | - | `8` | 每个分片最大重试次数 |
82
+
83
+ ## 作为库使用
84
+
85
+ ```python
86
+ import asyncio
+
+ from groq_whisper import GroqKeyManager, AudioSplitter, GroqTranscriber
87
+
88
+ key_manager = GroqKeyManager(["gsk_xxx", "gsk_yyy"])
89
+ splitter = AudioSplitter(max_chunk_sec=120)
90
+ chunks = splitter.split("audio.mp3")
91
+
92
+ transcriber = GroqTranscriber(key_manager=key_manager, language="zh")
93
+ results = asyncio.run(transcriber.transcribe_all(chunks))
94
+ ```
95
+
96
+ ## License
97
+
98
+ MIT
@@ -0,0 +1,6 @@
1
+ from .key_manager import GroqKeyManager
2
+ from .audio_splitter import AudioSplitter
3
+ from .transcriber import GroqTranscriber
4
+ from .utils import wav_bytes
5
+
6
+ __all__ = ["GroqKeyManager", "AudioSplitter", "GroqTranscriber", "wav_bytes"]
@@ -0,0 +1,232 @@
1
+ """
2
+ Groq Whisper 多 Key 并发语音转写 CLI 工具
3
+
4
+ 支持 silero-vad 本地 VAD 切片、多 key 配额感知负载均衡、
5
+ 429 自动退避 + key 切换、并发协程池,输出 SRT + TXT。
6
+ """
7
+
8
+ import argparse
9
+ import asyncio
10
+ import os
11
+ import sys
12
+ import time
13
+ from pathlib import Path
14
+
15
+ from . import GroqKeyManager, AudioSplitter, GroqTranscriber
16
+
17
+
18
def _fmt_srt_time(sec: float) -> str:
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds *before* it is split into
    fields, so a value such as 59.9996 carries into the minutes field
    (``00:01:00,000``) instead of producing an invalid ``00:00:60,000``.
    Negative offsets are clamped to zero because SRT has no notation for them.
    """
    total_ms = max(0, int(round(float(sec) * 1000)))
    hours, rest_ms = divmod(total_ms, 3_600_000)
    minutes, rest_ms = divmod(rest_ms, 60_000)
    seconds, millis = divmod(rest_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
23
+
24
+
25
def write_outputs(results: list[dict], video_path: str, out_dir: str):
    """Write the transcript of one input file as ``<stem>.txt`` and ``<stem>.srt``.

    Args:
        results: One entry per chunk. ``None`` marks a failed chunk and is
            skipped; every other entry is read through its ``"start"``,
            ``"text"`` and ``"segments"`` keys. Each segment is read through
            ``"start"``, ``"end"`` and ``"text"``; segment times are added to
            the chunk's ``"start"`` to obtain positions in the whole file.
        video_path: Path of the input file; only its stem and name are used.
        out_dir: Output directory, created if it does not exist.

    Nothing is written (apart from creating ``out_dir``) when every chunk
    failed; a warning is printed instead.
    """
    stem = Path(video_path).stem
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    done = [(r["start"], r["text"], r["segments"]) for r in results if r is not None]
    if not done:
        print(f" Warning: all chunks failed for {Path(video_path).name}, no output written")
        return

    # Chunks may arrive in any order; restore timeline order by chunk start.
    done.sort(key=lambda item: item[0])

    # TXT: one line per chunk.
    txt_path = out / f"{stem}.txt"
    txt_path.write_text("\n".join(text for _, text, _ in done), encoding="utf-8")
    print(f" TXT -> {txt_path}")

    # SRT: number only the cues that are actually emitted, so skipping an
    # empty segment does not leave a gap in the cue sequence.
    srt_lines = []
    cue_no = 0
    for chunk_start, _, segments in done:
        for seg in segments:
            text = seg.get("text", "").strip()
            if not text:
                continue
            cue_no += 1
            begin = chunk_start + seg.get("start", 0)
            finish = chunk_start + seg.get("end", 0)
            srt_lines.append(f"{cue_no}")
            srt_lines.append(f"{_fmt_srt_time(begin)} --> {_fmt_srt_time(finish)}")
            srt_lines.append(text)
            srt_lines.append("")

    srt_path = out / f"{stem}.srt"
    srt_path.write_text("\n".join(srt_lines), encoding="utf-8")
    print(f" SRT -> {srt_path}")
70
+
71
+
72
+ def load_keys(key_file: str | None) -> list[str]:
73
+ # 1. --key-file 参数优先
74
+ if key_file is not None:
75
+ if not Path(key_file).exists():
76
+ print(f"Error: key file not found: {key_file}")
77
+ sys.exit(1)
78
+ return _read_key_file(key_file)
79
+
80
+ # 2. 当前目录 key.txt
81
+ local_keys = Path("key.txt")
82
+ if local_keys.exists():
83
+ return _read_key_file(str(local_keys))
84
+
85
+ # 3. 环境变量 GROQ_API_KEY(逗号分隔)
86
+ env_keys = os.environ.get("GROQ_API_KEY", "").strip()
87
+ if env_keys:
88
+ keys = [k.strip() for k in env_keys.split(",") if k.strip()]
89
+ if keys:
90
+ return keys
91
+
92
+ return []
93
+
94
+
95
def _read_key_file(path: str) -> list[str]:
    """Read API keys from a text file, one key per line.

    Surrounding whitespace is stripped; blank lines and lines starting with
    ``#`` are ignored. The file is decoded as ``utf-8-sig`` so that a UTF-8
    byte-order mark (written by some Windows editors) is dropped instead of
    becoming part of the first key.
    """
    text = Path(path).read_text(encoding="utf-8-sig")
    candidates = (line.strip() for line in text.splitlines())
    return [key for key in candidates if key and not key.startswith("#")]
99
+
100
+
101
def progress_callback(completed: int, total: int, elapsed: float, eta: float, key_status: list[str]):
    """Progress hook that deliberately does nothing.

    ``main`` registers it through ``GroqTranscriber.on_progress``; every
    argument is accepted and ignored.
    """
    return None
103
+
104
+
105
def _mask_key(key: str) -> str:
    """Return a log-safe rendering of an API key.

    Only the first and last four characters are shown. Keys too short to be
    masked meaningfully are replaced entirely by asterisks.
    """
    if len(key) <= 12:
        return "*" * len(key)
    return f"{key[:4]}...{key[-4:]}"


def main():
    """Command-line entry point.

    Parses the arguments, resolves the input files, loads the API keys, then
    for each file: splits it into chunks, transcribes the chunks and writes
    the TXT/SRT outputs. A final summary with the key manager's status lines
    is printed.

    Exits with status 1 when no usable input file or API key is found, when
    the user interrupts a transcription, or (after processing the remaining
    files) when at least one file could not be split.
    """
    parser = argparse.ArgumentParser(
        prog="groq-whisper-cli",
        description="Transcribe long audio/video via Groq Whisper with multi-key concurrency",
    )
    parser.add_argument(
        "input", nargs="+",
        help="input MP4/audio file path(s), supports wildcards",
    )
    parser.add_argument(
        "--key-file", "-k",
        default=None,
        help="path to key file (one key per line); falls back to ./key.txt, then the GROQ_API_KEY env var",
    )
    parser.add_argument(
        "--out", "-o",
        default="./transcript",
        help="output directory (default: ./transcript)",
    )
    parser.add_argument(
        "--workers", "-w",
        type=int, default=4,
        help="number of concurrent workers (default: 4)",
    )
    parser.add_argument(
        "--chunk-sec",
        type=float, default=120.0,
        help="max VAD chunk duration in seconds (default: 120)",
    )
    parser.add_argument(
        "--lang", "-l",
        default="zh",
        help="audio language ISO-639-1 code (default: zh)",
    )
    parser.add_argument(
        "--retries",
        type=int, default=8,
        help="max retries per chunk (default: 8)",
    )

    args = parser.parse_args()
    # A non-positive chunk length cannot be used as a slicing step.
    if args.chunk_sec <= 0:
        parser.error("--chunk-sec must be greater than 0")

    # Resolve input files. Wildcards are expanded here so quoted patterns
    # work; results are sorted so the processing order is deterministic.
    media_suffixes = (".mp4", ".m4a", ".mp3", ".wav", ".flac", ".ogg")
    input_files = []
    for pat in args.input:
        p = Path(pat)
        if any(c in pat for c in "*?"):
            if p.is_absolute():
                # Path.glob() rejects absolute patterns, so glob the file
                # name inside its parent directory instead.
                matched = sorted(p.parent.glob(p.name))
            else:
                matched = sorted(Path().glob(pat))
        else:
            matched = [p]
        for m in matched:
            if m.suffix.lower() not in media_suffixes:
                continue
            if not m.is_file():
                print(f"Warning: input file not found, skipping: {m}")
                continue
            input_files.append(m)

    if not input_files:
        print("Error: no valid audio/video files found")
        sys.exit(1)

    # Read keys.
    keys = load_keys(args.key_file)
    if not keys:
        print("Error: no API keys found. Use --key-file, set GROQ_API_KEY env var, or place key.txt in current directory")
        sys.exit(1)
    for i, k in enumerate(keys, 1):
        # Never echo enough of a secret to reconstruct it from the logs.
        print(f" key{i}: {_mask_key(k)}")
    print(f"Loaded {len(keys)} Groq API keys")

    key_manager = GroqKeyManager(keys)
    splitter = AudioSplitter(max_chunk_sec=args.chunk_sec)

    total_chunks = 0
    split_failures = 0

    for video_idx, video_path in enumerate(input_files, 1):
        print(f"\n[{video_idx}/{len(input_files)}] Processing: {video_path.name}")
        print(f" Path: {video_path.resolve()}")

        t0 = time.monotonic()
        try:
            chunks = splitter.split(str(video_path))
        except (FileNotFoundError, RuntimeError) as exc:
            # The splitter raises these for a missing file and for
            # ffmpeg/ffprobe problems; report and move on to the next file.
            print(f" Error: {exc}")
            split_failures += 1
            continue
        dt = time.monotonic() - t0
        print(f" Split: {len(chunks)} chunks in {dt:.1f}s")
        total_chunks += len(chunks)

        if not chunks:
            print(" Warning: no valid audio chunks, skipping")
            continue

        total_audio_sec = sum(c["end"] - c["start"] for c in chunks)
        print(f" Total audio: {total_audio_sec:.0f}s ({total_audio_sec/60:.1f}min)")

        transcriber = GroqTranscriber(
            key_manager=key_manager,
            language=args.lang,
            workers=args.workers,
            max_retries=args.retries,
        )
        transcriber.on_progress(progress_callback)

        try:
            results = asyncio.run(transcriber.transcribe_all(chunks))
        except KeyboardInterrupt:
            print("\nInterrupted by user")
            sys.exit(1)
        print()

        success = sum(1 for r in results if r is not None)
        failed = len(results) - success
        print(f" Result: {success}/{len(results)} ok", end="")
        if failed:
            print(f", {failed} failed", end="")
        print()

        write_outputs(results, str(video_path), args.out)

    # Final summary.
    print(f"\n{'='*40}")
    print(f"All done: {len(input_files)} files, {total_chunks} chunks")
    for line in key_manager.status_lines():
        print(line)
    print(f"{'='*40}")

    if split_failures:
        sys.exit(1)
228
+ print(f"{'='*40}")
229
+
230
+
231
+ if __name__ == "__main__":
232
+ main()
@@ -0,0 +1,172 @@
1
+ import subprocess
2
+ import time
3
+ import numpy as np
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
SAMPLE_RATE = 16000


class AudioSplitter:
    """Extract audio with ffmpeg and cut it into chunks with silero-vad.

    Chunks are dictionaries ``{"start": float, "end": float, "audio":
    np.ndarray}`` where ``start``/``end`` are positions in seconds and
    ``audio`` holds the corresponding samples at ``SAMPLE_RATE``.
    """

    def __init__(self, max_chunk_sec: float = 120.0, min_speech_ms: int = 1000, min_silence_ms: int = 500):
        """Store the splitting parameters; the VAD model is loaded lazily.

        Args:
            max_chunk_sec: Maximum chunk duration in seconds.
            min_speech_ms: Passed to the VAD as ``min_speech_duration_ms``.
            min_silence_ms: Passed to the VAD as ``min_silence_duration_ms``.

        Raises:
            ValueError: If ``max_chunk_sec`` covers less than one sample. The
                value is used as a ``range`` step, so zero would raise later
                and a negative value would silently drop audio.
        """
        if int(max_chunk_sec * SAMPLE_RATE) < 1:
            raise ValueError(f"max_chunk_sec must be greater than 0, got {max_chunk_sec!r}")
        self.max_chunk_sec = max_chunk_sec
        self.min_speech_ms = min_speech_ms
        self.min_silence_ms = min_silence_ms
        self._vad_model = None
        self._get_speech_timestamps = None

    def _load_vad(self):
        """Import silero-vad and load its ONNX model on first use."""
        if self._vad_model is None:
            from silero_vad import load_silero_vad, get_speech_timestamps
            self._vad_model = load_silero_vad(onnx=True)
            self._get_speech_timestamps = get_speech_timestamps

    @staticmethod
    def _stderr_text(stderr, limit: int = 500) -> str:
        """Return captured stderr as text, truncated to ``limit`` characters.

        ``subprocess`` hands back ``bytes`` or ``str`` depending on whether
        ``text=True`` was used, and ``None`` when nothing was captured.
        """
        if stderr is None:
            return ""
        if isinstance(stderr, bytes):
            stderr = stderr.decode(errors="replace")
        return stderr[:limit]

    def _ffmpeg_extract_pcm(self, mp4_path: str) -> np.ndarray:
        """Decode ``mp4_path`` with ffmpeg into a 16 kHz mono float32 array.

        Raises:
            RuntimeError: If ffmpeg is not installed or exits with an error.
        """
        cmd = [
            "ffmpeg", "-i", mp4_path,
            "-f", "f32le",
            "-acodec", "pcm_f32le",
            "-ar", str(SAMPLE_RATE),
            "-ac", "1",
            "-loglevel", "error",
            "-"
        ]
        try:
            result = subprocess.run(cmd, capture_output=True, check=True)
        except FileNotFoundError as e:
            raise RuntimeError(
                "ffmpeg not found. Please install ffmpeg: https://ffmpeg.org/download.html"
            ) from e
        except subprocess.CalledProcessError as e:
            raise RuntimeError(
                f"ffmpeg failed (exit {e.returncode}): {self._stderr_text(e.stderr)}"
            ) from e
        return np.frombuffer(result.stdout, dtype=np.float32)

    def _get_duration_sec(self, mp4_path: str) -> float:
        """Return the container duration in seconds as reported by ffprobe.

        Raises:
            RuntimeError: If ffprobe is not installed, exits with an error, or
                reports no usable duration.
        """
        import json
        cmd = [
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "json",
            mp4_path
        ]
        try:
            result = subprocess.run(cmd, capture_output=True, check=True, text=True)
        except FileNotFoundError as e:
            raise RuntimeError(
                "ffprobe not found. Please install ffmpeg: https://ffmpeg.org/download.html"
            ) from e
        except subprocess.CalledProcessError as e:
            # With text=True stderr is already a str; _stderr_text copes with
            # both forms so the real ffprobe message is reported.
            raise RuntimeError(
                f"ffprobe failed (exit {e.returncode}): {self._stderr_text(e.stderr)}"
            ) from e
        try:
            return float(json.loads(result.stdout)["format"]["duration"])
        except (KeyError, TypeError, ValueError) as e:
            raise RuntimeError(f"ffprobe reported no usable duration for {mp4_path}") from e

    def split(self, mp4_path: str) -> list[dict]:
        """Extract the audio of ``mp4_path`` and split it on speech activity.

        Progress information is printed along the way. When the VAD finds no
        speech at all, the audio is cut into uniform chunks instead.

        Returns:
            ``[{"start": float, "end": float, "audio": np.ndarray}, ...]``

        Raises:
            FileNotFoundError: If ``mp4_path`` does not exist.
            RuntimeError: If ffmpeg/ffprobe is missing or fails.
        """
        path = Path(mp4_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {mp4_path}")

        self._load_vad()

        t0 = time.monotonic()

        total_sec = self._get_duration_sec(mp4_path)
        file_size_mb = path.stat().st_size / 1024 / 1024
        print(f" File size: {file_size_mb:.0f}MB, duration: {total_sec:.0f}s ({total_sec/60:.1f}min)")

        print(" ffmpeg extract PCM (16kHz mono)...", end=" ", flush=True)
        t1 = time.monotonic()
        audio = self._ffmpeg_extract_pcm(mp4_path)
        print(f"{len(audio)/SAMPLE_RATE:.0f}s audio in {time.monotonic()-t1:.1f}s")

        print(" silero-vad detecting speech...", end=" ", flush=True)
        t1 = time.monotonic()
        speech_ts = self._get_speech_timestamps(
            audio,
            self._vad_model,
            sampling_rate=SAMPLE_RATE,
            min_speech_duration_ms=self.min_speech_ms,
            min_silence_duration_ms=self.min_silence_ms,
        )
        print(f"{len(speech_ts)} segments in {time.monotonic()-t1:.1f}s")

        if not speech_ts:
            print(" Warning: VAD detected no speech, falling back to uniform split")
            return self._fallback_split(audio)

        print(f" Merging and splitting at {self.max_chunk_sec}s max...", end=" ", flush=True)
        chunks = self._merge_and_chunk(audio, speech_ts)
        total_audio = sum(c["end"] - c["start"] for c in chunks)
        print(f"{len(chunks)} chunks, {total_audio:.0f}s audio, done in {time.monotonic()-t0:.1f}s")

        # Preview the first few chunks.
        preview = min(3, len(chunks))
        for i, chunk in enumerate(chunks[:preview]):
            dur = chunk["end"] - chunk["start"]
            print(f" chunk[{i}]: {chunk['start']:.1f}s - {chunk['end']:.1f}s ({dur:.1f}s)")
        if len(chunks) > preview:
            print(f" ... {len(chunks)} total")

        return chunks

    def _merge_and_chunk(self, audio: np.ndarray, speech_ts: list) -> list[dict]:
        """Merge speech segments separated by less than 1 s, then cap their length.

        ``speech_ts`` entries are read through ``"start"``/``"end"``, which
        are used as sample indices into ``audio``. Merged segments longer
        than ``max_chunk_sec`` are cut into consecutive pieces of at most that
        length.
        """
        merged = []
        for seg in speech_ts:
            seg_start, seg_end = seg["start"], seg["end"]
            if merged and (seg_start - merged[-1]["end"]) / SAMPLE_RATE < 1.0:
                # Short pause: extend the previous segment across the gap.
                merged[-1]["end"] = seg_end
            else:
                merged.append({"start": seg_start, "end": seg_end})

        chunk_samples = int(self.max_chunk_sec * SAMPLE_RATE)
        chunks = []
        for seg in merged:
            start, end = seg["start"], seg["end"]
            if (end - start) / SAMPLE_RATE <= self.max_chunk_sec:
                pieces = [(start, end)]
            else:
                pieces = [(s, min(s + chunk_samples, end)) for s in range(start, end, chunk_samples)]
            for s, e in pieces:
                chunks.append({
                    "start": s / SAMPLE_RATE,
                    "end": e / SAMPLE_RATE,
                    "audio": audio[s:e].copy(),
                })

        return chunks

    def _fallback_split(self, audio: np.ndarray) -> list[dict]:
        """Cut ``audio`` into consecutive chunks of at most ``max_chunk_sec``.

        Used when the VAD returns no speech segments.
        """
        total_samples = len(audio)
        chunk_samples = int(self.max_chunk_sec * SAMPLE_RATE)
        chunks = []
        for s in range(0, total_samples, chunk_samples):
            e = min(s + chunk_samples, total_samples)
            chunks.append({
                "start": s / SAMPLE_RATE,
                "end": e / SAMPLE_RATE,
                "audio": audio[s:e].copy(),
            })
        return chunks