groq-whisper-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groq_whisper_cli-0.1.0/.gitignore +30 -0
- groq_whisper_cli-0.1.0/PKG-INFO +115 -0
- groq_whisper_cli-0.1.0/README.md +98 -0
- groq_whisper_cli-0.1.0/groq_whisper/__init__.py +6 -0
- groq_whisper_cli-0.1.0/groq_whisper/__main__.py +232 -0
- groq_whisper_cli-0.1.0/groq_whisper/audio_splitter.py +172 -0
- groq_whisper_cli-0.1.0/groq_whisper/key_manager.py +198 -0
- groq_whisper_cli-0.1.0/groq_whisper/transcriber.py +185 -0
- groq_whisper_cli-0.1.0/groq_whisper/utils.py +26 -0
- groq_whisper_cli-0.1.0/pyproject.toml +30 -0
- groq_whisper_cli-0.1.0/uv.lock +676 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv
|
|
11
|
+
tmp.*
|
|
12
|
+
tmp/
|
|
13
|
+
presentations/
|
|
14
|
+
|
|
15
|
+
# Playwright
|
|
16
|
+
node_modules/
|
|
17
|
+
/test-results/
|
|
18
|
+
/playwright-report/
|
|
19
|
+
/blob-report/
|
|
20
|
+
/playwright/.cache/
|
|
21
|
+
/playwright/.auth/
|
|
22
|
+
work_product/
|
|
23
|
+
.env
|
|
24
|
+
TODO
|
|
25
|
+
archive/
|
|
26
|
+
docs/
|
|
27
|
+
key/
|
|
28
|
+
backup_archives.ps1
|
|
29
|
+
.codex/
|
|
30
|
+
.opencode/
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: groq-whisper-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Groq Whisper 多 Key 并发语音转写 CLI 工具
|
|
5
|
+
Author: Your Name
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Classifier: Development Status :: 4 - Beta
|
|
8
|
+
Classifier: Environment :: Console
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Requires-Dist: httpx>=0.27
|
|
13
|
+
Requires-Dist: numpy>=2.0
|
|
14
|
+
Requires-Dist: onnxruntime>=1.19
|
|
15
|
+
Requires-Dist: silero-vad>=5.1
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# groq-whisper-cli
|
|
19
|
+
|
|
20
|
+
Groq Whisper 多 Key 并发语音转写 CLI 工具。
|
|
21
|
+
|
|
22
|
+
## 功能
|
|
23
|
+
|
|
24
|
+
- silero-vad 本地 VAD 智能切片
|
|
25
|
+
- 多 API Key 配额感知负载均衡
|
|
26
|
+
- 429 自动退避 + Key 切换
|
|
27
|
+
- 异步并发协程池
|
|
28
|
+
- 输出 SRT 字幕 + TXT 文本
|
|
29
|
+
|
|
30
|
+
## 安装
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# 直接通过 uvx 运行(无需安装)
|
|
34
|
+
uvx groq-whisper-cli audio.mp3
|
|
35
|
+
|
|
36
|
+
# 或安装到本地
|
|
37
|
+
uv tool install groq-whisper-cli
|
|
38
|
+
groq-whisper-cli audio.mp3
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## 前置要求
|
|
42
|
+
|
|
43
|
+
- Python 3.12+
|
|
44
|
+
- ffmpeg(用于音频提取)
|
|
45
|
+
|
|
46
|
+
## API Key 配置
|
|
47
|
+
|
|
48
|
+
按优先级查找:
|
|
49
|
+
|
|
50
|
+
1. `--key-file` / `-k` 参数指定文件(每行一个 key)
|
|
51
|
+
2. 当前目录下的 `key.txt`
|
|
52
|
+
3. 环境变量 `GROQ_API_KEY`(多个 key 用逗号分隔)
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# 方式一:环境变量
|
|
56
|
+
export GROQ_API_KEY="gsk_xxx,gsk_yyy"
|
|
57
|
+
|
|
58
|
+
# 方式二:key 文件
|
|
59
|
+
echo "gsk_xxx" > key.txt
|
|
60
|
+
echo "gsk_yyy" >> key.txt
|
|
61
|
+
|
|
62
|
+
# 方式三:命令行参数
|
|
63
|
+
groq-whisper-cli audio.mp3 -k /path/to/key.txt
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## 用法
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# 基本用法
|
|
70
|
+
groq-whisper-cli audio.mp3
|
|
71
|
+
|
|
72
|
+
# 指定语言和输出目录
|
|
73
|
+
groq-whisper-cli video.mp4 -l en -o ./output
|
|
74
|
+
|
|
75
|
+
# 多文件处理
|
|
76
|
+
groq-whisper-cli *.mp4 -w 8
|
|
77
|
+
|
|
78
|
+
# 完整参数
|
|
79
|
+
groq-whisper-cli audio.mp3 \
|
|
80
|
+
-k key.txt \
|
|
81
|
+
-o ./transcript \
|
|
82
|
+
-w 4 \
|
|
83
|
+
--chunk-sec 120 \
|
|
84
|
+
-l zh \
|
|
85
|
+
--retries 8
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## 参数
|
|
89
|
+
|
|
90
|
+
| 参数 | 缩写 | 默认值 | 说明 |
|
|
91
|
+
|------|------|--------|------|
|
|
92
|
+
| `input` | - | 必填 | 输入文件路径,支持通配符 |
|
|
93
|
+
| `--key-file` | `-k` | - | API Key 文件路径 |
|
|
94
|
+
| `--out` | `-o` | `./transcript` | 输出目录 |
|
|
95
|
+
| `--workers` | `-w` | `4` | 并发 worker 数 |
|
|
96
|
+
| `--chunk-sec` | - | `120` | VAD 切片最大时长(秒) |
|
|
97
|
+
| `--lang` | `-l` | `zh` | ISO-639-1 语言代码 |
|
|
98
|
+
| `--retries` | - | `8` | 每个分片最大重试次数 |
|
|
99
|
+
|
|
100
|
+
## 作为库使用
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
import asyncio
from groq_whisper import GroqKeyManager, AudioSplitter, GroqTranscriber
|
|
104
|
+
|
|
105
|
+
key_manager = GroqKeyManager(["gsk_xxx", "gsk_yyy"])
|
|
106
|
+
splitter = AudioSplitter(max_chunk_sec=120)
|
|
107
|
+
chunks = splitter.split("audio.mp3")
|
|
108
|
+
|
|
109
|
+
transcriber = GroqTranscriber(key_manager=key_manager, language="zh")
|
|
110
|
+
results = asyncio.run(transcriber.transcribe_all(chunks))
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## License
|
|
114
|
+
|
|
115
|
+
MIT
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# groq-whisper-cli
|
|
2
|
+
|
|
3
|
+
Groq Whisper 多 Key 并发语音转写 CLI 工具。
|
|
4
|
+
|
|
5
|
+
## 功能
|
|
6
|
+
|
|
7
|
+
- silero-vad 本地 VAD 智能切片
|
|
8
|
+
- 多 API Key 配额感知负载均衡
|
|
9
|
+
- 429 自动退避 + Key 切换
|
|
10
|
+
- 异步并发协程池
|
|
11
|
+
- 输出 SRT 字幕 + TXT 文本
|
|
12
|
+
|
|
13
|
+
## 安装
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# 直接通过 uvx 运行(无需安装)
|
|
17
|
+
uvx groq-whisper-cli audio.mp3
|
|
18
|
+
|
|
19
|
+
# 或安装到本地
|
|
20
|
+
uv tool install groq-whisper-cli
|
|
21
|
+
groq-whisper-cli audio.mp3
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## 前置要求
|
|
25
|
+
|
|
26
|
+
- Python 3.12+
|
|
27
|
+
- ffmpeg(用于音频提取)
|
|
28
|
+
|
|
29
|
+
## API Key 配置
|
|
30
|
+
|
|
31
|
+
按优先级查找:
|
|
32
|
+
|
|
33
|
+
1. `--key-file` / `-k` 参数指定文件(每行一个 key)
|
|
34
|
+
2. 当前目录下的 `key.txt`
|
|
35
|
+
3. 环境变量 `GROQ_API_KEY`(多个 key 用逗号分隔)
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
# 方式一:环境变量
|
|
39
|
+
export GROQ_API_KEY="gsk_xxx,gsk_yyy"
|
|
40
|
+
|
|
41
|
+
# 方式二:key 文件
|
|
42
|
+
echo "gsk_xxx" > key.txt
|
|
43
|
+
echo "gsk_yyy" >> key.txt
|
|
44
|
+
|
|
45
|
+
# 方式三:命令行参数
|
|
46
|
+
groq-whisper-cli audio.mp3 -k /path/to/key.txt
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## 用法
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# 基本用法
|
|
53
|
+
groq-whisper-cli audio.mp3
|
|
54
|
+
|
|
55
|
+
# 指定语言和输出目录
|
|
56
|
+
groq-whisper-cli video.mp4 -l en -o ./output
|
|
57
|
+
|
|
58
|
+
# 多文件处理
|
|
59
|
+
groq-whisper-cli *.mp4 -w 8
|
|
60
|
+
|
|
61
|
+
# 完整参数
|
|
62
|
+
groq-whisper-cli audio.mp3 \
|
|
63
|
+
-k key.txt \
|
|
64
|
+
-o ./transcript \
|
|
65
|
+
-w 4 \
|
|
66
|
+
--chunk-sec 120 \
|
|
67
|
+
-l zh \
|
|
68
|
+
--retries 8
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## 参数
|
|
72
|
+
|
|
73
|
+
| 参数 | 缩写 | 默认值 | 说明 |
|
|
74
|
+
|------|------|--------|------|
|
|
75
|
+
| `input` | - | 必填 | 输入文件路径,支持通配符 |
|
|
76
|
+
| `--key-file` | `-k` | - | API Key 文件路径 |
|
|
77
|
+
| `--out` | `-o` | `./transcript` | 输出目录 |
|
|
78
|
+
| `--workers` | `-w` | `4` | 并发 worker 数 |
|
|
79
|
+
| `--chunk-sec` | - | `120` | VAD 切片最大时长(秒) |
|
|
80
|
+
| `--lang` | `-l` | `zh` | ISO-639-1 语言代码 |
|
|
81
|
+
| `--retries` | - | `8` | 每个分片最大重试次数 |
|
|
82
|
+
|
|
83
|
+
## 作为库使用
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
import asyncio
from groq_whisper import GroqKeyManager, AudioSplitter, GroqTranscriber
|
|
87
|
+
|
|
88
|
+
key_manager = GroqKeyManager(["gsk_xxx", "gsk_yyy"])
|
|
89
|
+
splitter = AudioSplitter(max_chunk_sec=120)
|
|
90
|
+
chunks = splitter.split("audio.mp3")
|
|
91
|
+
|
|
92
|
+
transcriber = GroqTranscriber(key_manager=key_manager, language="zh")
|
|
93
|
+
results = asyncio.run(transcriber.transcribe_all(chunks))
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## License
|
|
97
|
+
|
|
98
|
+
MIT
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Groq Whisper 多 Key 并发语音转写 CLI 工具
|
|
3
|
+
|
|
4
|
+
支持 silero-vad 本地 VAD 切片、多 key 配额感知负载均衡、
|
|
5
|
+
429 自动退避 + key 切换、并发协程池,输出 SRT + TXT。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import asyncio
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from . import GroqKeyManager, AudioSplitter, GroqTranscriber
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _fmt_srt_time(sec: float) -> str:
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds *before* it is split into
    hour/minute/second fields. Formatting the seconds field on its own
    (e.g. with ``:06.3f``) lets a value such as 59.9996 round up to
    ``60.000`` without carrying into the minutes, which yields an invalid
    timestamp. Negative inputs are clamped to zero.

    Args:
        sec: Offset from the start of the media, in seconds.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"``.
    """
    total_ms = max(0, int(round(float(sec) * 1000)))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def write_outputs(results: list[dict], video_path: str, out_dir: str):
    """Write one input file's transcript as ``<stem>.txt`` and ``<stem>.srt``.

    Args:
        results: Per-chunk transcription results. ``None`` entries (failed
            chunks) are skipped. Every other entry is read for its
            ``"start"`` (chunk offset in seconds), ``"text"`` and
            ``"segments"`` keys; each segment is read for ``"start"``,
            ``"end"`` and ``"text"``.
        video_path: Path of the source media; only its stem is used to name
            the output files.
        out_dir: Directory to write into; created if it does not exist.

    If every chunk failed, a warning is printed and nothing is written.
    """
    stem = Path(video_path).stem
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Order chunks by their offset in the source so both outputs read
    # chronologically regardless of completion order.
    ok_chunks = sorted(
        (r for r in results if r is not None),
        key=lambda r: r["start"],
    )

    if not ok_chunks:
        print(f" Warning: all chunks failed for {Path(video_path).name}, no output written")
        return

    # TXT
    txt_path = out / f"{stem}.txt"
    txt_path.write_text("\n".join(r["text"] for r in ok_chunks), encoding="utf-8")
    print(f" TXT -> {txt_path}")

    # SRT
    srt_lines = []
    cue_no = 0
    for chunk in ok_chunks:
        chunk_start = chunk["start"]
        for seg in chunk["segments"]:
            text = seg.get("text", "").strip()
            if not text:
                continue
            # Number only the cues that are actually emitted so the SRT
            # sequence stays contiguous (1, 2, 3, ...) when blank segments
            # are dropped.
            cue_no += 1
            # Segment times are relative to the chunk; shift to file time.
            s = chunk_start + seg.get("start", 0)
            e = chunk_start + seg.get("end", 0)
            srt_lines.append(f"{cue_no}")
            srt_lines.append(f"{_fmt_srt_time(s)} --> {_fmt_srt_time(e)}")
            srt_lines.append(text)
            srt_lines.append("")

    srt_path = out / f"{stem}.srt"
    srt_path.write_text("\n".join(srt_lines), encoding="utf-8")
    print(f" SRT -> {srt_path}")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def load_keys(key_file: str | None) -> list[str]:
|
|
73
|
+
# 1. --key-file 参数优先
|
|
74
|
+
if key_file is not None:
|
|
75
|
+
if not Path(key_file).exists():
|
|
76
|
+
print(f"Error: key file not found: {key_file}")
|
|
77
|
+
sys.exit(1)
|
|
78
|
+
return _read_key_file(key_file)
|
|
79
|
+
|
|
80
|
+
# 2. 当前目录 key.txt
|
|
81
|
+
local_keys = Path("key.txt")
|
|
82
|
+
if local_keys.exists():
|
|
83
|
+
return _read_key_file(str(local_keys))
|
|
84
|
+
|
|
85
|
+
# 3. 环境变量 GROQ_API_KEY(逗号分隔)
|
|
86
|
+
env_keys = os.environ.get("GROQ_API_KEY", "").strip()
|
|
87
|
+
if env_keys:
|
|
88
|
+
keys = [k.strip() for k in env_keys.split(",") if k.strip()]
|
|
89
|
+
if keys:
|
|
90
|
+
return keys
|
|
91
|
+
|
|
92
|
+
return []
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _read_key_file(path: str) -> list[str]:
    """Read API keys from a UTF-8 text file, one key per line.

    Surrounding whitespace is stripped from every line; blank lines and
    lines whose first non-blank character is ``#`` are ignored.
    """
    keys = []
    for raw_line in Path(path).read_text(encoding="utf-8").strip().splitlines():
        candidate = raw_line.strip()
        if not candidate or candidate.startswith("#"):
            continue
        keys.append(candidate)
    return keys
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def progress_callback(completed: int, total: int, elapsed: float, eta: float, key_status: list[str]):
    """No-op progress hook; ``main`` hands it to ``transcriber.on_progress``.

    All arguments are accepted and ignored.
    """
    return None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _mask_key(key: str) -> str:
    """Return a log-safe rendering of an API key.

    At most the first four and last four characters are shown; keys too
    short for that to hide anything meaningful are fully starred out.
    """
    if len(key) <= 12:
        return "*" * len(key)
    return f"{key[:4]}...{key[-4:]}"


def main():
    """Command-line entry point.

    Parses the arguments, expands the input patterns into a list of
    supported media files, loads the API keys, then for every file: splits
    it into chunks, transcribes the chunks and writes the TXT/SRT outputs.
    Exits with status 1 when no input file or no API key is found, or when
    the user interrupts a transcription.
    """
    parser = argparse.ArgumentParser(
        prog="groq-whisper-cli",
        description="Transcribe long audio/video via Groq Whisper with multi-key concurrency",
    )
    parser.add_argument(
        "input", nargs="+",
        help="input MP4/audio file path(s), supports wildcards",
    )
    parser.add_argument(
        "--key-file", "-k",
        default=None,
        help="path to key file (one key per line); falls back to GROQ_API_KEY env var",
    )
    parser.add_argument(
        "--out", "-o",
        default="./transcript",
        help="output directory (default: ./transcript)",
    )
    parser.add_argument(
        "--workers", "-w",
        type=int, default=4,
        help="number of concurrent workers (default: 4)",
    )
    parser.add_argument(
        "--chunk-sec",
        type=float, default=120.0,
        help="max VAD chunk duration in seconds (default: 120)",
    )
    parser.add_argument(
        "--lang", "-l",
        default="zh",
        help="audio language ISO-639-1 code (default: zh)",
    )
    parser.add_argument(
        "--retries",
        type=int, default=8,
        help="max retries per chunk (default: 8)",
    )

    args = parser.parse_args()

    # Resolve input files. Patterns containing * or ? are expanded here so
    # wildcards also work when the shell did not expand them.
    supported_suffixes = (".mp4", ".m4a", ".mp3", ".wav", ".flac", ".ogg")
    input_files = []
    for pat in args.input:
        p = Path(pat)
        if any(c in pat for c in "*?"):
            if p.is_absolute():
                # Path.glob() rejects absolute patterns, so glob the file
                # name part inside the pattern's parent directory.
                matched = list(p.parent.glob(p.name))
            else:
                matched = list(Path().glob(pat))
        else:
            matched = [p]
        input_files.extend(m for m in matched if m.suffix.lower() in supported_suffixes)

    if not input_files:
        print("Error: no valid audio/video files found")
        sys.exit(1)

    # read keys
    keys = load_keys(args.key_file)
    if not keys:
        print("Error: no API keys found. Use --key-file, set GROQ_API_KEY env var, or place key.txt in current directory")
        sys.exit(1)
    for i, k in enumerate(keys, 1):
        # Never echo usable key material: only a short masked form is shown.
        print(f" key{i}: {_mask_key(k)}")
    print(f"Loaded {len(keys)} Groq API keys")

    key_manager = GroqKeyManager(keys)
    splitter = AudioSplitter(max_chunk_sec=args.chunk_sec)

    total_chunks = 0

    for video_idx, video_path in enumerate(input_files, 1):
        print(f"\n[{video_idx}/{len(input_files)}] Processing: {video_path.name}")
        print(f" Path: {video_path.resolve()}")

        t0 = time.monotonic()
        chunks = splitter.split(str(video_path))
        dt = time.monotonic() - t0
        print(f" Split: {len(chunks)} chunks in {dt:.1f}s")
        total_chunks += len(chunks)

        if not chunks:
            print(" Warning: no valid audio chunks, skipping")
            continue

        total_audio_sec = sum(c["end"] - c["start"] for c in chunks)
        print(f" Total audio: {total_audio_sec:.0f}s ({total_audio_sec/60:.1f}min)")

        # A fresh transcriber per file; the key manager is shared across
        # files.
        transcriber = GroqTranscriber(
            key_manager=key_manager,
            language=args.lang,
            workers=args.workers,
            max_retries=args.retries,
        )
        transcriber.on_progress(progress_callback)

        try:
            results = asyncio.run(transcriber.transcribe_all(chunks))
        except KeyboardInterrupt:
            print("\nInterrupted by user")
            sys.exit(1)
        print()

        success = sum(1 for r in results if r is not None)
        failed = sum(1 for r in results if r is None)
        print(f" Result: {success}/{len(results)} ok", end="")
        if failed:
            print(f", {failed} failed", end="")
        print()

        write_outputs(results, str(video_path), args.out)

    # final summary
    print(f"\n{'='*40}")
    print(f"All done: {len(input_files)} files, {total_chunks} chunks")
    for line in key_manager.status_lines():
        print(line)
    print(f"{'='*40}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import time
|
|
3
|
+
import numpy as np
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
SAMPLE_RATE = 16000  # Hz; ffmpeg resamples all audio to this rate


class AudioSplitter:
    """Extract audio with ffmpeg and cut it into chunks using silero-vad.

    Args:
        max_chunk_sec: Upper bound on the duration of a returned chunk, in
            seconds. Must be positive.
        min_speech_ms: Passed to silero-vad as ``min_speech_duration_ms``.
        min_silence_ms: Passed to silero-vad as ``min_silence_duration_ms``.

    Raises:
        ValueError: If ``max_chunk_sec`` is not positive.
    """

    def __init__(self, max_chunk_sec: float = 120.0, min_speech_ms: int = 1000, min_silence_ms: int = 500):
        # A non-positive limit would give a zero/negative slicing step and
        # either crash inside range() or silently drop audio later on.
        if max_chunk_sec <= 0:
            raise ValueError(f"max_chunk_sec must be positive, got {max_chunk_sec!r}")
        self.max_chunk_sec = max_chunk_sec
        self.min_speech_ms = min_speech_ms
        self.min_silence_ms = min_silence_ms
        self._vad_model = None

    def _load_vad(self):
        """Load the silero-vad ONNX model on first use and cache it."""
        if self._vad_model is None:
            # Imported lazily so that importing this module does not pull
            # in silero-vad until a split is actually requested.
            from silero_vad import load_silero_vad, get_speech_timestamps
            self._vad_model = load_silero_vad(onnx=True)
            self._get_speech_timestamps = get_speech_timestamps

    @staticmethod
    def _stderr_text(err: subprocess.CalledProcessError) -> str:
        """Return up to 500 characters of a failed process's stderr.

        ``stderr`` is ``bytes`` when the process ran in binary mode and
        ``str`` when it ran with ``text=True``; both are handled.
        """
        raw = err.stderr or ""
        if isinstance(raw, bytes):
            raw = raw.decode(errors="replace")
        return raw[:500]

    def _ffmpeg_extract_pcm(self, mp4_path: str) -> np.ndarray:
        """Decode a media file with ffmpeg into 16 kHz mono float32 samples.

        Raises:
            RuntimeError: If ffmpeg is not installed or exits with an error.
        """
        cmd = [
            "ffmpeg", "-i", mp4_path,
            "-f", "f32le",
            "-acodec", "pcm_f32le",
            "-ar", str(SAMPLE_RATE),
            "-ac", "1",
            "-loglevel", "error",
            "-"
        ]
        try:
            result = subprocess.run(cmd, capture_output=True, check=True)
        except FileNotFoundError:
            raise RuntimeError(
                "ffmpeg not found. Please install ffmpeg: https://ffmpeg.org/download.html"
            )
        except subprocess.CalledProcessError as e:
            raise RuntimeError(
                f"ffmpeg failed (exit {e.returncode}): {self._stderr_text(e)}"
            ) from e
        return np.frombuffer(result.stdout, dtype=np.float32)

    def _get_duration_sec(self, mp4_path: str) -> float:
        """Return the container duration in seconds as reported by ffprobe.

        Raises:
            RuntimeError: If ffprobe is not installed, exits with an error,
                or its output contains no usable duration.
        """
        import json
        cmd = [
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "json",
            mp4_path
        ]
        try:
            result = subprocess.run(cmd, capture_output=True, check=True, text=True)
        except FileNotFoundError:
            raise RuntimeError(
                "ffprobe not found. Please install ffmpeg: https://ffmpeg.org/download.html"
            )
        except subprocess.CalledProcessError as e:
            # With text=True stderr is already a str; calling .decode() on
            # it would raise AttributeError and hide the real ffprobe error.
            raise RuntimeError(
                f"ffprobe failed (exit {e.returncode}): {self._stderr_text(e)}"
            ) from e
        try:
            return float(json.loads(result.stdout)["format"]["duration"])
        except (KeyError, TypeError, ValueError) as e:
            raise RuntimeError(
                f"ffprobe returned no usable duration for {mp4_path}"
            ) from e

    def split(self, mp4_path: str) -> list[dict]:
        """Extract the audio track and cut it into chunks using VAD.

        Progress information is printed to stdout along the way. When the
        VAD finds no speech at all, the audio is cut uniformly instead.

        Returns:
            ``[{"start": float, "end": float, "audio": np.ndarray}, ...]``
            where ``start``/``end`` are offsets in seconds and ``audio``
            holds the chunk's samples.

        Raises:
            FileNotFoundError: If ``mp4_path`` does not exist.
            RuntimeError: If ffmpeg/ffprobe is missing or fails.
        """
        self._load_vad()

        path = Path(mp4_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {mp4_path}")

        t0 = time.monotonic()

        total_sec = self._get_duration_sec(mp4_path)
        file_size_mb = path.stat().st_size / 1024 / 1024
        print(f" File size: {file_size_mb:.0f}MB, duration: {total_sec:.0f}s ({total_sec/60:.1f}min)")

        print(" ffmpeg extract PCM (16kHz mono)...", end=" ", flush=True)
        t1 = time.monotonic()
        audio = self._ffmpeg_extract_pcm(mp4_path)
        print(f"{len(audio)/SAMPLE_RATE:.0f}s audio in {time.monotonic()-t1:.1f}s")

        print(" silero-vad detecting speech...", end=" ", flush=True)
        t1 = time.monotonic()
        speech_ts = self._get_speech_timestamps(
            audio,
            self._vad_model,
            sampling_rate=SAMPLE_RATE,
            min_speech_duration_ms=self.min_speech_ms,
            min_silence_duration_ms=self.min_silence_ms,
        )
        print(f"{len(speech_ts)} segments in {time.monotonic()-t1:.1f}s")

        if not speech_ts:
            print(" Warning: VAD detected no speech, falling back to uniform split")
            return self._fallback_split(audio)

        print(f" Merging and splitting at {self.max_chunk_sec}s max...", end=" ", flush=True)
        chunks = self._merge_and_chunk(audio, speech_ts)
        total_audio = sum(c["end"] - c["start"] for c in chunks)
        print(f"{len(chunks)} chunks, {total_audio:.0f}s audio, done in {time.monotonic()-t0:.1f}s")

        # preview first few chunks
        preview = min(3, len(chunks))
        for i in range(preview):
            dur = chunks[i]["end"] - chunks[i]["start"]
            print(f" chunk[{i}]: {chunks[i]['start']:.1f}s - {chunks[i]['end']:.1f}s ({dur:.1f}s)")
        if len(chunks) > preview:
            print(f" ... {len(chunks)} total")

        return chunks

    @staticmethod
    def _make_chunk(audio: np.ndarray, start: int, end: int) -> dict:
        """Build one chunk dict for the sample range ``[start, end)``."""
        return {
            "start": start / SAMPLE_RATE,
            "end": end / SAMPLE_RATE,
            "audio": audio[start:end].copy(),
        }

    def _uniform_chunks(self, audio: np.ndarray, start: int, end: int) -> list[dict]:
        """Cut the sample range ``[start, end)`` into max_chunk_sec pieces."""
        # max(1, ...) keeps the range() step valid for very small limits.
        step = max(1, int(self.max_chunk_sec * SAMPLE_RATE))
        return [
            self._make_chunk(audio, s, min(s + step, end))
            for s in range(start, end, step)
        ]

    def _merge_and_chunk(self, audio: np.ndarray, speech_ts: list) -> list[dict]:
        """Merge speech segments separated by < 1 s, then cap their length.

        ``speech_ts`` entries carry ``"start"``/``"end"`` as sample indices
        into ``audio``. Merged spans longer than ``max_chunk_sec`` are cut
        into consecutive pieces of at most that duration.
        """
        merged = []
        for seg in speech_ts:
            start_s = seg["start"]
            end_s = seg["end"]
            if merged and (start_s - merged[-1]["end"]) / SAMPLE_RATE < 1.0:
                merged[-1]["end"] = end_s
            else:
                merged.append({"start": start_s, "end": end_s})

        chunks = []
        for seg in merged:
            start = seg["start"]
            end = seg["end"]
            if (end - start) / SAMPLE_RATE <= self.max_chunk_sec:
                chunks.append(self._make_chunk(audio, start, end))
            else:
                chunks.extend(self._uniform_chunks(audio, start, end))

        return chunks

    def _fallback_split(self, audio: np.ndarray) -> list[dict]:
        """Fallback when VAD finds nothing: cut the whole audio uniformly."""
        return self._uniform_chunks(audio, 0, len(audio))
|