@xdfnet/ispeak 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Docs/ARCHITECTURE.md +177 -0
- package/Docs/HTTP Chunked:SSE单向流式-V3.md +896 -0
- package/Docs/声音复刻API-V3.md +873 -0
- package/Docs/音色列表.md +998 -0
- package/LICENSE +21 -0
- package/README.md +194 -0
- package/configs/com.iSpeak.plist +20 -0
- package/configs/config.example.json +18 -0
- package/configs/hook-speak.sh +76 -0
- package/go.mod +5 -0
- package/go.sum +4 -0
- package/main.go +858 -0
- package/npm/postinstall.js +134 -0
- package/package.json +46 -0
- package/scripts/ispeak +58 -0
package/main.go
ADDED
|
@@ -0,0 +1,858 @@
|
|
|
1
|
+
// ttsd — 独立 TTS 播报守护进程
|
|
2
|
+
// 监听 Unix Socket,收到文本 → 字节跳动 TTS SSE → 流式播放
|
|
3
|
+
package main
|
|
4
|
+
|
|
5
|
+
import (
|
|
6
|
+
"bufio"
|
|
7
|
+
"context"
|
|
8
|
+
"encoding/base64"
|
|
9
|
+
"encoding/json"
|
|
10
|
+
"errors"
|
|
11
|
+
"fmt"
|
|
12
|
+
"io"
|
|
13
|
+
"log"
|
|
14
|
+
"net"
|
|
15
|
+
"net/http"
|
|
16
|
+
"os"
|
|
17
|
+
"os/exec"
|
|
18
|
+
"os/signal"
|
|
19
|
+
"path/filepath"
|
|
20
|
+
"strings"
|
|
21
|
+
"sync"
|
|
22
|
+
"syscall"
|
|
23
|
+
"time"
|
|
24
|
+
|
|
25
|
+
"gopkg.in/natefinch/lumberjack.v2"
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
// ttsMaxAttempts is the total number of synthesize-and-play attempts made for
// a single task before it is dropped.
const ttsMaxAttempts = 2

// ttsRetryBackoff is the pause inserted between two consecutive attempts.
const ttsRetryBackoff = 400 * time.Millisecond
|
|
32
|
+
|
|
33
|
+
// configDir is the per-user directory that holds the config file, the log
// file and the Unix socket. It is derived from $HOME once at start-up.
var configDir = os.Getenv("HOME") + "/.config/iSpeak"
|
|
34
|
+
|
|
35
|
+
var (
|
|
36
|
+
configCacheMu sync.Mutex
|
|
37
|
+
configCachePath string
|
|
38
|
+
configCacheModTime time.Time
|
|
39
|
+
configCache Config
|
|
40
|
+
configCacheValid bool
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
// ttsHTTPClient is the shared HTTP client used for all TTS requests.
var ttsHTTPClient = &http.Client{
	Timeout: 30 * time.Second,
}

// tempDir is the process-wide temporary directory (removed when the process
// exits). It is assigned in main.
var tempDir string

// errAlreadyRunning is returned by listenUnixSocket when another instance
// answers on the socket.
var errAlreadyRunning = errors.New("iSpeak already running")
|
|
49
|
+
|
|
50
|
+
// StreamPlayer consumes audio bytes as they are produced by the synthesizer.
type StreamPlayer interface {
	// Write hands one chunk of audio to the player.
	Write(audio []byte) error
	// CloseAndWait signals that no more audio will be written and blocks
	// until the player has finished.
	CloseAndWait() error
	// Abort stops the player and discards anything not yet played.
	Abort() error
}
|
|
55
|
+
|
|
56
|
+
// ffplayStreamPlayer is a StreamPlayer backed by an ffplay child process that
// reads audio from its standard input.
type ffplayStreamPlayer struct {
	// path is the ffplay executable that was started.
	path string
	// cmd is the running ffplay process.
	cmd *exec.Cmd
	// stdin is the write end of the pipe feeding ffplay; it is set to nil
	// once it has been closed.
	stdin io.WriteCloser
}
|
|
61
|
+
|
|
62
|
+
func newDefaultStreamPlayer() (StreamPlayer, error) {
|
|
63
|
+
if path, ok := findExecutable("ffplay", "/opt/homebrew/bin/ffplay", "/usr/local/bin/ffplay"); ok {
|
|
64
|
+
log.Printf("播放器模式: ffplay 流式 stdin (%s)", path)
|
|
65
|
+
return newFFplayStreamPlayer(path)
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
log.Printf("播放器模式: afplay 完整音频 fallback")
|
|
69
|
+
return &bufferedStreamPlayer{}, nil
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// findExecutable resolves name through PATH first. If that fails it returns
// the first entry of candidates that exists, is not a directory and has at
// least one execute permission bit set. The boolean reports whether anything
// was found.
func findExecutable(name string, candidates ...string) (string, bool) {
	resolved, lookErr := exec.LookPath(name)
	if lookErr == nil {
		return resolved, true
	}

	for i := range candidates {
		info, statErr := os.Stat(candidates[i])
		if statErr != nil || info.IsDir() {
			continue
		}
		if info.Mode()&0111 == 0 {
			continue
		}
		return candidates[i], true
	}
	return "", false
}
|
|
83
|
+
|
|
84
|
+
func newFFplayStreamPlayer(path string) (*ffplayStreamPlayer, error) {
|
|
85
|
+
cmd := exec.Command(path, "-nodisp", "-autoexit", "-loglevel", "error", "-i", "pipe:0")
|
|
86
|
+
stdin, err := cmd.StdinPipe()
|
|
87
|
+
if err != nil {
|
|
88
|
+
return nil, err
|
|
89
|
+
}
|
|
90
|
+
cmd.Stderr = os.Stderr
|
|
91
|
+
if err := cmd.Start(); err != nil {
|
|
92
|
+
_ = stdin.Close()
|
|
93
|
+
return nil, err
|
|
94
|
+
}
|
|
95
|
+
return &ffplayStreamPlayer{path: path, cmd: cmd, stdin: stdin}, nil
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
func (p *ffplayStreamPlayer) Write(audio []byte) error {
|
|
99
|
+
if len(audio) == 0 {
|
|
100
|
+
return nil
|
|
101
|
+
}
|
|
102
|
+
if _, err := p.stdin.Write(audio); err != nil {
|
|
103
|
+
return fmt.Errorf("写入播放器失败: %w", err)
|
|
104
|
+
}
|
|
105
|
+
return nil
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
func (p *ffplayStreamPlayer) CloseAndWait() error {
|
|
109
|
+
if p.stdin != nil {
|
|
110
|
+
if err := p.stdin.Close(); err != nil {
|
|
111
|
+
return fmt.Errorf("关闭播放器输入失败: %w", err)
|
|
112
|
+
}
|
|
113
|
+
p.stdin = nil
|
|
114
|
+
}
|
|
115
|
+
if err := p.cmd.Wait(); err != nil {
|
|
116
|
+
return fmt.Errorf("ffplay failed: %w", err)
|
|
117
|
+
}
|
|
118
|
+
return nil
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
func (p *ffplayStreamPlayer) Abort() error {
|
|
122
|
+
if p.stdin != nil {
|
|
123
|
+
_ = p.stdin.Close()
|
|
124
|
+
p.stdin = nil
|
|
125
|
+
}
|
|
126
|
+
if p.cmd != nil && p.cmd.Process != nil {
|
|
127
|
+
_ = p.cmd.Process.Kill()
|
|
128
|
+
_ = p.cmd.Wait()
|
|
129
|
+
}
|
|
130
|
+
return nil
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
// bufferedStreamPlayer is a StreamPlayer that collects every chunk in memory
// and plays the whole recording only when CloseAndWait is called.
type bufferedStreamPlayer struct {
	// chunks holds private copies of the audio chunks in arrival order.
	chunks [][]byte
}
|
|
136
|
+
|
|
137
|
+
func (p *bufferedStreamPlayer) Write(audio []byte) error {
|
|
138
|
+
if len(audio) == 0 {
|
|
139
|
+
return nil
|
|
140
|
+
}
|
|
141
|
+
chunk := append([]byte(nil), audio...)
|
|
142
|
+
p.chunks = append(p.chunks, chunk)
|
|
143
|
+
return nil
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
func (p *bufferedStreamPlayer) CloseAndWait() error {
|
|
147
|
+
total := 0
|
|
148
|
+
for _, chunk := range p.chunks {
|
|
149
|
+
total += len(chunk)
|
|
150
|
+
}
|
|
151
|
+
audio := make([]byte, 0, total)
|
|
152
|
+
for _, chunk := range p.chunks {
|
|
153
|
+
audio = append(audio, chunk...)
|
|
154
|
+
}
|
|
155
|
+
return playAudio(audio)
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
func (p *bufferedStreamPlayer) Abort() error {
|
|
159
|
+
p.chunks = nil
|
|
160
|
+
return nil
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// TaskStatus is the state of a task.
// Lifecycle: pending_synth -> speaking -> delete.
type TaskStatus int

const (
	// TaskStatusPendingSynth marks a task that is queued and not yet claimed
	// by the worker.
	TaskStatusPendingSynth TaskStatus = iota
	// TaskStatusSpeaking marks a task the worker has claimed.
	TaskStatusSpeaking
)
|
|
171
|
+
|
|
172
|
+
// 单个 TTS 任务
|
|
173
|
+
type Task struct {
|
|
174
|
+
ID uint64
|
|
175
|
+
Text string
|
|
176
|
+
Status TaskStatus
|
|
177
|
+
Voice VoiceInfo
|
|
178
|
+
Cfg Config
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
// 任务引擎:任务仓库 + 单流式合成播放 worker
|
|
182
|
+
type TaskEngine struct {
|
|
183
|
+
mu sync.Mutex
|
|
184
|
+
|
|
185
|
+
nextID uint64
|
|
186
|
+
tasks map[uint64]*Task
|
|
187
|
+
pendingSynth []uint64
|
|
188
|
+
|
|
189
|
+
synthWake chan struct{}
|
|
190
|
+
|
|
191
|
+
synthesizeStreamFn func(ctx context.Context, cfg Config, text string, voice *VoiceInfo, onAudio func([]byte) error) error
|
|
192
|
+
newStreamPlayerFn func() (StreamPlayer, error)
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
func NewTaskEngine() *TaskEngine {
|
|
196
|
+
return &TaskEngine{
|
|
197
|
+
tasks: make(map[uint64]*Task),
|
|
198
|
+
synthWake: make(chan struct{}, 1),
|
|
199
|
+
synthesizeStreamFn: synthesizeStream,
|
|
200
|
+
newStreamPlayerFn: newDefaultStreamPlayer,
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
func (e *TaskEngine) Start() {
|
|
205
|
+
go e.speakWorker()
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
func (e *TaskEngine) Submit(text string, voice VoiceInfo, cfg Config) uint64 {
|
|
209
|
+
e.mu.Lock()
|
|
210
|
+
defer e.mu.Unlock()
|
|
211
|
+
|
|
212
|
+
// 新任务进来先删所有未开始合成任务
|
|
213
|
+
for _, id := range e.pendingSynth {
|
|
214
|
+
delete(e.tasks, id)
|
|
215
|
+
log.Printf("删除待合成任务: id=%d", id)
|
|
216
|
+
}
|
|
217
|
+
e.pendingSynth = e.pendingSynth[:0]
|
|
218
|
+
|
|
219
|
+
e.nextID++
|
|
220
|
+
task := &Task{
|
|
221
|
+
ID: e.nextID,
|
|
222
|
+
Text: text,
|
|
223
|
+
Status: TaskStatusPendingSynth,
|
|
224
|
+
Voice: voice,
|
|
225
|
+
Cfg: cfg,
|
|
226
|
+
}
|
|
227
|
+
e.tasks[task.ID] = task
|
|
228
|
+
e.pendingSynth = append(e.pendingSynth, task.ID)
|
|
229
|
+
log.Printf("任务创建: id=%d text=%s", task.ID, text)
|
|
230
|
+
|
|
231
|
+
notify(e.synthWake)
|
|
232
|
+
return task.ID
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
func (e *TaskEngine) speakWorker() {
|
|
236
|
+
for {
|
|
237
|
+
id := e.claimPendingSynth()
|
|
238
|
+
if id == 0 {
|
|
239
|
+
<-e.synthWake
|
|
240
|
+
continue
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
e.processSpeakTask(id)
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
func (e *TaskEngine) processSpeakTask(id uint64) {
|
|
248
|
+
defer func() {
|
|
249
|
+
if r := recover(); r != nil {
|
|
250
|
+
log.Printf("播报任务崩溃并删除: id=%d err=%v", id, r)
|
|
251
|
+
e.deleteTask(id)
|
|
252
|
+
}
|
|
253
|
+
}()
|
|
254
|
+
|
|
255
|
+
task, ok := e.getTask(id)
|
|
256
|
+
if !ok {
|
|
257
|
+
return
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
var lastErr error
|
|
261
|
+
for i := 1; i <= ttsMaxAttempts; i++ {
|
|
262
|
+
lastErr = e.speakOnce(context.Background(), task)
|
|
263
|
+
if lastErr == nil {
|
|
264
|
+
break
|
|
265
|
+
}
|
|
266
|
+
if i < ttsMaxAttempts {
|
|
267
|
+
time.Sleep(ttsRetryBackoff)
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
if lastErr != nil {
|
|
272
|
+
log.Printf("播报失败并删除任务: id=%d err=%v", id, lastErr)
|
|
273
|
+
e.deleteTask(id)
|
|
274
|
+
return
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
log.Printf("播报完成并删除任务: id=%d", id)
|
|
278
|
+
e.deleteTask(id)
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
func (e *TaskEngine) speakOnce(ctx context.Context, task *Task) error {
|
|
282
|
+
startedAt := time.Now()
|
|
283
|
+
player, err := e.newStreamPlayerFn()
|
|
284
|
+
if err != nil {
|
|
285
|
+
return fmt.Errorf("启动播放器失败: %w", err)
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
firstChunkLogged := false
|
|
289
|
+
onAudio := func(audio []byte) error {
|
|
290
|
+
if len(audio) > 0 && !firstChunkLogged {
|
|
291
|
+
firstChunkLogged = true
|
|
292
|
+
log.Printf("首个音频 chunk: id=%d elapsed=%s bytes=%d", task.ID, time.Since(startedAt).Round(time.Millisecond), len(audio))
|
|
293
|
+
}
|
|
294
|
+
return player.Write(audio)
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
if err := e.synthesizeStreamFn(ctx, task.Cfg, task.Text, &task.Voice, onAudio); err != nil {
|
|
298
|
+
_ = player.Abort()
|
|
299
|
+
return err
|
|
300
|
+
}
|
|
301
|
+
log.Printf("TTS 流结束: id=%d elapsed=%s", task.ID, time.Since(startedAt).Round(time.Millisecond))
|
|
302
|
+
if err := player.CloseAndWait(); err != nil {
|
|
303
|
+
_ = player.Abort()
|
|
304
|
+
return err
|
|
305
|
+
}
|
|
306
|
+
return nil
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
func (e *TaskEngine) claimPendingSynth() uint64 {
|
|
310
|
+
e.mu.Lock()
|
|
311
|
+
defer e.mu.Unlock()
|
|
312
|
+
|
|
313
|
+
for len(e.pendingSynth) > 0 {
|
|
314
|
+
id := e.pendingSynth[0]
|
|
315
|
+
e.pendingSynth = e.pendingSynth[1:]
|
|
316
|
+
task, ok := e.tasks[id]
|
|
317
|
+
if !ok {
|
|
318
|
+
continue
|
|
319
|
+
}
|
|
320
|
+
if task.Status != TaskStatusPendingSynth {
|
|
321
|
+
continue
|
|
322
|
+
}
|
|
323
|
+
task.Status = TaskStatusSpeaking
|
|
324
|
+
return id
|
|
325
|
+
}
|
|
326
|
+
return 0
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
func (e *TaskEngine) getTask(id uint64) (*Task, bool) {
|
|
330
|
+
e.mu.Lock()
|
|
331
|
+
defer e.mu.Unlock()
|
|
332
|
+
|
|
333
|
+
task, ok := e.tasks[id]
|
|
334
|
+
if !ok {
|
|
335
|
+
return nil, false
|
|
336
|
+
}
|
|
337
|
+
clone := *task
|
|
338
|
+
return &clone, true
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
func (e *TaskEngine) deleteTask(id uint64) {
|
|
342
|
+
e.mu.Lock()
|
|
343
|
+
defer e.mu.Unlock()
|
|
344
|
+
delete(e.tasks, id)
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
// notify performs a non-blocking send on ch: the signal is delivered if the
// channel can accept it right now and silently dropped otherwise.
func notify(ch chan struct{}) {
	select {
	default:
		// Channel full (or nobody ready): a wake-up is already pending.
	case ch <- struct{}{}:
	}
}
|
|
353
|
+
|
|
354
|
+
// VoiceInfo describes a voice.
type VoiceInfo struct {
	// VoiceType is sent as the request's "speaker" field.
	VoiceType string `json:"voice_type"`
	// ResourceID is sent in the X-Api-Resource-Id request header.
	ResourceID string `json:"resourceId"`
}
|
|
359
|
+
|
|
360
|
+
// TTS 配置
|
|
361
|
+
type Config struct {
|
|
362
|
+
APIKey string `json:"apiKey"`
|
|
363
|
+
Endpoint string `json:"endpoint"`
|
|
364
|
+
DefaultVoice *VoiceInfo `json:"defaultVoice"` // 默认音色
|
|
365
|
+
SourceVoices map[string]*VoiceInfo `json:"sourceVoices"` // 来源 → 音色 映射
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
func loadConfig() Config {
|
|
369
|
+
configPaths := []string{
|
|
370
|
+
configDir + "/config.json",
|
|
371
|
+
}
|
|
372
|
+
for _, p := range configPaths {
|
|
373
|
+
st, statErr := os.Stat(p)
|
|
374
|
+
if statErr == nil {
|
|
375
|
+
configCacheMu.Lock()
|
|
376
|
+
if configCacheValid && configCachePath == p && st.ModTime().Equal(configCacheModTime) {
|
|
377
|
+
cached := configCache
|
|
378
|
+
configCacheMu.Unlock()
|
|
379
|
+
return cached
|
|
380
|
+
}
|
|
381
|
+
configCacheMu.Unlock()
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
data, err := os.ReadFile(p)
|
|
385
|
+
if err != nil {
|
|
386
|
+
continue
|
|
387
|
+
}
|
|
388
|
+
var cfg Config
|
|
389
|
+
if json.Unmarshal(data, &cfg) == nil && cfg.APIKey != "" {
|
|
390
|
+
log.Printf("配置文件: %s", p)
|
|
391
|
+
if cfg.DefaultVoice != nil {
|
|
392
|
+
log.Printf("默认音色: %s (%s)", cfg.DefaultVoice.VoiceType, cfg.DefaultVoice.ResourceID)
|
|
393
|
+
}
|
|
394
|
+
for source, v := range cfg.SourceVoices {
|
|
395
|
+
log.Printf("来源 %s → %s (%s)", source, v.VoiceType, v.ResourceID)
|
|
396
|
+
}
|
|
397
|
+
if st, stErr := os.Stat(p); stErr == nil {
|
|
398
|
+
configCacheMu.Lock()
|
|
399
|
+
configCachePath = p
|
|
400
|
+
configCacheModTime = st.ModTime()
|
|
401
|
+
configCache = cfg
|
|
402
|
+
configCacheValid = true
|
|
403
|
+
configCacheMu.Unlock()
|
|
404
|
+
}
|
|
405
|
+
return cfg
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
// 回退到环境变量
|
|
410
|
+
return Config{
|
|
411
|
+
APIKey: envOrDefault("IAGENT_TTS_API_KEY", ""),
|
|
412
|
+
Endpoint: envOrDefault("IAGENT_TTS_ENDPOINT", "https://openspeech.bytedance.com/api/v3/tts/unidirectional"),
|
|
413
|
+
}
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
// envOrDefault returns the value of environment variable key, or fallback
// when the variable is unset or empty.
func envOrDefault(key, fallback string) string {
	value := os.Getenv(key)
	if value == "" {
		return fallback
	}
	return value
}
|
|
422
|
+
|
|
423
|
+
// TTS request body types, serialized as JSON.
type (
	// ttsRequest is the top-level request object.
	ttsRequest struct {
		User      ttsUser      `json:"user"`
		Namespace string       `json:"namespace"`
		ReqParams ttsReqParams `json:"req_params"`
	}

	// ttsUser carries the caller identifier.
	ttsUser struct {
		UID string `json:"uid"`
	}

	// ttsReqParams holds the text, the speaker and the audio settings.
	ttsReqParams struct {
		Text        string         `json:"text"`
		Speaker     string         `json:"speaker"`
		AudioParams ttsAudioParams `json:"audio_params"`
	}

	// ttsAudioParams selects the output encoding and sample rate.
	ttsAudioParams struct {
		Format     string `json:"format"`
		SampleRate int    `json:"sample_rate"`
	}
)
|
|
444
|
+
|
|
445
|
+
// 调用字节跳动 TTS API,返回完整 MP3 音频数据。保留给测试和 fallback 使用。
|
|
446
|
+
func synthesize(ctx context.Context, cfg Config, text string, voice *VoiceInfo) ([]byte, error) {
|
|
447
|
+
var chunks [][]byte
|
|
448
|
+
if err := synthesizeStream(ctx, cfg, text, voice, func(audio []byte) error {
|
|
449
|
+
chunk := append([]byte(nil), audio...)
|
|
450
|
+
chunks = append(chunks, chunk)
|
|
451
|
+
return nil
|
|
452
|
+
}); err != nil {
|
|
453
|
+
return nil, err
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
total := 0
|
|
457
|
+
for _, c := range chunks {
|
|
458
|
+
total += len(c)
|
|
459
|
+
}
|
|
460
|
+
result := make([]byte, 0, total)
|
|
461
|
+
for _, c := range chunks {
|
|
462
|
+
result = append(result, c...)
|
|
463
|
+
}
|
|
464
|
+
return result, nil
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
// 调用字节跳动 TTS API,边解析 SSE 边回调 MP3 音频块
|
|
468
|
+
func synthesizeStream(ctx context.Context, cfg Config, text string, voice *VoiceInfo, onAudio func([]byte) error) error {
|
|
469
|
+
speaker := voice.VoiceType
|
|
470
|
+
resourceID := voice.ResourceID
|
|
471
|
+
|
|
472
|
+
log.Printf("音色: %s (resourceId: %s)", speaker, resourceID)
|
|
473
|
+
|
|
474
|
+
reqBody := ttsRequest{
|
|
475
|
+
User: ttsUser{UID: fmt.Sprintf("ttsd-%d", time.Now().UnixNano())},
|
|
476
|
+
Namespace: "BidirectionalTTS",
|
|
477
|
+
ReqParams: ttsReqParams{
|
|
478
|
+
Text: text,
|
|
479
|
+
Speaker: speaker,
|
|
480
|
+
AudioParams: ttsAudioParams{
|
|
481
|
+
Format: "mp3",
|
|
482
|
+
SampleRate: 24000,
|
|
483
|
+
},
|
|
484
|
+
},
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
body, err := json.Marshal(reqBody)
|
|
488
|
+
if err != nil {
|
|
489
|
+
return fmt.Errorf("marshal request: %w", err)
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
req, err := http.NewRequestWithContext(ctx, "POST", cfg.Endpoint, strings.NewReader(string(body)))
|
|
493
|
+
if err != nil {
|
|
494
|
+
return fmt.Errorf("create request: %w", err)
|
|
495
|
+
}
|
|
496
|
+
req.Header.Set("Content-Type", "application/json")
|
|
497
|
+
req.Header.Set("X-Api-Key", cfg.APIKey)
|
|
498
|
+
req.Header.Set("X-Api-Resource-Id", resourceID)
|
|
499
|
+
req.Header.Set("X-Api-Request-Id", fmt.Sprintf("ttsd-%d", time.Now().UnixNano()))
|
|
500
|
+
|
|
501
|
+
resp, err := ttsHTTPClient.Do(req)
|
|
502
|
+
if err != nil {
|
|
503
|
+
return fmt.Errorf("http request: %w", err)
|
|
504
|
+
}
|
|
505
|
+
if resp.StatusCode != 200 {
|
|
506
|
+
io.Copy(io.Discard, resp.Body) // 消费 body 以释放连接
|
|
507
|
+
resp.Body.Close()
|
|
508
|
+
return fmt.Errorf("http status %d", resp.StatusCode)
|
|
509
|
+
}
|
|
510
|
+
defer resp.Body.Close()
|
|
511
|
+
|
|
512
|
+
return parseSSEStream(resp.Body, onAudio)
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
// 解析 SSE 流,提取 base64 音频数据
|
|
516
|
+
func parseSSE(r io.Reader) ([]byte, error) {
|
|
517
|
+
var chunks [][]byte
|
|
518
|
+
if err := parseSSEStream(r, func(audio []byte) error {
|
|
519
|
+
chunk := append([]byte(nil), audio...)
|
|
520
|
+
chunks = append(chunks, chunk)
|
|
521
|
+
return nil
|
|
522
|
+
}); err != nil {
|
|
523
|
+
return nil, err
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
total := 0
|
|
527
|
+
for _, c := range chunks {
|
|
528
|
+
total += len(c)
|
|
529
|
+
}
|
|
530
|
+
result := make([]byte, 0, total)
|
|
531
|
+
for _, c := range chunks {
|
|
532
|
+
result = append(result, c...)
|
|
533
|
+
}
|
|
534
|
+
return result, nil
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
func parseSSEStream(r io.Reader, onAudio func([]byte) error) error {
|
|
538
|
+
audioChunks := 0
|
|
539
|
+
scanner := bufio.NewScanner(r)
|
|
540
|
+
scanner.Buffer(make([]byte, 256*1024), 256*1024)
|
|
541
|
+
|
|
542
|
+
var dataLines []string
|
|
543
|
+
|
|
544
|
+
flush := func() error {
|
|
545
|
+
if len(dataLines) == 0 {
|
|
546
|
+
return nil
|
|
547
|
+
}
|
|
548
|
+
payload := strings.Join(dataLines, "\n")
|
|
549
|
+
dataLines = dataLines[:0]
|
|
550
|
+
ok, err := processEvent(payload, onAudio)
|
|
551
|
+
if ok {
|
|
552
|
+
audioChunks++
|
|
553
|
+
}
|
|
554
|
+
return err
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
for scanner.Scan() {
|
|
558
|
+
line := strings.TrimSpace(scanner.Text())
|
|
559
|
+
if line == "" {
|
|
560
|
+
if err := flush(); err != nil {
|
|
561
|
+
return err
|
|
562
|
+
}
|
|
563
|
+
continue
|
|
564
|
+
}
|
|
565
|
+
if strings.HasPrefix(line, ":") || strings.HasPrefix(line, "event:") ||
|
|
566
|
+
strings.HasPrefix(line, "id:") || strings.HasPrefix(line, "retry:") {
|
|
567
|
+
continue
|
|
568
|
+
}
|
|
569
|
+
if strings.HasPrefix(line, "data:") {
|
|
570
|
+
dataLines = append(dataLines, strings.TrimPrefix(line, "data:"))
|
|
571
|
+
continue
|
|
572
|
+
}
|
|
573
|
+
// 非标准 JSON 直出
|
|
574
|
+
if err := flush(); err != nil {
|
|
575
|
+
return err
|
|
576
|
+
}
|
|
577
|
+
ok, err := processEvent(line, onAudio)
|
|
578
|
+
if ok {
|
|
579
|
+
audioChunks++
|
|
580
|
+
}
|
|
581
|
+
if err != nil {
|
|
582
|
+
return err
|
|
583
|
+
}
|
|
584
|
+
}
|
|
585
|
+
if err := flush(); err != nil {
|
|
586
|
+
return err
|
|
587
|
+
}
|
|
588
|
+
if err := scanner.Err(); err != nil {
|
|
589
|
+
return fmt.Errorf("scan: %w", err)
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
if audioChunks == 0 {
|
|
593
|
+
return fmt.Errorf("no audio data")
|
|
594
|
+
}
|
|
595
|
+
return nil
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
func processEvent(payload string, onAudio func([]byte) error) (bool, error) {
|
|
599
|
+
payload = strings.TrimSpace(payload)
|
|
600
|
+
if payload == "" || payload == "[DONE]" {
|
|
601
|
+
return false, nil
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
var event map[string]any
|
|
605
|
+
if err := json.Unmarshal([]byte(payload), &event); err != nil {
|
|
606
|
+
log.Printf("SSE 数据解析失败: %v", err)
|
|
607
|
+
return false, nil
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
if b64 := extractAudioBase64(event); b64 != "" {
|
|
611
|
+
data, err := base64.StdEncoding.DecodeString(b64)
|
|
612
|
+
if err != nil {
|
|
613
|
+
return false, fmt.Errorf("decode audio chunk: %w", err)
|
|
614
|
+
}
|
|
615
|
+
if err := onAudio(data); err != nil {
|
|
616
|
+
return false, err
|
|
617
|
+
}
|
|
618
|
+
return true, nil
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
return false, nil
|
|
622
|
+
}
|
|
623
|
+
|
|
624
|
+
// extractAudioBase64 looks for a base64 audio string in event. It first
// checks the keys "data", "audio" and "audio_data" for a non-empty string,
// then recurses into the objects stored under "data", "result" and "payload".
// It returns "" when nothing is found.
func extractAudioBase64(event map[string]any) string {
	directKeys := [...]string{"data", "audio", "audio_data"}
	for i := range directKeys {
		s, isString := event[directKeys[i]].(string)
		if isString && s != "" {
			return s
		}
	}

	containerKeys := [...]string{"data", "result", "payload"}
	for i := range containerKeys {
		child, isObject := event[containerKeys[i]].(map[string]any)
		if !isObject {
			continue
		}
		if found := extractAudioBase64(child); found != "" {
			return found
		}
	}
	return ""
}
|
|
639
|
+
|
|
640
|
+
// cleanText strips formatting symbols and keeps text that reads naturally
// aloud.
//
// The input is processed line by line. Table separator rows and horizontal
// rules are dropped, the Markdown markers **, *, `, # and > are removed from
// the remaining lines, and the surviving non-empty lines are joined with a
// full-width comma.
func cleanText(text string) string {
	// Built once per call: the replacer does not depend on the line.
	stripMarkup := strings.NewReplacer(
		"**", "",
		"*", "",
		"`", "",
		"#", "",
		">", "",
	)

	var lines []string
	for _, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		if strings.HasPrefix(line, "|---") || strings.HasPrefix(line, "|:---") {
			continue
		}
		if strings.HasPrefix(line, "---") && strings.Count(line, "-") > 3 {
			continue
		}
		// Drop pure table separator rows (|---|---|, :---|:---| and so on)
		// as well as blank lines.
		if strings.Trim(line, "|-: ") == "" {
			continue
		}
		cleaned := strings.TrimSpace(stripMarkup.Replace(line))
		if cleaned != "" {
			lines = append(lines, cleaned)
		}
	}
	return strings.Join(lines, ",")
}
|
|
669
|
+
|
|
670
|
+
func main() {
|
|
671
|
+
log.SetFlags(log.Ltime | log.Lshortfile)
|
|
672
|
+
|
|
673
|
+
// 日志轮转:最大 10MB,保留 3 份
|
|
674
|
+
os.MkdirAll(configDir, 0755)
|
|
675
|
+
log.SetOutput(&lumberjack.Logger{
|
|
676
|
+
Filename: configDir + "/ispeak.log",
|
|
677
|
+
MaxSize: 10,
|
|
678
|
+
MaxBackups: 3,
|
|
679
|
+
Compress: true,
|
|
680
|
+
})
|
|
681
|
+
|
|
682
|
+
// 创建进程级 temp 目录
|
|
683
|
+
cleanupOldTempDirs()
|
|
684
|
+
var err error
|
|
685
|
+
tempDir, err = os.MkdirTemp("", "ttsd-*")
|
|
686
|
+
if err != nil {
|
|
687
|
+
log.Fatalf("创建 temp 目录失败: %v", err)
|
|
688
|
+
}
|
|
689
|
+
defer os.RemoveAll(tempDir)
|
|
690
|
+
|
|
691
|
+
cfg := loadConfig()
|
|
692
|
+
if err := validateConfig(cfg); err != nil {
|
|
693
|
+
log.Fatalf("配置错误: %v", err)
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
socketPath := configDir + "/ispeak.sock"
|
|
697
|
+
listener, err := listenUnixSocket(socketPath)
|
|
698
|
+
if err != nil {
|
|
699
|
+
if errors.Is(err, errAlreadyRunning) {
|
|
700
|
+
log.Fatalf("iSpeak 已在运行,请先关闭旧实例或重启")
|
|
701
|
+
}
|
|
702
|
+
log.Fatalf("监听 socket 失败: %v", err)
|
|
703
|
+
}
|
|
704
|
+
defer os.Remove(socketPath)
|
|
705
|
+
|
|
706
|
+
sigCh := make(chan os.Signal, 1)
|
|
707
|
+
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
|
708
|
+
go func() {
|
|
709
|
+
<-sigCh
|
|
710
|
+
listener.Close()
|
|
711
|
+
}()
|
|
712
|
+
|
|
713
|
+
engine := NewTaskEngine()
|
|
714
|
+
engine.Start()
|
|
715
|
+
|
|
716
|
+
log.Printf("iSpeak 已启动,监听 %s", socketPath)
|
|
717
|
+
for {
|
|
718
|
+
conn, err := listener.Accept()
|
|
719
|
+
if err != nil {
|
|
720
|
+
if strings.Contains(err.Error(), "use of closed") {
|
|
721
|
+
return
|
|
722
|
+
}
|
|
723
|
+
continue
|
|
724
|
+
}
|
|
725
|
+
go handleConnection(conn, engine)
|
|
726
|
+
}
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
func listenUnixSocket(socketPath string) (net.Listener, error) {
|
|
730
|
+
listener, err := net.Listen("unix", socketPath)
|
|
731
|
+
if err == nil {
|
|
732
|
+
return listener, nil
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
if !errors.Is(err, syscall.EADDRINUSE) {
|
|
736
|
+
_ = os.Remove(socketPath)
|
|
737
|
+
listener, retryErr := net.Listen("unix", socketPath)
|
|
738
|
+
if retryErr == nil {
|
|
739
|
+
return listener, nil
|
|
740
|
+
}
|
|
741
|
+
return nil, retryErr
|
|
742
|
+
}
|
|
743
|
+
|
|
744
|
+
conn, dialErr := net.DialTimeout("unix", socketPath, 200*time.Millisecond)
|
|
745
|
+
if dialErr == nil {
|
|
746
|
+
_ = conn.Close()
|
|
747
|
+
return nil, errAlreadyRunning
|
|
748
|
+
}
|
|
749
|
+
|
|
750
|
+
if removeErr := os.Remove(socketPath); removeErr != nil && !errors.Is(removeErr, os.ErrNotExist) {
|
|
751
|
+
return nil, removeErr
|
|
752
|
+
}
|
|
753
|
+
listener, err = net.Listen("unix", socketPath)
|
|
754
|
+
if err != nil {
|
|
755
|
+
return nil, err
|
|
756
|
+
}
|
|
757
|
+
log.Printf("已清理残留 socket: %s", socketPath)
|
|
758
|
+
return listener, nil
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
// cleanupOldTempDirs removes leftover temp directories (left behind when the
// process crashed): every entry of the system temp directory whose name
// starts with "ttsd-" is deleted recursively. Errors are ignored.
func cleanupOldTempDirs() {
	root := os.TempDir()
	entries, err := os.ReadDir(root)
	if err != nil {
		return
	}
	for _, entry := range entries {
		name := entry.Name()
		if !strings.HasPrefix(name, "ttsd-") {
			continue
		}
		os.RemoveAll(filepath.Join(root, name))
	}
}
|
|
773
|
+
|
|
774
|
+
// 校验配置必填项
|
|
775
|
+
func validateConfig(cfg Config) error {
|
|
776
|
+
if cfg.APIKey == "" {
|
|
777
|
+
return fmt.Errorf("apiKey 未设置,编辑 ~/.config/iSpeak/config.json")
|
|
778
|
+
}
|
|
779
|
+
if cfg.Endpoint == "" {
|
|
780
|
+
return fmt.Errorf("endpoint 未设置")
|
|
781
|
+
}
|
|
782
|
+
if cfg.DefaultVoice == nil || cfg.DefaultVoice.VoiceType == "" {
|
|
783
|
+
return fmt.Errorf("defaultVoice 未设置")
|
|
784
|
+
}
|
|
785
|
+
return nil
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
func playAudio(data []byte) error {
|
|
789
|
+
tmpFile := filepath.Join(tempDir, fmt.Sprintf("ttsd-%d.mp3", time.Now().UnixNano()))
|
|
790
|
+
if err := os.WriteFile(tmpFile, data, 0644); err != nil {
|
|
791
|
+
return fmt.Errorf("写入临时文件失败: %w", err)
|
|
792
|
+
}
|
|
793
|
+
defer os.Remove(tmpFile)
|
|
794
|
+
|
|
795
|
+
cmd := exec.Command("/usr/bin/afplay", tmpFile)
|
|
796
|
+
log.Printf("播放开始: %s", filepath.Base(tmpFile))
|
|
797
|
+
if err := cmd.Run(); err != nil {
|
|
798
|
+
return fmt.Errorf("播放失败: %w", err)
|
|
799
|
+
}
|
|
800
|
+
return nil
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
func handleConnection(conn net.Conn, engine *TaskEngine) {
|
|
804
|
+
defer func() {
|
|
805
|
+
if r := recover(); r != nil {
|
|
806
|
+
log.Printf("连接处理崩溃: %v", r)
|
|
807
|
+
}
|
|
808
|
+
}()
|
|
809
|
+
defer conn.Close()
|
|
810
|
+
|
|
811
|
+
cfg := loadConfig()
|
|
812
|
+
|
|
813
|
+
var sb strings.Builder
|
|
814
|
+
scanner := bufio.NewScanner(conn)
|
|
815
|
+
scanner.Buffer(make([]byte, 1*1024*1024), 1*1024*1024)
|
|
816
|
+
for scanner.Scan() {
|
|
817
|
+
sb.WriteString(scanner.Text())
|
|
818
|
+
}
|
|
819
|
+
|
|
820
|
+
text := strings.TrimSpace(sb.String())
|
|
821
|
+
if text == "" {
|
|
822
|
+
return
|
|
823
|
+
}
|
|
824
|
+
|
|
825
|
+
voice, content := extractVoicePrefix(text, cfg)
|
|
826
|
+
if voice == nil {
|
|
827
|
+
voice = cfg.DefaultVoice
|
|
828
|
+
}
|
|
829
|
+
if voice == nil {
|
|
830
|
+
log.Printf("未配置默认音色")
|
|
831
|
+
return
|
|
832
|
+
}
|
|
833
|
+
|
|
834
|
+
cleaned := cleanText(content)
|
|
835
|
+
if cleaned == "" {
|
|
836
|
+
return
|
|
837
|
+
}
|
|
838
|
+
|
|
839
|
+
log.Printf("TTS: %s", cleaned)
|
|
840
|
+
engine.Submit(cleaned, *voice, cfg)
|
|
841
|
+
}
|
|
842
|
+
|
|
843
|
+
// 解析消息中的音色前缀,返回 VoiceInfo
|
|
844
|
+
func extractVoicePrefix(text string, cfg Config) (voice *VoiceInfo, content string) {
|
|
845
|
+
// 格式: {source:claude}文本
|
|
846
|
+
const prefix = "{source:"
|
|
847
|
+
if strings.HasPrefix(text, prefix) {
|
|
848
|
+
if end := strings.Index(text, "}"); end > len(prefix) {
|
|
849
|
+
if v, ok := cfg.SourceVoices[text[len(prefix):end]]; ok {
|
|
850
|
+
voice = v
|
|
851
|
+
}
|
|
852
|
+
content = text[end+1:]
|
|
853
|
+
return
|
|
854
|
+
}
|
|
855
|
+
}
|
|
856
|
+
content = text
|
|
857
|
+
return
|
|
858
|
+
}
|