@xdfnet/ispeak 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/main.go ADDED
@@ -0,0 +1,858 @@
1
+ // ttsd — 独立 TTS 播报守护进程
2
+ // 监听 Unix Socket,收到文本 → 字节跳动 TTS SSE → 流式播放
3
+ package main
4
+
5
+ import (
6
+ "bufio"
7
+ "context"
8
+ "encoding/base64"
9
+ "encoding/json"
10
+ "errors"
11
+ "fmt"
12
+ "io"
13
+ "log"
14
+ "net"
15
+ "net/http"
16
+ "os"
17
+ "os/exec"
18
+ "os/signal"
19
+ "path/filepath"
20
+ "strings"
21
+ "sync"
22
+ "syscall"
23
+ "time"
24
+
25
+ "gopkg.in/natefinch/lumberjack.v2"
26
+ )
27
+
28
// Retry policy applied to each TTS task by processSpeakTask.
const (
	ttsMaxAttempts  = 2                      // total attempts per task, including the first
	ttsRetryBackoff = 400 * time.Millisecond // pause between consecutive attempts
)

// configDir is the per-user directory that holds config.json, the log file
// and the Unix socket.
var configDir = os.ExpandEnv("$HOME/.config/iSpeak")

// Cache of the most recently parsed config file, keyed by path and
// modification time. All fields are read and written under configCacheMu.
var (
	configCacheMu      sync.Mutex
	configCachePath    string
	configCacheModTime time.Time
	configCache        Config
	configCacheValid   bool
)

// ttsHTTPClient is the shared client used for every TTS request.
// NOTE(review): http.Client.Timeout also bounds reading the response body, so
// a streamed synthesis that lasts longer than 30s would be cut off — confirm
// this is intended.
var ttsHTTPClient = &http.Client{Timeout: 30 * time.Second}

// Process-level temp directory (removed when the process exits).
var tempDir string

// errAlreadyRunning is returned by listenUnixSocket when another instance
// answers on the socket.
var errAlreadyRunning = errors.New("iSpeak already running")
49
+
50
// StreamPlayer receives the audio produced by one synthesis attempt.
type StreamPlayer interface {
	// Write hands the next chunk of audio bytes to the player.
	Write(audio []byte) error
	// CloseAndWait signals end of input and blocks until playback is done.
	CloseAndWait() error
	// Abort abandons playback and releases whatever the player holds.
	Abort() error
}
55
+
56
// ffplayStreamPlayer plays audio by piping it into the stdin of an ffplay
// child process.
type ffplayStreamPlayer struct {
	path  string         // resolved ffplay executable
	cmd   *exec.Cmd      // the running ffplay process
	stdin io.WriteCloser // write end of ffplay's stdin; set to nil once closed
}
61
+
62
+ func newDefaultStreamPlayer() (StreamPlayer, error) {
63
+ if path, ok := findExecutable("ffplay", "/opt/homebrew/bin/ffplay", "/usr/local/bin/ffplay"); ok {
64
+ log.Printf("播放器模式: ffplay 流式 stdin (%s)", path)
65
+ return newFFplayStreamPlayer(path)
66
+ }
67
+
68
+ log.Printf("播放器模式: afplay 完整音频 fallback")
69
+ return &bufferedStreamPlayer{}, nil
70
+ }
71
+
72
// findExecutable resolves name through PATH first and, if that fails, returns
// the first entry of candidates that exists, is not a directory and has at
// least one execute permission bit set. The boolean reports whether anything
// was found.
func findExecutable(name string, candidates ...string) (string, bool) {
	resolved, lookErr := exec.LookPath(name)
	if lookErr == nil {
		return resolved, true
	}

	for i := range candidates {
		info, statErr := os.Stat(candidates[i])
		if statErr != nil || info.IsDir() {
			continue
		}
		if info.Mode()&0111 == 0 {
			continue
		}
		return candidates[i], true
	}
	return "", false
}
83
+
84
+ func newFFplayStreamPlayer(path string) (*ffplayStreamPlayer, error) {
85
+ cmd := exec.Command(path, "-nodisp", "-autoexit", "-loglevel", "error", "-i", "pipe:0")
86
+ stdin, err := cmd.StdinPipe()
87
+ if err != nil {
88
+ return nil, err
89
+ }
90
+ cmd.Stderr = os.Stderr
91
+ if err := cmd.Start(); err != nil {
92
+ _ = stdin.Close()
93
+ return nil, err
94
+ }
95
+ return &ffplayStreamPlayer{path: path, cmd: cmd, stdin: stdin}, nil
96
+ }
97
+
98
+ func (p *ffplayStreamPlayer) Write(audio []byte) error {
99
+ if len(audio) == 0 {
100
+ return nil
101
+ }
102
+ if _, err := p.stdin.Write(audio); err != nil {
103
+ return fmt.Errorf("写入播放器失败: %w", err)
104
+ }
105
+ return nil
106
+ }
107
+
108
+ func (p *ffplayStreamPlayer) CloseAndWait() error {
109
+ if p.stdin != nil {
110
+ if err := p.stdin.Close(); err != nil {
111
+ return fmt.Errorf("关闭播放器输入失败: %w", err)
112
+ }
113
+ p.stdin = nil
114
+ }
115
+ if err := p.cmd.Wait(); err != nil {
116
+ return fmt.Errorf("ffplay failed: %w", err)
117
+ }
118
+ return nil
119
+ }
120
+
121
+ func (p *ffplayStreamPlayer) Abort() error {
122
+ if p.stdin != nil {
123
+ _ = p.stdin.Close()
124
+ p.stdin = nil
125
+ }
126
+ if p.cmd != nil && p.cmd.Process != nil {
127
+ _ = p.cmd.Process.Kill()
128
+ _ = p.cmd.Wait()
129
+ }
130
+ return nil
131
+ }
132
+
133
// bufferedStreamPlayer accumulates audio chunks in memory and plays them in
// one go when CloseAndWait is called.
type bufferedStreamPlayer struct {
	chunks [][]byte // private copies of the chunks passed to Write, in arrival order
}
136
+
137
+ func (p *bufferedStreamPlayer) Write(audio []byte) error {
138
+ if len(audio) == 0 {
139
+ return nil
140
+ }
141
+ chunk := append([]byte(nil), audio...)
142
+ p.chunks = append(p.chunks, chunk)
143
+ return nil
144
+ }
145
+
146
+ func (p *bufferedStreamPlayer) CloseAndWait() error {
147
+ total := 0
148
+ for _, chunk := range p.chunks {
149
+ total += len(chunk)
150
+ }
151
+ audio := make([]byte, 0, total)
152
+ for _, chunk := range p.chunks {
153
+ audio = append(audio, chunk...)
154
+ }
155
+ return playAudio(audio)
156
+ }
157
+
158
+ func (p *bufferedStreamPlayer) Abort() error {
159
+ p.chunks = nil
160
+ return nil
161
+ }
162
+
163
// TaskStatus is the state of a task.
// Lifecycle: pending_synth -> speaking -> delete
type TaskStatus int

const (
	// TaskStatusPendingSynth marks a task that is queued and not yet claimed
	// by the worker.
	TaskStatusPendingSynth TaskStatus = iota
	// TaskStatusSpeaking marks a task the worker has claimed for synthesis
	// and playback.
	TaskStatusSpeaking
)
171
+
172
// Task is a single TTS job.
type Task struct {
	ID     uint64     // engine-assigned identifier; Submit starts numbering at 1
	Text   string     // text to synthesize
	Status TaskStatus // current lifecycle state
	Voice  VoiceInfo  // voice used for this task
	Cfg    Config     // configuration captured when the task was submitted
}
180
+
181
// TaskEngine is the task store plus a single streaming synthesize-and-play
// worker.
type TaskEngine struct {
	mu sync.Mutex // guards nextID, tasks and pendingSynth

	nextID       uint64           // last ID handed out by Submit
	tasks        map[uint64]*Task // all live tasks by ID
	pendingSynth []uint64         // IDs waiting to be claimed, oldest first

	synthWake chan struct{} // wakes speakWorker when a task is submitted

	// Hooks that NewTaskEngine sets to synthesizeStream and
	// newDefaultStreamPlayer.
	synthesizeStreamFn func(ctx context.Context, cfg Config, text string, voice *VoiceInfo, onAudio func([]byte) error) error
	newStreamPlayerFn  func() (StreamPlayer, error)
}
194
+
195
+ func NewTaskEngine() *TaskEngine {
196
+ return &TaskEngine{
197
+ tasks: make(map[uint64]*Task),
198
+ synthWake: make(chan struct{}, 1),
199
+ synthesizeStreamFn: synthesizeStream,
200
+ newStreamPlayerFn: newDefaultStreamPlayer,
201
+ }
202
+ }
203
+
204
+ func (e *TaskEngine) Start() {
205
+ go e.speakWorker()
206
+ }
207
+
208
+ func (e *TaskEngine) Submit(text string, voice VoiceInfo, cfg Config) uint64 {
209
+ e.mu.Lock()
210
+ defer e.mu.Unlock()
211
+
212
+ // 新任务进来先删所有未开始合成任务
213
+ for _, id := range e.pendingSynth {
214
+ delete(e.tasks, id)
215
+ log.Printf("删除待合成任务: id=%d", id)
216
+ }
217
+ e.pendingSynth = e.pendingSynth[:0]
218
+
219
+ e.nextID++
220
+ task := &Task{
221
+ ID: e.nextID,
222
+ Text: text,
223
+ Status: TaskStatusPendingSynth,
224
+ Voice: voice,
225
+ Cfg: cfg,
226
+ }
227
+ e.tasks[task.ID] = task
228
+ e.pendingSynth = append(e.pendingSynth, task.ID)
229
+ log.Printf("任务创建: id=%d text=%s", task.ID, text)
230
+
231
+ notify(e.synthWake)
232
+ return task.ID
233
+ }
234
+
235
+ func (e *TaskEngine) speakWorker() {
236
+ for {
237
+ id := e.claimPendingSynth()
238
+ if id == 0 {
239
+ <-e.synthWake
240
+ continue
241
+ }
242
+
243
+ e.processSpeakTask(id)
244
+ }
245
+ }
246
+
247
+ func (e *TaskEngine) processSpeakTask(id uint64) {
248
+ defer func() {
249
+ if r := recover(); r != nil {
250
+ log.Printf("播报任务崩溃并删除: id=%d err=%v", id, r)
251
+ e.deleteTask(id)
252
+ }
253
+ }()
254
+
255
+ task, ok := e.getTask(id)
256
+ if !ok {
257
+ return
258
+ }
259
+
260
+ var lastErr error
261
+ for i := 1; i <= ttsMaxAttempts; i++ {
262
+ lastErr = e.speakOnce(context.Background(), task)
263
+ if lastErr == nil {
264
+ break
265
+ }
266
+ if i < ttsMaxAttempts {
267
+ time.Sleep(ttsRetryBackoff)
268
+ }
269
+ }
270
+
271
+ if lastErr != nil {
272
+ log.Printf("播报失败并删除任务: id=%d err=%v", id, lastErr)
273
+ e.deleteTask(id)
274
+ return
275
+ }
276
+
277
+ log.Printf("播报完成并删除任务: id=%d", id)
278
+ e.deleteTask(id)
279
+ }
280
+
281
+ func (e *TaskEngine) speakOnce(ctx context.Context, task *Task) error {
282
+ startedAt := time.Now()
283
+ player, err := e.newStreamPlayerFn()
284
+ if err != nil {
285
+ return fmt.Errorf("启动播放器失败: %w", err)
286
+ }
287
+
288
+ firstChunkLogged := false
289
+ onAudio := func(audio []byte) error {
290
+ if len(audio) > 0 && !firstChunkLogged {
291
+ firstChunkLogged = true
292
+ log.Printf("首个音频 chunk: id=%d elapsed=%s bytes=%d", task.ID, time.Since(startedAt).Round(time.Millisecond), len(audio))
293
+ }
294
+ return player.Write(audio)
295
+ }
296
+
297
+ if err := e.synthesizeStreamFn(ctx, task.Cfg, task.Text, &task.Voice, onAudio); err != nil {
298
+ _ = player.Abort()
299
+ return err
300
+ }
301
+ log.Printf("TTS 流结束: id=%d elapsed=%s", task.ID, time.Since(startedAt).Round(time.Millisecond))
302
+ if err := player.CloseAndWait(); err != nil {
303
+ _ = player.Abort()
304
+ return err
305
+ }
306
+ return nil
307
+ }
308
+
309
+ func (e *TaskEngine) claimPendingSynth() uint64 {
310
+ e.mu.Lock()
311
+ defer e.mu.Unlock()
312
+
313
+ for len(e.pendingSynth) > 0 {
314
+ id := e.pendingSynth[0]
315
+ e.pendingSynth = e.pendingSynth[1:]
316
+ task, ok := e.tasks[id]
317
+ if !ok {
318
+ continue
319
+ }
320
+ if task.Status != TaskStatusPendingSynth {
321
+ continue
322
+ }
323
+ task.Status = TaskStatusSpeaking
324
+ return id
325
+ }
326
+ return 0
327
+ }
328
+
329
+ func (e *TaskEngine) getTask(id uint64) (*Task, bool) {
330
+ e.mu.Lock()
331
+ defer e.mu.Unlock()
332
+
333
+ task, ok := e.tasks[id]
334
+ if !ok {
335
+ return nil, false
336
+ }
337
+ clone := *task
338
+ return &clone, true
339
+ }
340
+
341
+ func (e *TaskEngine) deleteTask(id uint64) {
342
+ e.mu.Lock()
343
+ defer e.mu.Unlock()
344
+ delete(e.tasks, id)
345
+ }
346
+
347
// notify performs a non-blocking send on ch: the signal is delivered if the
// channel can accept it right now and silently dropped otherwise.
func notify(ch chan struct{}) {
	var signal struct{}
	select {
	default:
		// Channel cannot accept a value right now; drop the signal.
	case ch <- signal:
	}
}
353
+
354
// VoiceInfo describes a voice.
type VoiceInfo struct {
	VoiceType  string `json:"voice_type"` // sent as the request's "speaker"
	ResourceID string `json:"resourceId"` // sent in the X-Api-Resource-Id header
}
359
+
360
// Config is the TTS configuration.
type Config struct {
	APIKey       string                `json:"apiKey"`       // sent in the X-Api-Key header
	Endpoint     string                `json:"endpoint"`     // URL the synthesis request is POSTed to
	DefaultVoice *VoiceInfo            `json:"defaultVoice"` // default voice
	SourceVoices map[string]*VoiceInfo `json:"sourceVoices"` // source -> voice mapping
}
367
+
368
+ func loadConfig() Config {
369
+ configPaths := []string{
370
+ configDir + "/config.json",
371
+ }
372
+ for _, p := range configPaths {
373
+ st, statErr := os.Stat(p)
374
+ if statErr == nil {
375
+ configCacheMu.Lock()
376
+ if configCacheValid && configCachePath == p && st.ModTime().Equal(configCacheModTime) {
377
+ cached := configCache
378
+ configCacheMu.Unlock()
379
+ return cached
380
+ }
381
+ configCacheMu.Unlock()
382
+ }
383
+
384
+ data, err := os.ReadFile(p)
385
+ if err != nil {
386
+ continue
387
+ }
388
+ var cfg Config
389
+ if json.Unmarshal(data, &cfg) == nil && cfg.APIKey != "" {
390
+ log.Printf("配置文件: %s", p)
391
+ if cfg.DefaultVoice != nil {
392
+ log.Printf("默认音色: %s (%s)", cfg.DefaultVoice.VoiceType, cfg.DefaultVoice.ResourceID)
393
+ }
394
+ for source, v := range cfg.SourceVoices {
395
+ log.Printf("来源 %s → %s (%s)", source, v.VoiceType, v.ResourceID)
396
+ }
397
+ if st, stErr := os.Stat(p); stErr == nil {
398
+ configCacheMu.Lock()
399
+ configCachePath = p
400
+ configCacheModTime = st.ModTime()
401
+ configCache = cfg
402
+ configCacheValid = true
403
+ configCacheMu.Unlock()
404
+ }
405
+ return cfg
406
+ }
407
+ }
408
+
409
+ // 回退到环境变量
410
+ return Config{
411
+ APIKey: envOrDefault("IAGENT_TTS_API_KEY", ""),
412
+ Endpoint: envOrDefault("IAGENT_TTS_ENDPOINT", "https://openspeech.bytedance.com/api/v3/tts/unidirectional"),
413
+ }
414
+ }
415
+
416
// envOrDefault returns the value of environment variable key, or fallback
// when the variable is unset or empty.
func envOrDefault(key, fallback string) string {
	value, present := os.LookupEnv(key)
	if !present || value == "" {
		return fallback
	}
	return value
}
422
+
423
// ttsRequest is the TTS request body, marshalled to JSON by synthesizeStream.
type ttsRequest struct {
	User      ttsUser      `json:"user"`
	Namespace string       `json:"namespace"`
	ReqParams ttsReqParams `json:"req_params"`
}

// ttsUser is the "user" object of the request body.
type ttsUser struct {
	UID string `json:"uid"`
}

// ttsReqParams carries the text, the speaker and the audio settings.
type ttsReqParams struct {
	Text        string         `json:"text"`
	Speaker     string         `json:"speaker"`
	AudioParams ttsAudioParams `json:"audio_params"`
}

// ttsAudioParams holds the requested audio format and sample rate.
type ttsAudioParams struct {
	Format     string `json:"format"`
	SampleRate int    `json:"sample_rate"`
}
444
+
445
+ // 调用字节跳动 TTS API,返回完整 MP3 音频数据。保留给测试和 fallback 使用。
446
+ func synthesize(ctx context.Context, cfg Config, text string, voice *VoiceInfo) ([]byte, error) {
447
+ var chunks [][]byte
448
+ if err := synthesizeStream(ctx, cfg, text, voice, func(audio []byte) error {
449
+ chunk := append([]byte(nil), audio...)
450
+ chunks = append(chunks, chunk)
451
+ return nil
452
+ }); err != nil {
453
+ return nil, err
454
+ }
455
+
456
+ total := 0
457
+ for _, c := range chunks {
458
+ total += len(c)
459
+ }
460
+ result := make([]byte, 0, total)
461
+ for _, c := range chunks {
462
+ result = append(result, c...)
463
+ }
464
+ return result, nil
465
+ }
466
+
467
+ // 调用字节跳动 TTS API,边解析 SSE 边回调 MP3 音频块
468
+ func synthesizeStream(ctx context.Context, cfg Config, text string, voice *VoiceInfo, onAudio func([]byte) error) error {
469
+ speaker := voice.VoiceType
470
+ resourceID := voice.ResourceID
471
+
472
+ log.Printf("音色: %s (resourceId: %s)", speaker, resourceID)
473
+
474
+ reqBody := ttsRequest{
475
+ User: ttsUser{UID: fmt.Sprintf("ttsd-%d", time.Now().UnixNano())},
476
+ Namespace: "BidirectionalTTS",
477
+ ReqParams: ttsReqParams{
478
+ Text: text,
479
+ Speaker: speaker,
480
+ AudioParams: ttsAudioParams{
481
+ Format: "mp3",
482
+ SampleRate: 24000,
483
+ },
484
+ },
485
+ }
486
+
487
+ body, err := json.Marshal(reqBody)
488
+ if err != nil {
489
+ return fmt.Errorf("marshal request: %w", err)
490
+ }
491
+
492
+ req, err := http.NewRequestWithContext(ctx, "POST", cfg.Endpoint, strings.NewReader(string(body)))
493
+ if err != nil {
494
+ return fmt.Errorf("create request: %w", err)
495
+ }
496
+ req.Header.Set("Content-Type", "application/json")
497
+ req.Header.Set("X-Api-Key", cfg.APIKey)
498
+ req.Header.Set("X-Api-Resource-Id", resourceID)
499
+ req.Header.Set("X-Api-Request-Id", fmt.Sprintf("ttsd-%d", time.Now().UnixNano()))
500
+
501
+ resp, err := ttsHTTPClient.Do(req)
502
+ if err != nil {
503
+ return fmt.Errorf("http request: %w", err)
504
+ }
505
+ if resp.StatusCode != 200 {
506
+ io.Copy(io.Discard, resp.Body) // 消费 body 以释放连接
507
+ resp.Body.Close()
508
+ return fmt.Errorf("http status %d", resp.StatusCode)
509
+ }
510
+ defer resp.Body.Close()
511
+
512
+ return parseSSEStream(resp.Body, onAudio)
513
+ }
514
+
515
+ // 解析 SSE 流,提取 base64 音频数据
516
+ func parseSSE(r io.Reader) ([]byte, error) {
517
+ var chunks [][]byte
518
+ if err := parseSSEStream(r, func(audio []byte) error {
519
+ chunk := append([]byte(nil), audio...)
520
+ chunks = append(chunks, chunk)
521
+ return nil
522
+ }); err != nil {
523
+ return nil, err
524
+ }
525
+
526
+ total := 0
527
+ for _, c := range chunks {
528
+ total += len(c)
529
+ }
530
+ result := make([]byte, 0, total)
531
+ for _, c := range chunks {
532
+ result = append(result, c...)
533
+ }
534
+ return result, nil
535
+ }
536
+
537
+ func parseSSEStream(r io.Reader, onAudio func([]byte) error) error {
538
+ audioChunks := 0
539
+ scanner := bufio.NewScanner(r)
540
+ scanner.Buffer(make([]byte, 256*1024), 256*1024)
541
+
542
+ var dataLines []string
543
+
544
+ flush := func() error {
545
+ if len(dataLines) == 0 {
546
+ return nil
547
+ }
548
+ payload := strings.Join(dataLines, "\n")
549
+ dataLines = dataLines[:0]
550
+ ok, err := processEvent(payload, onAudio)
551
+ if ok {
552
+ audioChunks++
553
+ }
554
+ return err
555
+ }
556
+
557
+ for scanner.Scan() {
558
+ line := strings.TrimSpace(scanner.Text())
559
+ if line == "" {
560
+ if err := flush(); err != nil {
561
+ return err
562
+ }
563
+ continue
564
+ }
565
+ if strings.HasPrefix(line, ":") || strings.HasPrefix(line, "event:") ||
566
+ strings.HasPrefix(line, "id:") || strings.HasPrefix(line, "retry:") {
567
+ continue
568
+ }
569
+ if strings.HasPrefix(line, "data:") {
570
+ dataLines = append(dataLines, strings.TrimPrefix(line, "data:"))
571
+ continue
572
+ }
573
+ // 非标准 JSON 直出
574
+ if err := flush(); err != nil {
575
+ return err
576
+ }
577
+ ok, err := processEvent(line, onAudio)
578
+ if ok {
579
+ audioChunks++
580
+ }
581
+ if err != nil {
582
+ return err
583
+ }
584
+ }
585
+ if err := flush(); err != nil {
586
+ return err
587
+ }
588
+ if err := scanner.Err(); err != nil {
589
+ return fmt.Errorf("scan: %w", err)
590
+ }
591
+
592
+ if audioChunks == 0 {
593
+ return fmt.Errorf("no audio data")
594
+ }
595
+ return nil
596
+ }
597
+
598
+ func processEvent(payload string, onAudio func([]byte) error) (bool, error) {
599
+ payload = strings.TrimSpace(payload)
600
+ if payload == "" || payload == "[DONE]" {
601
+ return false, nil
602
+ }
603
+
604
+ var event map[string]any
605
+ if err := json.Unmarshal([]byte(payload), &event); err != nil {
606
+ log.Printf("SSE 数据解析失败: %v", err)
607
+ return false, nil
608
+ }
609
+
610
+ if b64 := extractAudioBase64(event); b64 != "" {
611
+ data, err := base64.StdEncoding.DecodeString(b64)
612
+ if err != nil {
613
+ return false, fmt.Errorf("decode audio chunk: %w", err)
614
+ }
615
+ if err := onAudio(data); err != nil {
616
+ return false, err
617
+ }
618
+ return true, nil
619
+ }
620
+
621
+ return false, nil
622
+ }
623
+
624
// extractAudioBase64 looks for a base64 audio string in a decoded event.
//
// It first checks the keys "data", "audio" and "audio_data" for a non-empty
// string value, in that order. Failing that, it recurses into object values
// under "data", "result" and "payload", in that order. It returns "" when
// nothing is found.
func extractAudioBase64(event map[string]any) string {
	direct := [...]string{"data", "audio", "audio_data"}
	for i := range direct {
		s, isString := event[direct[i]].(string)
		if isString && s != "" {
			return s
		}
	}

	containers := [...]string{"data", "result", "payload"}
	for i := range containers {
		child, isObject := event[containers[i]].(map[string]any)
		if !isObject {
			continue
		}
		if found := extractAudioBase64(child); found != "" {
			return found
		}
	}
	return ""
}
639
+
640
// cleanText strips formatting symbols and keeps text suitable for reading
// aloud.
//
// The input is processed line by line. Table separator rows, horizontal
// rules and lines consisting only of '|', '-', ':' and spaces (including
// blank lines) are dropped. In the remaining lines the characters '*', '`',
// '#' and '>' are removed. Surviving non-empty lines are joined with a
// full-width comma.
func cleanText(text string) string {
	// Built once per call; the replacer does not depend on the line.
	stripper := strings.NewReplacer(
		"**", "",
		"*", "",
		"`", "",
		"#", "",
		">", "",
	)

	var lines []string
	for _, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		if strings.HasPrefix(line, "|---") || strings.HasPrefix(line, "|:---") {
			continue
		}
		if strings.HasPrefix(line, "---") && strings.Count(line, "-") > 3 {
			continue
		}
		// Pure table separator rows (|---|---|, :---|:---| and the like).
		if strings.Trim(line, "|-: ") == "" {
			continue
		}
		cleaned := strings.TrimSpace(stripper.Replace(line))
		if cleaned != "" {
			lines = append(lines, cleaned)
		}
	}
	return strings.Join(lines, ",")
}
669
+
670
+ func main() {
671
+ log.SetFlags(log.Ltime | log.Lshortfile)
672
+
673
+ // 日志轮转:最大 10MB,保留 3 份
674
+ os.MkdirAll(configDir, 0755)
675
+ log.SetOutput(&lumberjack.Logger{
676
+ Filename: configDir + "/ispeak.log",
677
+ MaxSize: 10,
678
+ MaxBackups: 3,
679
+ Compress: true,
680
+ })
681
+
682
+ // 创建进程级 temp 目录
683
+ cleanupOldTempDirs()
684
+ var err error
685
+ tempDir, err = os.MkdirTemp("", "ttsd-*")
686
+ if err != nil {
687
+ log.Fatalf("创建 temp 目录失败: %v", err)
688
+ }
689
+ defer os.RemoveAll(tempDir)
690
+
691
+ cfg := loadConfig()
692
+ if err := validateConfig(cfg); err != nil {
693
+ log.Fatalf("配置错误: %v", err)
694
+ }
695
+
696
+ socketPath := configDir + "/ispeak.sock"
697
+ listener, err := listenUnixSocket(socketPath)
698
+ if err != nil {
699
+ if errors.Is(err, errAlreadyRunning) {
700
+ log.Fatalf("iSpeak 已在运行,请先关闭旧实例或重启")
701
+ }
702
+ log.Fatalf("监听 socket 失败: %v", err)
703
+ }
704
+ defer os.Remove(socketPath)
705
+
706
+ sigCh := make(chan os.Signal, 1)
707
+ signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
708
+ go func() {
709
+ <-sigCh
710
+ listener.Close()
711
+ }()
712
+
713
+ engine := NewTaskEngine()
714
+ engine.Start()
715
+
716
+ log.Printf("iSpeak 已启动,监听 %s", socketPath)
717
+ for {
718
+ conn, err := listener.Accept()
719
+ if err != nil {
720
+ if strings.Contains(err.Error(), "use of closed") {
721
+ return
722
+ }
723
+ continue
724
+ }
725
+ go handleConnection(conn, engine)
726
+ }
727
+ }
728
+
729
+ func listenUnixSocket(socketPath string) (net.Listener, error) {
730
+ listener, err := net.Listen("unix", socketPath)
731
+ if err == nil {
732
+ return listener, nil
733
+ }
734
+
735
+ if !errors.Is(err, syscall.EADDRINUSE) {
736
+ _ = os.Remove(socketPath)
737
+ listener, retryErr := net.Listen("unix", socketPath)
738
+ if retryErr == nil {
739
+ return listener, nil
740
+ }
741
+ return nil, retryErr
742
+ }
743
+
744
+ conn, dialErr := net.DialTimeout("unix", socketPath, 200*time.Millisecond)
745
+ if dialErr == nil {
746
+ _ = conn.Close()
747
+ return nil, errAlreadyRunning
748
+ }
749
+
750
+ if removeErr := os.Remove(socketPath); removeErr != nil && !errors.Is(removeErr, os.ErrNotExist) {
751
+ return nil, removeErr
752
+ }
753
+ listener, err = net.Listen("unix", socketPath)
754
+ if err != nil {
755
+ return nil, err
756
+ }
757
+ log.Printf("已清理残留 socket: %s", socketPath)
758
+ return listener, nil
759
+ }
760
+
761
// cleanupOldTempDirs removes leftover temp directories (left behind when a
// process crashed): every entry of the system temp directory whose name
// starts with "ttsd-" is deleted recursively. Errors are ignored.
func cleanupOldTempDirs() {
	root := os.TempDir()
	listing, err := os.ReadDir(root)
	if err != nil {
		return
	}
	for _, entry := range listing {
		name := entry.Name()
		if !strings.HasPrefix(name, "ttsd-") {
			continue
		}
		os.RemoveAll(filepath.Join(root, name))
	}
}
773
+
774
+ // 校验配置必填项
775
+ func validateConfig(cfg Config) error {
776
+ if cfg.APIKey == "" {
777
+ return fmt.Errorf("apiKey 未设置,编辑 ~/.config/iSpeak/config.json")
778
+ }
779
+ if cfg.Endpoint == "" {
780
+ return fmt.Errorf("endpoint 未设置")
781
+ }
782
+ if cfg.DefaultVoice == nil || cfg.DefaultVoice.VoiceType == "" {
783
+ return fmt.Errorf("defaultVoice 未设置")
784
+ }
785
+ return nil
786
+ }
787
+
788
+ func playAudio(data []byte) error {
789
+ tmpFile := filepath.Join(tempDir, fmt.Sprintf("ttsd-%d.mp3", time.Now().UnixNano()))
790
+ if err := os.WriteFile(tmpFile, data, 0644); err != nil {
791
+ return fmt.Errorf("写入临时文件失败: %w", err)
792
+ }
793
+ defer os.Remove(tmpFile)
794
+
795
+ cmd := exec.Command("/usr/bin/afplay", tmpFile)
796
+ log.Printf("播放开始: %s", filepath.Base(tmpFile))
797
+ if err := cmd.Run(); err != nil {
798
+ return fmt.Errorf("播放失败: %w", err)
799
+ }
800
+ return nil
801
+ }
802
+
803
+ func handleConnection(conn net.Conn, engine *TaskEngine) {
804
+ defer func() {
805
+ if r := recover(); r != nil {
806
+ log.Printf("连接处理崩溃: %v", r)
807
+ }
808
+ }()
809
+ defer conn.Close()
810
+
811
+ cfg := loadConfig()
812
+
813
+ var sb strings.Builder
814
+ scanner := bufio.NewScanner(conn)
815
+ scanner.Buffer(make([]byte, 1*1024*1024), 1*1024*1024)
816
+ for scanner.Scan() {
817
+ sb.WriteString(scanner.Text())
818
+ }
819
+
820
+ text := strings.TrimSpace(sb.String())
821
+ if text == "" {
822
+ return
823
+ }
824
+
825
+ voice, content := extractVoicePrefix(text, cfg)
826
+ if voice == nil {
827
+ voice = cfg.DefaultVoice
828
+ }
829
+ if voice == nil {
830
+ log.Printf("未配置默认音色")
831
+ return
832
+ }
833
+
834
+ cleaned := cleanText(content)
835
+ if cleaned == "" {
836
+ return
837
+ }
838
+
839
+ log.Printf("TTS: %s", cleaned)
840
+ engine.Submit(cleaned, *voice, cfg)
841
+ }
842
+
843
+ // 解析消息中的音色前缀,返回 VoiceInfo
844
+ func extractVoicePrefix(text string, cfg Config) (voice *VoiceInfo, content string) {
845
+ // 格式: {source:claude}文本
846
+ const prefix = "{source:"
847
+ if strings.HasPrefix(text, prefix) {
848
+ if end := strings.Index(text, "}"); end > len(prefix) {
849
+ if v, ok := cfg.SourceVoices[text[len(prefix):end]]; ok {
850
+ voice = v
851
+ }
852
+ content = text[end+1:]
853
+ return
854
+ }
855
+ }
856
+ content = text
857
+ return
858
+ }