personality 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +9 -0
- data/README.md +92 -13
- data/exe/psn-http +28 -0
- data/exe/psn-mcp +15 -1
- data/exe/psn-voice +7 -0
- data/lib/personality/cart.rb +12 -1
- data/lib/personality/cart_manager.rb +1 -1
- data/lib/personality/cli/index.rb +50 -14
- data/lib/personality/cli/tts.rb +27 -2
- data/lib/personality/indexer.rb +27 -9
- data/lib/personality/mcp/oauth.rb +238 -0
- data/lib/personality/mcp/rack_app.rb +155 -0
- data/lib/personality/mcp/server.rb +183 -30
- data/lib/personality/mcp/tts_server.rb +11 -4
- data/lib/personality/mcp/voice_server.rb +412 -0
- data/lib/personality/tts.rb +168 -35
- data/lib/personality/version.rb +1 -1
- metadata +51 -1
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "mcp"
|
|
4
|
+
require "mcp/transports/stdio"
|
|
5
|
+
require "json"
|
|
6
|
+
require "open3"
|
|
7
|
+
require "tempfile"
|
|
8
|
+
require "shellwords"
|
|
9
|
+
|
|
10
|
+
module Personality
|
|
11
|
+
module MCP
|
|
12
|
+
class VoiceServer
|
|
13
|
+
# Moto G52 phone configuration - ADB over WiFi
|
|
14
|
+
PHONE_IP = "192.168.88.155"
|
|
15
|
+
PHONE_PORT = "5555"
|
|
16
|
+
PHONE_ADB = "#{PHONE_IP}:#{PHONE_PORT}"
|
|
17
|
+
TERMUX_HOME = "/data/data/com.termux/files/home"
|
|
18
|
+
|
|
19
|
+
# Junkpile server configuration (192.168.88.165 for WiFi access)
|
|
20
|
+
JUNKPILE_SSH = "j"
|
|
21
|
+
JUNKPILE_IP = "192.168.88.165"
|
|
22
|
+
WHISPER_PATH = "~/.local/bin/whisper"
|
|
23
|
+
CLAUDE_PATH = "/home/linuxbrew/.linuxbrew/bin/claude"
|
|
24
|
+
|
|
25
|
+
# Convenience entry point: builds a VoiceServer and starts serving.
def self.run
  instance = new
  instance.start
end
|
|
28
|
+
|
|
29
|
+
# Creates the underlying MCP server (named "voice", versioned with the gem
# version) and registers every voice tool on it.
def initialize
  @server = ::MCP::Server.new(name: "voice", version: Personality::VERSION)
  # NOTE(review): presumably the value handed to tool blocks as
  # `server_context:` -- confirm against the mcp gem.
  @server.server_context = {}
  register_tools
end
|
|
37
|
+
|
|
38
|
+
# Opens a stdio transport bound to the MCP server.
def start
  ::MCP::Transports::StdioTransport.new(@server).open
end
|
|
42
|
+
|
|
43
|
+
private
|
|
44
|
+
|
|
45
|
+
# Wraps +result+ in an MCP tool response carrying a single text item whose
# text is the JSON encoding of +result+.
def tool_response(result)
  encoded = JSON.generate(result)
  content = [{type: "text", text: encoded}]
  ::MCP::Tool::Response.new(content)
end
|
|
48
|
+
|
|
49
|
+
# Registers all voice tools, in a fixed order.
def register_tools
  %i[
    register_voice_record
    register_voice_transcribe
    register_voice_ask
    register_voice_listen
    register_voice_status
  ].each { |registrar| send(registrar) }
end
|
|
56
|
+
|
|
57
|
+
# === Voice Record Tool ===
|
|
58
|
+
# Records audio from the Moto G52 phone via Termux
|
|
59
|
+
|
|
60
|
+
# Registers the "voice_record" tool.
#
# Tool arguments (all optional):
#   duration    - seconds to record; defaults to 8 and is clamped to 1..60
#   output_path - destination on junkpile; defaults to /tmp/phone_voice.wav
#
# The tool replies with the JSON-encoded hash returned by record_from_phone.
def register_voice_record
  @server.define_tool(
    name: "voice_record",
    description: "Record audio from the Moto G52 phone via Termux. Returns path to the recorded WAV file on junkpile.",
    input_schema: {
      type: "object",
      properties: {
        duration: {type: "integer", description: "Recording duration in seconds (default: 8, max: 60)"},
        output_path: {type: "string", description: "Output path on junkpile (default: /tmp/phone_voice.wav)"}
      }
    }
  ) do |server_context:, **opts|
    # Enforce a lower bound as well as the documented maximum so that zero
    # or negative values never reach the recorder.
    duration = (opts[:duration] || 8).clamp(1, 60)
    output_path = opts[:output_path] || "/tmp/phone_voice.wav"

    tool_response(record_from_phone(duration: duration, output_path: output_path))
  end
end
|
|
79
|
+
|
|
80
|
+
# === Voice Transcribe Tool ===
|
|
81
|
+
# Transcribes audio using Whisper on junkpile
|
|
82
|
+
|
|
83
|
+
# Registers the "voice_transcribe" tool.
#
# Tool arguments:
#   audio_path - (required) audio file on junkpile
#   model      - Whisper model, defaults to "small"
#   language   - language code, defaults to "en"
#
# The tool replies with the JSON-encoded hash returned by transcribe_audio.
def register_voice_transcribe
  properties = {
    audio_path: {type: "string", description: "Path to audio file on junkpile"},
    model: {type: "string", description: "Whisper model to use (default: small). Options: tiny, base, small, medium, large"},
    language: {type: "string", description: "Language code (default: en)"}
  }

  @server.define_tool(
    name: "voice_transcribe",
    description: "Transcribe audio file using Whisper STT on junkpile. Returns the transcribed text.",
    input_schema: {type: "object", properties: properties, required: %w[audio_path]}
  ) do |audio_path:, server_context:, **opts|
    tool_response(
      transcribe_audio(
        audio_path: audio_path,
        model: opts[:model] || "small",
        language: opts[:language] || "en"
      )
    )
  end
end
|
|
104
|
+
|
|
105
|
+
# === Voice Ask Tool ===
|
|
106
|
+
# Full pipeline: record -> transcribe -> Claude -> TTS response
|
|
107
|
+
|
|
108
|
+
# Registers the "voice_ask" tool (record -> transcribe -> Claude -> optional TTS).
#
# Tool arguments (all optional):
#   duration       - seconds to record; defaults to 8, clamped to 1..60 so it
#                    matches the limit advertised by voice_record
#   model          - Whisper model, defaults to "small"
#   speak_response - whether to speak the answer, defaults to true
#   voice          - TTS voice, defaults to "bt7274"
#
# The tool replies with the JSON-encoded hash returned by voice_ask_pipeline.
def register_voice_ask
  @server.define_tool(
    name: "voice_ask",
    description: "Full voice pipeline: record audio from phone, transcribe with Whisper, send to Claude, and speak the response. Returns the transcript and response.",
    input_schema: {
      type: "object",
      properties: {
        duration: {type: "integer", description: "Recording duration in seconds (default: 8, max: 60)"},
        model: {type: "string", description: "Whisper model (default: small)"},
        speak_response: {type: "boolean", description: "Speak the response via TTS (default: true)"},
        voice: {type: "string", description: "TTS voice to use (default: bt7274)"}
      }
    }
  ) do |server_context:, **opts|
    result = voice_ask_pipeline(
      # An unbounded duration would block the server for as long as the
      # caller asked; apply the same 1..60 window as voice_record.
      duration: (opts[:duration] || 8).clamp(1, 60),
      model: opts[:model] || "small",
      # fetch (not ||) so an explicit `false` is honoured.
      speak_response: opts.fetch(:speak_response, true),
      voice: opts[:voice] || "bt7274"
    )
    tool_response(result)
  end
end
|
|
136
|
+
|
|
137
|
+
# === Voice Listen Tool ===
|
|
138
|
+
# Starts continuous wake word listening (placeholder for future)
|
|
139
|
+
|
|
140
|
+
# Registers the "voice_listen" tool.
#
# Tool arguments (all optional):
#   wake_word - defaults to "hey b t"
#   action    - "start", "stop" or "status"; defaults to "status"
#
# The tool replies with the JSON-encoded hash returned by manage_wake_listener.
def register_voice_listen
  schema = {
    type: "object",
    properties: {
      wake_word: {type: "string", description: "Wake word to listen for (default: hey b t)"},
      action: {type: "string", enum: %w[start stop status], description: "Action: start, stop, or status (default: status)"}
    }
  }

  @server.define_tool(
    name: "voice_listen",
    description: "Start continuous wake word listening on the phone (using Vosk). Currently returns status of the listener service.",
    input_schema: schema
  ) do |server_context:, **opts|
    request = {
      action: opts[:action] || "status",
      wake_word: opts[:wake_word] || "hey b t"
    }
    tool_response(manage_wake_listener(**request))
  end
end
|
|
159
|
+
|
|
160
|
+
# === Voice Status Tool ===
|
|
161
|
+
# Check connectivity and status of voice components
|
|
162
|
+
|
|
163
|
+
# Registers the argument-less "voice_status" tool, which replies with the
# JSON-encoded hash returned by check_voice_status.
def register_voice_status
  no_arguments = {type: "object", properties: {}}

  @server.define_tool(
    name: "voice_status",
    description: "Check status of voice pipeline components: phone connectivity, junkpile availability, Whisper installation.",
    input_schema: no_arguments
  ) do |server_context:, **|
    tool_response(check_voice_status)
  end
end
|
|
173
|
+
|
|
174
|
+
# === Implementation Methods ===
|
|
175
|
+
|
|
176
|
+
# Records audio on the phone and copies the result to junkpile.
#
# Steps: `adb connect`, run termux-microphone-record on the phone (through
# `run-as com.termux`), stream the file back over adb into a local temp
# file, then scp it to +output_path+ on junkpile.
#
# Commands are spawned from argument arrays rather than interpolated shell
# strings, and both inputs are validated first, because they originate from
# tool callers.
#
# @param duration [Integer] requested length in seconds; clamped to 1..60
# @param output_path [String] destination path on junkpile; limited to
#   letters, digits, `_`, `.`, `~`, `/` and `-`
# @return [Hash] on success {success: true, duration:, output_path:, message:};
#   on failure {success: false, error:, details:}
def record_from_phone(duration:, output_path:)
  seconds = Integer(duration, exception: false)
  unless seconds
    return {success: false, error: "Invalid duration", details: duration.inspect}
  end
  seconds = seconds.clamp(1, 60)

  # The path ends up in an scp remote spec, which some scp implementations
  # hand to a remote shell, so only a conservative character set is accepted.
  destination = output_path.to_s
  unless destination.match?(%r{\A[\w.~/-]+\z})
    return {success: false, error: "Invalid output path", details: destination}
  end

  phone_audio = "#{TERMUX_HOME}/voice_cmd.wav"
  recorder = "#{TERMUX_HOME}/../usr/bin/termux-microphone-record"

  # A unique temp file avoids clashes between overlapping recordings; it is
  # removed in the ensure clause on every exit path.
  local_audio = Tempfile.new(["voice_cmd_local", ".wav"])
  local_audio.close

  # Ensure ADB is connected (best effort; a real failure shows up in step 1).
  Open3.capture2e("adb", "connect", PHONE_ADB)

  # Step 1: Record audio on phone via ADB + run-as
  record_output, record_status = Open3.capture2e(
    "adb", "-s", PHONE_ADB, "shell",
    "run-as com.termux #{recorder} -f #{phone_audio} -l #{seconds}"
  )
  unless record_status.success?
    return {
      success: false,
      error: "Failed to record on phone",
      details: record_output.strip
    }
  end

  # Step 2: Pull audio from phone via ADB (stdout goes straight into the temp file)
  pulled = system(
    "adb", "-s", PHONE_ADB, "shell", "run-as com.termux cat #{phone_audio}",
    out: local_audio.path, err: File::NULL
  )

  # The phone-side copy is no longer needed whether or not the pull worked.
  Open3.capture2e("adb", "-s", PHONE_ADB, "shell", "run-as com.termux rm -f #{phone_audio}")

  unless pulled && File.size(local_audio.path) > 0
    return {
      success: false,
      error: "Failed to pull audio from phone",
      details: "File size: #{File.size(local_audio.path)}"
    }
  end

  # Step 3: Transfer to junkpile
  scp_output, scp_status = Open3.capture2e(
    "scp", "-q", local_audio.path, "#{JUNKPILE_SSH}:#{destination}"
  )
  unless scp_status.success?
    return {
      success: false,
      error: "Failed to transfer audio to junkpile",
      details: scp_output.strip
    }
  end

  {
    success: true,
    duration: seconds,
    output_path: destination,
    message: "Recorded #{seconds}s of audio"
  }
rescue Errno::ENOENT => e
  # Spawning from an argv array raises when adb/scp is not installed.
  {success: false, error: "Required command not found", details: e.message}
ensure
  local_audio&.close!
end
|
|
231
|
+
|
|
232
|
+
# Transcribes an audio file on junkpile with Whisper.
#
# A small bash script is run over ssh: it converts the input to 16 kHz mono
# with ffmpeg, runs Whisper with text output into /tmp, and prints the
# resulting text file collapsed onto one line.
#
# @param audio_path [String] path of the audio file on junkpile
# @param model [String] Whisper model name
# @param language [String] language code passed to Whisper
# @return [Hash] {success: true, transcript:, model:, language:, audio_path:}
#   or {success: false, error:, audio_path:} (plus details: when the ssh
#   command itself failed)
def transcribe_audio(audio_path:, model:, language:)
  converted_path = "/tmp/voice_converted.wav"
  transcript_path = "/tmp/voice_converted.txt"

  # Every caller-supplied value is shell-escaped. The old transcript is
  # removed first so that a failed run cannot return a previous result.
  script = <<~BASH
    export PATH=~/.local/bin:$PATH
    rm -f #{transcript_path}
    ffmpeg -i #{Shellwords.escape(audio_path)} -ar 16000 -ac 1 #{converted_path} -y 2>/dev/null
    #{WHISPER_PATH} #{converted_path} --model #{Shellwords.escape(model.to_s)} --language #{Shellwords.escape(language.to_s)} --output_format txt --output_dir /tmp 2>/dev/null
    cat #{transcript_path} 2>/dev/null | tr '\\n' ' ' | xargs
  BASH

  # stderr is captured separately so ssh/bash diagnostics can never be
  # mistaken for transcript text.
  stdout, stderr, status = Open3.capture3(
    "ssh", JUNKPILE_SSH, "bash -c #{Shellwords.escape(script)}"
  )

  unless status.success?
    return {
      success: false,
      error: "Transcription command failed",
      details: stderr.strip,
      audio_path: audio_path
    }
  end

  transcript = stdout.strip

  if transcript.empty?
    return {
      success: false,
      error: "Transcription returned empty result",
      audio_path: audio_path
    }
  end

  {
    success: true,
    transcript: transcript,
    model: model,
    language: language,
    audio_path: audio_path
  }
rescue Errno::ENOENT => e
  # Raised when the local ssh binary cannot be found.
  {success: false, error: "Transcription command failed", details: e.message, audio_path: audio_path}
end
|
|
265
|
+
|
|
266
|
+
# Runs the full voice round trip: record -> transcribe -> ask Claude on
# junkpile -> optionally speak the answer locally.
#
# @param duration [Integer] recording length in seconds
# @param model [String] Whisper model name
# @param speak_response [Boolean] whether to speak Claude's answer
# @param voice [String] TTS voice name
# @return [Hash] on success {success: true, transcript:, response:,
#   spoke_response:, voice:, duration:, model:} (plus tts_error: when
#   speaking was requested but failed); on failure the failing step's hash
#   with a stage: of "record", "transcribe" or "claude"
def voice_ask_pipeline(duration:, model:, speak_response:, voice:)
  # Step 1: Record
  audio_path = "/tmp/phone_voice.wav"
  recording = record_from_phone(duration: duration, output_path: audio_path)
  return recording.merge(stage: "record") unless recording[:success]

  # Step 2: Transcribe
  transcription = transcribe_audio(audio_path: audio_path, model: model, language: "en")
  return transcription.merge(stage: "transcribe") unless transcription[:success]

  transcript = transcription[:transcript]

  # Step 3: Get Claude response on junkpile
  prompt = "Voice command from user: #{transcript}\nRespond concisely (1-2 sentences max)."

  claude_cmd = <<~BASH
    export PATH=/home/linuxbrew/.linuxbrew/bin:$PATH
    #{CLAUDE_PATH} --print --output-format stream-json --verbose #{Shellwords.escape(prompt)} 2>/dev/null | grep '"type":"assistant"' | head -1 | jq -r '.message.content[0].text // empty'
  BASH

  # stderr is kept apart from stdout so ssh or shell error text is never
  # treated (and spoken) as Claude's answer.
  stdout, stderr, status = Open3.capture3(
    "ssh", JUNKPILE_SSH, "bash -c #{Shellwords.escape(claude_cmd)}"
  )

  unless status.success?
    return {
      success: false,
      error: "Claude command failed",
      details: stderr.strip,
      transcript: transcript,
      stage: "claude"
    }
  end

  response = stdout.strip

  if response.empty?
    return {
      success: false,
      error: "Claude returned empty response",
      transcript: transcript,
      stage: "claude"
    }
  end

  result = {
    success: true,
    transcript: transcript,
    response: response,
    spoke_response: false,
    voice: voice,
    duration: duration,
    model: model
  }

  # Step 4: Speak response (if enabled) and report whether it actually worked
  if speak_response
    tts_result = speak_text(text: response, voice: voice)
    if tts_result.is_a?(Hash) && tts_result[:error]
      result[:tts_error] = tts_result[:error]
    else
      result[:spoke_response] = true
    end
  end

  result
rescue Errno::ENOENT => e
  # Raised when the local ssh binary cannot be found.
  {success: false, error: "Claude command failed", details: e.message, stage: "claude"}
end
|
|
321
|
+
|
|
322
|
+
# Speaks +text+ through Personality::TTS using +voice+.
#
# The TTS module is loaded lazily on first use. Returns whatever
# Personality::TTS.speak returns, or {error: message} if a StandardError
# is raised along the way.
def speak_text(text:, voice:)
  begin
    require_relative "../tts"
    Personality::TTS.speak(text, voice: voice)
  rescue => failure
    {error: failure.message}
  end
end
|
|
329
|
+
|
|
330
|
+
# Handles the wake-word listener actions.
#
# "start" and "stop" are not implemented yet and always return
# success: false. "status" asks the phone (over adb) whether a process
# matching "vosk_wake" exists.
#
# @param action [String] "start", "stop" or "status"
# @param wake_word [String] echoed back in the result
# @return [Hash] action-specific result; {error:} for an unknown action
def manage_wake_listener(action:, wake_word:)
  case action
  when "start"
    # Future: Start Vosk wake word listener on phone
    {
      success: false,
      message: "Wake word listener not yet implemented. Use voice_record for manual recording.",
      wake_word: wake_word
    }
  when "stop"
    {
      success: false,
      message: "Wake word listener not yet implemented"
    }
  when "status"
    # Check if wake listener process is running via ADB
    check_cmd = "adb -s #{PHONE_ADB} shell 'run-as com.termux pgrep -f vosk_wake || echo not_running' 2>&1"
    output, _ = Open3.capture2(check_cmd)

    # pgrep prints PIDs only. Anything else that is not the explicit
    # "not_running" marker (adb missing, device offline, run-as refused...)
    # means the state could not be determined -- it must not be reported
    # as "running".
    tokens = output.split
    if !tokens.empty? && tokens.all? { |token| token.match?(/\A\d+\z/) }
      {
        status: "running",
        wake_word: wake_word,
        message: "Wake word listener is active"
      }
    elsif output.include?("not_running")
      {
        status: "stopped",
        wake_word: wake_word,
        message: "Wake word listener is not running"
      }
    else
      {
        status: "unknown",
        wake_word: wake_word,
        message: "Could not query wake word listener: #{output.strip}"
      }
    end
  else
    {error: "Unknown action: #{action}"}
  end
end
|
|
360
|
+
|
|
361
|
+
# Probes each part of the voice pipeline and reports the findings.
#
# Checks, in order: the phone over adb, junkpile over ssh, and (only when
# junkpile answered) whether the Whisper and Claude executables exist there.
#
# @return [Hash] :phone and :junkpile sections, :whisper and :claude when
#   junkpile is reachable, plus :ready and :message summarising the result
def check_voice_status
  # Runs a shell command line and returns only its captured stdout.
  probe = ->(command_line) { Open3.capture2(command_line).first }

  report = {}

  # Phone: connect, then ask the device shell to echo "ok".
  phone_reply = probe.call("adb connect #{PHONE_ADB} 2>&1 && adb -s #{PHONE_ADB} shell echo ok 2>&1")
  report[:phone] = {
    host: PHONE_ADB,
    method: "adb_wifi",
    connected: phone_reply.include?("ok")
  }

  # Junkpile: a short-timeout ssh round trip.
  junkpile_reply = probe.call("ssh -o ConnectTimeout=3 #{JUNKPILE_SSH} 'echo ok' 2>&1")
  junkpile_up = junkpile_reply.strip == "ok"
  report[:junkpile] = {
    ssh_alias: JUNKPILE_SSH,
    connected: junkpile_up
  }

  # Remote executables are only worth checking when ssh works.
  if junkpile_up
    {whisper: WHISPER_PATH, claude: CLAUDE_PATH}.each do |component, binary|
      reply = probe.call("ssh #{JUNKPILE_SSH} 'test -x #{binary} && echo ok' 2>&1")
      report[component] = {
        path: binary,
        installed: reply.strip == "ok"
      }
    end
  end

  # Overall status
  ready = report[:phone][:connected] &&
    junkpile_up &&
    report.dig(:whisper, :installed) &&
    report.dig(:claude, :installed)

  report[:ready] = ready
  report[:message] = ready ? "Voice pipeline ready" : "Some components unavailable"

  report
end
|
|
410
|
+
end
|
|
411
|
+
end
|
|
412
|
+
end
|
data/lib/personality/tts.rb
CHANGED
|
@@ -2,40 +2,52 @@
|
|
|
2
2
|
|
|
3
3
|
require "open3"
|
|
4
4
|
require "fileutils"
|
|
5
|
+
require "net/http"
|
|
6
|
+
require "json"
|
|
5
7
|
|
|
6
8
|
module Personality
|
|
7
9
|
module TTS
|
|
8
10
|
VOICES_DIR = File.join(Dir.home, ".local", "share", "psn", "voices")
|
|
9
11
|
DATA_DIR = File.join(Dir.home, ".local", "share", "personality", "data")
|
|
10
|
-
DEFAULT_VOICE = "
|
|
12
|
+
DEFAULT_VOICE = "bt7274"
|
|
11
13
|
|
|
12
14
|
PID_FILE = File.join(DATA_DIR, "tts.pid")
|
|
13
15
|
WAV_FILE = File.join(DATA_DIR, "tts_current.wav")
|
|
14
16
|
NATURAL_STOP_FLAG = File.join(DATA_DIR, "tts_natural_stop")
|
|
15
17
|
|
|
18
|
+
# XTTS configuration
|
|
19
|
+
XTTS_HOST = ENV.fetch("XTTS_HOST", "junkpile")
|
|
20
|
+
XTTS_PORT = ENV.fetch("XTTS_PORT", "5002")
|
|
21
|
+
XTTS_URL = "http://#{XTTS_HOST}:#{XTTS_PORT}"
|
|
22
|
+
|
|
23
|
+
# Backend selection: "auto" selects based on language (pl=xtts, en=piper)
|
|
24
|
+
BACKEND = ENV.fetch("TTS_BACKEND", "auto") # "piper", "xtts", or "auto"
|
|
25
|
+
|
|
16
26
|
PIPER_VOICES_BASE_URL = "https://huggingface.co/rhasspy/piper-voices/resolve/main"
|
|
17
27
|
|
|
28
|
+
# Audio padding (matches XTTS server)
|
|
29
|
+
PADDING_MS = 250
|
|
30
|
+
|
|
18
31
|
class << self
|
|
19
32
|
# --- Synthesis & Playback ---
|
|
20
33
|
|
|
21
|
-
def speak(text, voice: nil)
|
|
34
|
+
def speak(text, voice: nil, language: nil)
|
|
22
35
|
stop_current
|
|
23
36
|
voice ||= active_voice
|
|
37
|
+
language ||= detect_language(text)
|
|
24
38
|
|
|
25
|
-
|
|
26
|
-
return {error: "Voice not found: #{voice}"} unless model_path
|
|
39
|
+
FileUtils.mkdir_p(DATA_DIR)
|
|
27
40
|
|
|
28
|
-
|
|
29
|
-
|
|
41
|
+
# Select backend: auto mode uses XTTS for Polish, piper for English
|
|
42
|
+
backend = select_backend(language)
|
|
30
43
|
|
|
31
|
-
|
|
44
|
+
result = if backend == "xtts"
|
|
45
|
+
synthesize_xtts(text, voice: voice, language: language)
|
|
46
|
+
else
|
|
47
|
+
synthesize_piper(text, voice: voice)
|
|
48
|
+
end
|
|
32
49
|
|
|
33
|
-
|
|
34
|
-
_, stderr, status = Open3.capture3(
|
|
35
|
-
piper_bin, "--model", model_path, "--output_file", WAV_FILE,
|
|
36
|
-
stdin_data: text
|
|
37
|
-
)
|
|
38
|
-
return {error: "piper failed: #{stderr}"} unless status.success?
|
|
50
|
+
return result if result[:error]
|
|
39
51
|
|
|
40
52
|
# Play audio (macOS: afplay, Linux: aplay)
|
|
41
53
|
player = player_command
|
|
@@ -44,11 +56,11 @@ module Personality
|
|
|
44
56
|
pid = spawn(player, WAV_FILE, [:out, :err] => "/dev/null")
|
|
45
57
|
save_pid(pid)
|
|
46
58
|
|
|
47
|
-
{speaking: true, voice: voice, pid: pid}
|
|
59
|
+
{speaking: true, voice: voice, pid: pid, backend: backend}
|
|
48
60
|
end
|
|
49
61
|
|
|
50
|
-
def speak_and_wait(text, voice: nil)
|
|
51
|
-
result = speak(text, voice: voice)
|
|
62
|
+
def speak_and_wait(text, voice: nil, language: nil)
|
|
63
|
+
result = speak(text, voice: voice, language: language)
|
|
52
64
|
return result if result[:error]
|
|
53
65
|
|
|
54
66
|
Process.wait(result[:pid])
|
|
@@ -95,21 +107,144 @@ module Personality
|
|
|
95
107
|
# --- Voice Management ---
|
|
96
108
|
|
|
97
109
|
# Resolves a voice name for the configured backend.
#
# With the XTTS backend the name itself is returned when that voice is
# available; otherwise the path of "<name>.onnx" under VOICES_DIR is
# returned when the file exists. Returns nil when nothing matches.
def find_voice(name)
  if BACKEND == "xtts"
    # XTTS voices are speaker embeddings on junkpile
    return xtts_voice_available?(name) ? name : nil
  end

  # Piper: check for .onnx file
  onnx_path = File.join(VOICES_DIR, "#{name}.onnx")
  return nil unless File.exist?(onnx_path)

  onnx_path
end
|
|
101
121
|
|
|
102
122
|
# Lists the voices of the configured backend: XTTS voices when BACKEND is
# "xtts", Piper voices otherwise.
def list_voices
  (BACKEND == "xtts") ? list_xtts_voices : list_piper_voices
end
|
|
129
|
+
|
|
130
|
+
# Downloads a Piper voice. With the XTTS backend nothing is downloaded and
# an {error:} hash is returned instead.
def download_voice(voice_name)
  return {error: "XTTS voices are pre-installed on #{XTTS_HOST}"} if BACKEND == "xtts"

  download_piper_voice(voice_name)
end
|
|
137
|
+
|
|
138
|
+
# The voice to use when none is given: the PERSONALITY_VOICE environment
# variable when set, DEFAULT_VOICE otherwise.
def active_voice
  ENV.fetch("PERSONALITY_VOICE") { DEFAULT_VOICE }
end
|
|
141
|
+
|
|
142
|
+
# Public reader for the BACKEND constant.
def backend
  BACKEND
end
|
|
145
|
+
|
|
146
|
+
private
|
|
147
|
+
|
|
148
|
+
# --- XTTS Backend ---
|
|
149
|
+
|
|
150
|
+
# Synthesizes +text+ by POSTing it to the XTTS server and writes the
# response body to WAV_FILE.
#
# @param text [String] text to speak
# @param voice [String] accepted for signature parity with the Piper
#   backend; NOTE(review): it is not sent to the server -- confirm whether
#   the /synthesize endpoint expects a voice/speaker field
# @param language [String] language code sent to the server
# @return [Hash] {synthesized: true} on success, {error: message} otherwise
#   (network failures are reported this way rather than raised)
def synthesize_xtts(text, voice:, language:)
  uri = URI.parse("#{XTTS_URL}/synthesize")

  request = Net::HTTP::Post.new(uri)
  request["Content-Type"] = "application/json"
  request.body = JSON.generate({text: text, language: language})

  # A short open timeout keeps an unreachable host from stalling the caller.
  response = Net::HTTP.start(uri.host, uri.port, open_timeout: 5, read_timeout: 30) do |http|
    http.request(request)
  end

  unless response.is_a?(Net::HTTPSuccess)
    return {error: "XTTS synthesis failed: #{response.code} #{response.body}"}
  end

  File.binwrite(WAV_FILE, response.body)
  {synthesized: true}
rescue Errno::ECONNREFUSED
  {error: "XTTS server not running on #{XTTS_HOST}:#{XTTS_PORT}"}
rescue Net::ReadTimeout
  {error: "XTTS synthesis timed out"}
rescue Net::OpenTimeout
  {error: "XTTS server unreachable at #{XTTS_HOST}:#{XTTS_PORT}: connection timed out"}
rescue SocketError, Errno::EHOSTUNREACH, Errno::ENETUNREACH, Errno::ECONNRESET, Errno::ETIMEDOUT => e
  # Name resolution failures and routing problems used to escape as
  # exceptions; report them like every other synthesis failure.
  {error: "XTTS server unreachable at #{XTTS_HOST}:#{XTTS_PORT}: #{e.message}"}
end
|
|
172
|
+
|
|
173
|
+
# True only for the voices XTTS has been trained on; at present that is
# just "bt7274".
def xtts_voice_available?(name)
  %w[bt7274].include?(name)
end
|
|
177
|
+
|
|
178
|
+
# Lists XTTS voices by probing the server's /health endpoint.
#
# @return [Array<Hash>] a single entry for "bt7274" when the server answers
#   with a success status; an empty array when it answers with anything
#   else or cannot be reached at all
def list_xtts_voices
  uri = URI.parse("#{XTTS_URL}/health")

  # Explicit, short timeouts: this is only a health probe and must not hang
  # a voice listing.
  response = Net::HTTP.start(uri.host, uri.port, open_timeout: 3, read_timeout: 5) do |http|
    http.get(uri.request_uri)
  end

  if response.is_a?(Net::HTTPSuccess)
    [{name: "bt7274", backend: "xtts", server: "#{XTTS_HOST}:#{XTTS_PORT}"}]
  else
    []
  end
rescue Errno::ECONNREFUSED, Errno::EHOSTUNREACH, Errno::ENETUNREACH, Errno::ECONNRESET,
       Errno::ETIMEDOUT, SocketError, Net::OpenTimeout, Net::ReadTimeout
  # Any way of failing to reach the server means "no voices available".
  []
end
|
|
191
|
+
|
|
192
|
+
# --- Piper Backend ---
|
|
193
|
+
|
|
194
|
+
# Synthesizes +text+ with the local piper binary.
#
# piper writes to "<WAV_FILE>.raw"; that file is then passed through
# add_silence_padding to produce WAV_FILE.
#
# @return [Hash] {synthesized: true}, or {error: message} when the voice or
#   the piper binary is missing or piper exits unsuccessfully
def synthesize_piper(text, voice:)
  model = find_voice(voice)
  return {error: "Voice not found: #{voice}"} unless model

  piper = find_piper
  return {error: "piper not installed"} unless piper

  unpadded = "#{WAV_FILE}.raw"
  argv = [piper, "--model", model, "--output_file", unpadded]
  # The text is fed to piper on stdin.
  _stdout, err_text, exit_status = Open3.capture3(*argv, stdin_data: text)

  return {error: "piper failed: #{err_text}"} unless exit_status.success?

  # Add 250ms silence at start (matches XTTS padding)
  add_silence_padding(unpadded, WAV_FILE)

  {synthesized: true}
end
|
|
214
|
+
|
|
215
|
+
# Produces +output_wav+ from +input_wav+, prepending PADDING_MS of silence
# when sox is available and succeeds. In every case +input_wav+ is consumed
# (removed or renamed) and +output_wav+ ends up holding audio.
#
# @param input_wav [String] path of the freshly synthesized WAV file
# @param output_wav [String] path the (padded) audio should end up at
def add_silence_padding(input_wav, output_wav)
  sox_bin = `which sox 2>/dev/null`.strip
  padding_sec = PADDING_MS / 1000.0

  padded = false
  if !sox_bin.empty? && File.executable?(sox_bin)
    # Use sox to pad silence at start. The input type is forced to WAV
    # because sox infers formats from the file extension and the caller
    # passes a ".raw"-suffixed file that holds WAV data.
    padded = system(sox_bin, "-t", "wav", input_wav, output_wav, "pad", padding_sec.to_s, "0",
      [:out, :err] => "/dev/null")
  end

  if padded
    FileUtils.rm_f(input_wav)
  else
    # Fallback: just rename (no padding). Also taken when sox fails, so the
    # synthesized audio is never thrown away.
    FileUtils.mv(input_wav, output_wav)
  end
end
|
|
229
|
+
|
|
230
|
+
# Locates the piper executable: ~/.local/bin/piper is preferred, then
# whatever `which piper` reports. Returns nil when neither is executable.
def find_piper
  home_install = File.join(Dir.home, ".local", "bin", "piper")
  on_path = `which piper 2>/dev/null`.strip

  return home_install if File.executable?(home_install)
  return on_path if !on_path.empty? && File.executable?(on_path)

  nil
end
|
|
236
|
+
|
|
237
|
+
def list_piper_voices
|
|
103
238
|
return [] unless Dir.exist?(VOICES_DIR)
|
|
104
239
|
|
|
105
240
|
Dir.glob(File.join(VOICES_DIR, "*.onnx")).map do |path|
|
|
106
241
|
name = File.basename(path, ".onnx")
|
|
107
242
|
size_mb = File.size(path) / (1024.0 * 1024)
|
|
108
|
-
{name: name, path: path, size_mb: size_mb.round(1)}
|
|
243
|
+
{name: name, path: path, size_mb: size_mb.round(1), backend: "piper"}
|
|
109
244
|
end.sort_by { |v| v[:name].downcase }
|
|
110
245
|
end
|
|
111
246
|
|
|
112
|
-
def
|
|
247
|
+
def download_piper_voice(voice_name)
|
|
113
248
|
FileUtils.mkdir_p(VOICES_DIR)
|
|
114
249
|
|
|
115
250
|
model_path = File.join(VOICES_DIR, "#{voice_name}.onnx")
|
|
@@ -120,17 +255,14 @@ module Personality
|
|
|
120
255
|
parts = voice_name.split("-")
|
|
121
256
|
return {error: "Invalid voice format"} if parts.length < 2
|
|
122
257
|
|
|
123
|
-
lang = parts[0]
|
|
124
|
-
lang_short = lang.split("_")[0]
|
|
125
|
-
name = parts[1]
|
|
258
|
+
lang = parts[0]
|
|
259
|
+
lang_short = lang.split("_")[0]
|
|
260
|
+
name = parts[1]
|
|
126
261
|
quality = parts[2] || "medium"
|
|
127
262
|
|
|
128
263
|
model_url = "#{PIPER_VOICES_BASE_URL}/#{lang_short}/#{lang}/#{name}/#{quality}/#{voice_name}.onnx"
|
|
129
264
|
config_url = "#{PIPER_VOICES_BASE_URL}/#{lang_short}/#{lang}/#{name}/#{quality}/#{voice_name}.onnx.json"
|
|
130
265
|
|
|
131
|
-
require "net/http"
|
|
132
|
-
require "uri"
|
|
133
|
-
|
|
134
266
|
download_file(model_url, model_path)
|
|
135
267
|
download_file(config_url, config_path)
|
|
136
268
|
|
|
@@ -142,18 +274,12 @@ module Personality
|
|
|
142
274
|
{error: "Download failed: #{e.message}"}
|
|
143
275
|
end
|
|
144
276
|
|
|
145
|
-
|
|
146
|
-
ENV.fetch("PERSONALITY_VOICE", DEFAULT_VOICE)
|
|
147
|
-
end
|
|
148
|
-
|
|
149
|
-
private
|
|
277
|
+
# --- Utilities ---
|
|
150
278
|
|
|
151
|
-
def
|
|
152
|
-
#
|
|
153
|
-
[
|
|
154
|
-
|
|
155
|
-
`which piper 2>/dev/null`.strip
|
|
156
|
-
].find { |p| !p.empty? && File.executable?(p) }
|
|
279
|
+
# Guesses the language of +text+ with a simple heuristic: any Polish
# diacritic means "pl", everything else (including nil or empty text)
# means "en".
#
# @param text [String, nil] text about to be spoken
# @return [String] "pl" or "en"
def detect_language(text)
  polish_chars = /[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]/
  # to_s keeps a nil text from raising NoMethodError.
  text.to_s.match?(polish_chars) ? "pl" : "en"
end
|
|
158
284
|
|
|
159
285
|
def player_command
|
|
@@ -164,6 +290,13 @@ module Personality
|
|
|
164
290
|
end
|
|
165
291
|
end
|
|
166
292
|
|
|
293
|
+
# Chooses the synthesis backend for +language+.
#
# An explicit BACKEND setting wins. In "auto" mode Polish goes to XTTS
# (trained voice) and everything else to piper (fast).
def select_backend(language)
  if BACKEND == "auto"
    (language == "pl") ? "xtts" : "piper"
  else
    BACKEND
  end
end
|
|
299
|
+
|
|
167
300
|
# Writes +pid+, converted to a string, to PID_FILE.
def save_pid(pid)
  pid_text = pid.to_s
  File.write(PID_FILE, pid_text)
end
|
data/lib/personality/version.rb
CHANGED