npm - voicesmith-mcp - Versions diffs - 1.0.10 → 1.0.12 - Mend

voicesmith-mcp 1.0.10 → 1.0.12

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (4)

package/bin/utils.js +29 -87
package/package.json +1 -1
package/stt/__pycache__/mic_capture.cpython-314.pyc +0 -0
package/stt/mic_capture.py +16 -0

package/bin/utils.js CHANGED Viewed

@@ -325,14 +325,17 @@ function generateVoiceRules(mainAgentName) {
     content = fs.readFileSync(tplPath, "utf8");
     content = content.replace(/\{\{MAIN_AGENT\}\}/g, mainAgentName);
   } else {
-    // Fallback inline template
+    // Fallback inline template (mirrors templates/voice-rules.md)
     content = `# Voice Behavior Rules (VoiceSmith MCP)
 You have access to voice tools via the VoiceSmith MCP server.
 ## Your Voice
-- You are **${mainAgentName}**. Always call \`speak\` with \`name: "${mainAgentName}"\` — this is your voice.
-- Do not use "${mainAgentName}" for sub-agents. Each agent needs its own unique name.
+- Your default voice name is **${mainAgentName}**, but your actual assigned name may differ if another session claimed it first.
+- **IMPORTANT:** If your session context says "Your assigned voice for this session is: [Name]", use THAT name — not "${mainAgentName}". This is your real identity for this session.
+- On your first response, speak a brief intro using your assigned name: "[Name] here, ready to go."
+- Do not use your assigned name for sub-agents. Each agent needs its own unique name.
+- Tone: Be conversational and natural. Match the user's energy — casual if they're casual, focused if they're focused.
 ## Voice Switching
 - If the user asks to switch to a voice and \`speak\` returns \`"error": "name_occupied"\`, tell the user that voice is occupied by another session.
@@ -340,32 +343,38 @@ You have access to voice tools via the VoiceSmith MCP server.
 - Do NOT silently fall back to a different voice.
 ## Speaking
-- Speak twice per response:
-  1. **Opening** — Brief acknowledgment when starting work. Use \`block: false\`.
-  2. **Closing** — Summary when done. Use \`block: true\`. Never skip this.
-- Keep spoken messages to 1-2 sentences. Write details, speak summaries.
-- Do not speak code, file paths, or long lists aloud.
-- Speak at transitions only: start, finish, error, question.
+- **Opening** — Only speak at the start when you have something meaningful to say (e.g., clarifying your approach, flagging an issue). Do NOT speak filler acknowledgments like "Let me look into that." Use \`block: false\` when you do speak an opening.
+- **Closing** — Always speak a summary when done. Use \`block: true\`. Never skip the closing.
+- **Questions requiring user input → use \`speak_then_listen\` as your closing.** If the user literally cannot continue without providing input (e.g., choosing between options, confirming a destructive action, providing missing info), use \`speak_then_listen\`. If you can reasonably continue without their answer, use regular \`speak\`.
+- Keep spoken output brief — prefer 1-2 sentences, never exceed 3. Write details, speak summaries. No code or paths aloud.
+## Speed Preferences
+- The \`speak\` tool accepts a \`speed\` parameter (default 1.0). Values < 1.0 are slower, > 1.0 are faster.
+- If the user asks to speak slower or faster, adjust the speed and remember their preference for the session.
 ## Listening
-- When asking a short-answer question, use \`speak_then_listen\`.
-- If listen times out or is cancelled, fall back to text. Do not retry.
+- Use \`speak_then_listen\` whenever you need user input — it combines speaking and opening the mic in one call.
+- If \`listen\` returns timeout or cancelled, fall back to requesting text input. Do not retry \`listen\`.
 ## Sub-Agents
-- Before assigning a name to a sub-agent, call \`get_voice_registry\` to see which names are taken and which voices are available.
-- Pick a name that matches an available Kokoro voice (e.g., af_nova → "Nova", am_fenrir → "Fenrir").
+- Pick voice names matching available Kokoro voices (the voice ID suffix is the name — e.g., af_nova → "Nova", am_fenrir → "Fenrir").
 - Each sub-agent must use its own unique name. Never reuse "${mainAgentName}".
-- On handoffs, both agents speak: outgoing announces, incoming acknowledges.
+- On handoffs, both agents speak: the outgoing agent announces the handoff, the incoming agent acknowledges before starting.
+## Error Handling
+- If \`speak\` or \`speak_then_listen\` fails, fall back to text silently. Do not retry.
+- If \`listen\` times out, fall back to text. Do not retry.
 ## Fallback
-- If voice tools are not available, respond in text only.
-- If muted, \`speak\` succeeds silently. Do not call \`unmute\` unless asked.`;
+- If voice tools are not available, respond in text only. Do not mention voice capabilities.
+- If muted, \`speak\` succeeds silently. Do not call \`unmute\` unless the user asks.`;
   }
   return content;
 }
 function generateCursorRule(mainAgentName) {
+  const rules = generateVoiceRules(mainAgentName);
   return `---
 description: Voice interaction rules for VoiceSmith MCP server
 globs:
@@ -373,83 +382,16 @@ alwaysApply: true
 ---
 ${VOICE_RULES_SENTINEL}
-# Voice Behavior Rules (VoiceSmith MCP)
-You have access to voice tools via the VoiceSmith MCP server.
-## Your Voice
-- Your default voice name is **${mainAgentName}**, but your actual assigned name may differ if another session claimed it first.
-- **IMPORTANT:** If your session context says "Your assigned voice for this session is: [Name]", use THAT name — not "${mainAgentName}". This is your real identity for this session.
-- On your first response, speak a brief intro using your assigned name: "[Name] here, ready to go."
-- Do not use your assigned name for sub-agents.
-## Voice Switching
-- If the user asks to switch to a voice and \`speak\` returns \`"error": "name_occupied"\`, tell the user that voice is occupied by another session.
-- Then call \`get_voice_registry\` and show the user which voices are available to pick from.
-- Do NOT silently fall back to a different voice.
-## Speaking
-- Speak twice per response:
-  1. **Opening** — Brief acknowledgment. Use \`block: false\`.
-  2. **Closing** — Summary when done. Use \`block: true\`. Never skip.
-- **Questions that need user input → use \`speak_then_listen\` as your closing voice.** If your response asks the user to make a decision, provide information, or confirm something (e.g., "which approach?", "should I?", "want me to?", "does this look right?"), your closing voice MUST be \`speak_then_listen\` — not regular \`speak\`. This way the mic opens right after you ask.
-- Rhetorical wrap-ups ("What's next?", "Standing by.") do NOT require listen — use regular \`speak\` for those.
-- 1-2 sentences max. Write details, speak summaries. No code or paths aloud.
-- Speak at transitions: start, finish, error, question.
-## Listening
-- Use \`speak_then_listen\` whenever you need user input — it is your closing voice AND listen in one call.
-- Fall back to text on timeout. Do not retry listen.
-## Sub-Agents
-- Call \`get_voice_registry\` to find available voice names before assigning.
-- Pick names matching available Kokoro voices (e.g., af_nova → "Nova").
-- Never reuse "${mainAgentName}". On handoffs, both agents speak.
-## Fallback
-- No voice tools? Text only. Muted? Don't call \`unmute\` unless asked.
+${rules}
 `;
 }
 function generateAppendBlock(mainAgentName) {
-  // Block to append to CLAUDE.md or AGENTS.md
+  // Block to append to CLAUDE.md or AGENTS.md — reads from the template
+  const rules = generateVoiceRules(mainAgentName);
   return `
 ${VOICE_RULES_SENTINEL}
-# Voice Behavior Rules (VoiceSmith MCP)
-You have access to voice tools via the VoiceSmith MCP server.
-## Your Voice
-- Your default voice name is **${mainAgentName}**, but your actual assigned name may differ if another session claimed it first.
-- **IMPORTANT:** If your session context says "Your assigned voice for this session is: [Name]", use THAT name — not "${mainAgentName}". This is your real identity for this session.
-- On your first response, speak a brief intro using your assigned name: "[Name] here, ready to go."
-- Do not use your assigned name for sub-agents.
-## Voice Switching
-- If the user asks to switch to a voice and \`speak\` returns \`"error": "name_occupied"\`, tell the user that voice is occupied by another session.
-- Then call \`get_voice_registry\` and show the user which voices are available to pick from.
-- Do NOT silently fall back to a different voice.
-## Speaking
-- Speak twice per response:
-  1. **Opening** — Brief acknowledgment. Use \`block: false\`.
-  2. **Closing** — Summary when done. Use \`block: true\`. Never skip.
-- **Questions that need user input → use \`speak_then_listen\` as your closing voice.** If your response asks the user to make a decision, provide information, or confirm something (e.g., "which approach?", "should I?", "want me to?", "does this look right?"), your closing voice MUST be \`speak_then_listen\` — not regular \`speak\`. This way the mic opens right after you ask.
-- Rhetorical wrap-ups ("What's next?", "Standing by.") do NOT require listen — use regular \`speak\` for those.
-- 1-2 sentences max. Write details, speak summaries. No code or paths aloud.
-- Speak at transitions: start, finish, error, question.
-## Listening
-- Use \`speak_then_listen\` whenever you need user input — it is your closing voice AND listen in one call.
-- Fall back to text on timeout. Do not retry listen.
-## Sub-Agents
-- Call \`get_voice_registry\` to find available voice names before assigning.
-- Pick names matching available Kokoro voices (e.g., af_nova → "Nova").
-- Never reuse "${mainAgentName}". On handoffs, both agents speak.
-## Fallback
-- No voice tools? Text only. Muted? Don't call \`unmute\` unless asked.
+${rules}
 `;
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "voicesmith-mcp",
-  "version": "1.0.10",
+  "version": "1.0.12",
   "description": "Local AI voice for coding assistants — TTS & STT via MCP. Kokoro ONNX + faster-whisper, fully offline.",
   "bin": {
     "voicesmith-mcp": "bin/cli.js"

package/stt/__pycache__/mic_capture.cpython-314.pyc CHANGED Viewed

Binary file

package/stt/mic_capture.py CHANGED Viewed

@@ -61,6 +61,11 @@ class MicCapture:
         silence_duration = 0.0
         loop = asyncio.get_event_loop()
+        # Reset VAD state — the LSTM hidden state and context window must
+        # be cleared between recordings to avoid stale state from previous
+        # audio affecting speech detection.
+        vad.reset()
         stream = None
         try:
             stream = sd.InputStream(
@@ -73,6 +78,17 @@ class MicCapture:
             stream.start()
             logger.info("Microphone recording started")
+            # Discard the first ~200ms of audio to avoid picking up residual
+            # speaker output (Tink sound or TTS playback that just finished).
+            # This prevents VAD from detecting speaker bleed as "speech" and
+            # then cutting off when the bleed stops.
+            flush_chunks = int(0.2 * self._sample_rate / 512)  # ~6 chunks
+            for _ in range(flush_chunks):
+                try:
+                    self._audio_queue.get(timeout=0.1)
+                except queue.Empty:
+                    break
             start_time = asyncio.get_event_loop().time()
             while not self._stop_flag: