discoclaw 1.2.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/.context/voice.md +30 -2
  2. package/.env.example +7 -3
  3. package/.env.example.full +13 -32
  4. package/README.md +1 -1
  5. package/dist/cli/dashboard.js +7 -1
  6. package/dist/cli/dashboard.test.js +0 -4
  7. package/dist/cli/init-wizard.js +4 -8
  8. package/dist/cli/init-wizard.test.js +4 -10
  9. package/dist/config.js +5 -38
  10. package/dist/config.test.js +8 -72
  11. package/dist/cron/executor.js +72 -1
  12. package/dist/dashboard/api/metrics.js +7 -0
  13. package/dist/dashboard/api/metrics.test.js +16 -0
  14. package/dist/dashboard/api/traces.js +14 -0
  15. package/dist/dashboard/api/traces.test.js +40 -0
  16. package/dist/dashboard/page.js +187 -8
  17. package/dist/dashboard/server.js +82 -19
  18. package/dist/dashboard/server.test.js +123 -10
  19. package/dist/discord/actions.js +112 -6
  20. package/dist/discord/actions.test.js +117 -1
  21. package/dist/discord/deferred-runner.js +306 -219
  22. package/dist/discord/help-command.js +1 -1
  23. package/dist/discord/message-coordinator.js +4 -36
  24. package/dist/discord/models-command.js +1 -1
  25. package/dist/discord/reaction-handler.js +83 -5
  26. package/dist/discord/reaction-handler.test.js +55 -0
  27. package/dist/discord/verify-push.js +31 -36
  28. package/dist/discord/verify-push.test.js +34 -6
  29. package/dist/discord/voice-command.js +1 -31
  30. package/dist/discord/voice-command.test.js +21 -259
  31. package/dist/discord/voice-status-command.js +3 -22
  32. package/dist/discord/voice-status-command.test.js +16 -124
  33. package/dist/discord-followup.test.js +133 -0
  34. package/dist/health/config-doctor.js +5 -27
  35. package/dist/health/config-doctor.test.js +1 -4
  36. package/dist/index.js +15 -28
  37. package/dist/observability/trace-store.js +56 -0
  38. package/dist/observability/trace-utils.js +31 -0
  39. package/dist/runtime/codex-cli.js +3 -2
  40. package/dist/runtime/codex-cli.test.js +33 -0
  41. package/dist/runtime/model-tiers.js +1 -1
  42. package/dist/runtime/model-tiers.test.js +9 -0
  43. package/dist/runtime/openai-tool-schemas.js +17 -0
  44. package/dist/runtime-overrides.js +2 -3
  45. package/dist/runtime-overrides.test.js +27 -193
  46. package/dist/tasks/store.js +10 -6
  47. package/dist/tasks/store.test.js +44 -0
  48. package/dist/tasks/task-action-executor.test.js +162 -50
  49. package/dist/tasks/task-action-mutations.js +22 -2
  50. package/dist/tasks/task-action-read-ops.js +7 -1
  51. package/dist/tasks/task-action-runner-types.js +19 -1
  52. package/dist/voice/audio-pipeline.js +183 -96
  53. package/dist/voice/audio-receiver.js +8 -0
  54. package/dist/voice/audio-receiver.test.js +16 -0
  55. package/dist/voice/conversation-buffer.js +16 -6
  56. package/dist/voice/providers/gemini-live-provider.js +481 -0
  57. package/dist/voice/providers/gemini-live-provider.test.js +834 -0
  58. package/dist/voice/providers/gemini-live-responder.js +267 -0
  59. package/dist/voice/providers/gemini-live-responder.test.js +615 -0
  60. package/dist/voice/providers/gemini-live-token-estimator.js +100 -0
  61. package/dist/voice/providers/gemini-live-token-estimator.test.js +160 -0
  62. package/dist/voice/providers/gemini-live-types.js +32 -0
  63. package/dist/voice/providers/gemini-tool-mapper.js +91 -0
  64. package/dist/voice/providers/gemini-tool-mapper.test.js +253 -0
  65. package/dist/voice/providers/index.js +3 -0
  66. package/dist/voice/voice-prompt-builder.js +26 -17
  67. package/dist/voice/voice-prompt-builder.test.js +16 -1
  68. package/docs/configuration.md +4 -9
  69. package/docs/official-docs.md +6 -9
  70. package/docs/runtime-switching.md +1 -1
  71. package/package.json +1 -1
  72. package/dist/voice/audio-pipeline.test.js +0 -619
  73. package/dist/voice/stt-deepgram.js +0 -154
  74. package/dist/voice/stt-deepgram.test.js +0 -275
  75. package/dist/voice/stt-factory.js +0 -42
  76. package/dist/voice/stt-factory.test.js +0 -45
  77. package/dist/voice/stt-openai.js +0 -156
  78. package/dist/voice/stt-openai.test.js +0 -281
  79. package/dist/voice/tts-cartesia.js +0 -169
  80. package/dist/voice/tts-cartesia.test.js +0 -228
  81. package/dist/voice/tts-deepgram.js +0 -84
  82. package/dist/voice/tts-deepgram.test.js +0 -220
  83. package/dist/voice/tts-factory.js +0 -52
  84. package/dist/voice/tts-factory.test.js +0 -53
  85. package/dist/voice/tts-openai.js +0 -70
  86. package/dist/voice/tts-openai.test.js +0 -138
  87. package/dist/voice/types.test.js +0 -84
package/.context/voice.md CHANGED
@@ -29,10 +29,16 @@ Two native npm packages power the Discord voice integration:
29
29
  | `src/voice/transcript-mirror.ts` | Posts user transcriptions and bot responses to a text channel |
30
30
  | `src/voice/voice-action-flags.ts` | Restricted action subset for voice invocations (messaging + tasks + memory only) |
31
31
  | `src/voice/conversation-buffer.ts` | Per-guild conversation ring buffer (10 turns) — stores user/model exchanges in memory; backfills from voice-log channel on join |
32
+ | `src/voice/providers/gemini-live-types.ts` | TypeScript interfaces for Gemini Live: `GeminiLiveOpts`, `GeminiLiveEvent`, `GeminiLiveState` |
33
+ | `src/voice/providers/gemini-live-provider.ts` | Bidirectional WebSocket session wrapper for the Gemini Multimodal Live API — connect/disconnect, audio send/receive, reconnect with exponential backoff |
34
+ | `src/voice/providers/gemini-live-responder.ts` | Bridges `GeminiLiveProvider` audio/text events to Discord `AudioPlayer` playback and `TranscriptMirror` logging |
35
+ | `src/voice/providers/index.ts` | Barrel re-export for Gemini Live provider modules |
32
36
  | `src/discord/actions-voice.ts` | Discord action types: `voiceJoin`, `voiceLeave`, `voiceStatus`, `voiceMute`, `voiceDeafen` |
33
37
 
34
38
  ## Audio Data Flow
35
39
 
40
+ ### Default pipeline (`voiceProvider: 'pipeline'`)
41
+
36
42
  ```
37
43
  User speaks in Discord voice channel
38
44
  → @discordjs/voice receiver emits Opus packets per user
@@ -47,6 +53,23 @@ User speaks in Discord voice channel
47
53
  → AudioPlayer → Discord voice connection
48
54
  ```
49
55
 
56
+ ### Gemini Live (`voiceProvider: 'gemini-live'`)
57
+
58
+ Bypasses separate STT/TTS/AI stages — Gemini handles speech recognition, reasoning, and speech synthesis in a single bidirectional WebSocket session.
59
+
60
+ ```
61
+ User speaks in Discord voice channel
62
+ → @discordjs/voice receiver emits Opus packets per user
63
+ → AudioReceiver: allowlist gate → OpusDecoder (48 kHz stereo PCM)
64
+ → downsample to 16 kHz mono
65
+ → SttProvider shim → GeminiLiveProvider.sendAudio() (WebSocket)
66
+ → Gemini Live: STT + reasoning + TTS (server-side)
67
+ ← audio events (24 kHz mono PCM) + text events
68
+ → GeminiLiveResponder: upsampleToDiscord (48 kHz stereo)
69
+ → AudioPlayer → Discord voice connection
70
+ → onBotResponse callback → TranscriptMirror (text channel)
71
+ ```
72
+
50
73
  ## Key Patterns
51
74
 
52
75
  - **Allowlist gating** — `AudioReceiver` only subscribes to users in `DISCORD_ALLOW_USER_IDS`. Empty allowlist = ignore everyone (fail-closed).
@@ -56,6 +79,8 @@ User speaks in Discord voice channel
56
79
  - **Generation-based cancellation** — `VoiceResponder` increments a generation counter on each new transcription. If a newer transcription arrives mid-pipeline, the older one is silently abandoned.
57
80
  - **Barge-in** — Gated on a non-empty STT transcription result, not the raw VAD `speaking.start` event. Echo from the bot's own TTS leaking through the user's mic produces empty transcriptions and is ignored. Only when `VoiceResponder.handleTranscription()` receives a non-empty transcript while the player is active does it stop playback and advance the generation counter. This eliminates false positives from echo without relying on a static grace-period timeout.
58
81
  - **Conversation ring buffer** — `ConversationBuffer` maintains a per-guild 10-turn ring buffer of user/model exchanges that gets injected into the voice prompt as formatted conversation history. Turns are appended live during a session. On voice join, the buffer backfills from recent voice-log channel messages so context carries across disconnects. The buffer is cleared when the bot leaves the voice channel.
82
+ - **`SttProvider` shim for Gemini Live** — In `gemini-live` mode, the pipeline still uses `AudioReceiver` for Opus decode and downsampling, but replaces the real STT provider with a lightweight shim object that implements the `SttProvider` interface. The shim's `feedAudio()` forwards PCM frames directly to `GeminiLiveProvider.sendAudio()`, while its `start()`/`stop()`/`onTranscription()` are no-ops. This reuses the existing audio-receive path without duplicating Opus decode or downsample logic.
83
+ - **Session rotation timer** — `GeminiLiveProvider` starts a timer on each successful connection that fires at `DISCOCLAW_GEMINI_SESSION_ROTATION_MS` (default 13 min), proactively triggering a graceful reconnect before Gemini's ~15 min server-side session limit. The timer reuses the existing reconnect-with-resume-handle path (ws-039), so audio gap is minimal. The timer is cleared on disconnect and reset on each reconnect. Set to `0` to disable rotation (the server will eventually kill the session).
59
84
  - **Re-entrancy guard** — `AudioPipelineManager.startPipeline` uses a `starting` set because `VoiceConnection.subscribe()` synchronously fires a Ready state change.
60
85
  - **Error containment** — `VoiceConnectionManager` catches connection errors and destroys the connection to prevent process crashes (e.g. DAVE handshake failures).
61
86
  - **Deepgram TTS 2000-char limit** — Deepgram Aura REST TTS returns HTTP 413 (silent failure) for inputs exceeding ~2000 characters. `tts-deepgram.ts` truncates the input to 2000 chars before sending to prevent silent audio dropouts. If the AI response is unexpectedly long (e.g. from a missing `VOICE_STYLE_INSTRUCTION`), the user will still hear a truncated response rather than silence.
@@ -78,8 +103,9 @@ When `voiceEnabled=true`, the post-connect block in `src/index.ts` initializes t
78
103
  | `DISCOCLAW_VOICE_ENABLED` | `0` | Master switch |
79
104
  | `DISCOCLAW_DISCORD_ACTIONS_VOICE` | `0` | Enable voice action types |
80
105
  | `DISCOCLAW_VOICE_AUTO_JOIN` | `0` | Auto-join when allowlisted user enters |
81
- | `DISCOCLAW_STT_PROVIDER` | `deepgram` | STT backend |
82
- | `DISCOCLAW_TTS_PROVIDER` | `cartesia` | TTS backend (`cartesia`, `deepgram`, `openai`, `kokoro`) |
106
+ | `DISCOCLAW_VOICE_PIPELINE_PROVIDER` | `pipeline` | Voice pipeline mode: `pipeline` (separate STT/AI/TTS stages) or `gemini-live` (single bidirectional Gemini WebSocket). Requires `GEMINI_API_KEY` when set to `gemini-live`. |
107
+ | `DISCOCLAW_STT_PROVIDER` | `deepgram` | STT backend (used in `pipeline` mode only; ignored in `gemini-live` mode) |
108
+ | `DISCOCLAW_TTS_PROVIDER` | `cartesia` | TTS backend (`cartesia`, `deepgram`, `openai`, `kokoro`) (used in `pipeline` mode only; ignored in `gemini-live` mode) |
83
109
  | `DISCOCLAW_VOICE_HOME_CHANNEL` | — | Voice audio channel name/ID used for prompt context (not transcript mirroring) |
84
110
  | `DISCOCLAW_VOICE_LOG_CHANNEL` | — | Text channel name/ID where `TranscriptMirror` posts user transcriptions and bot responses; falls back to bootstrap-provided `voiceLogChannelId` if unset |
85
111
  | `DISCOCLAW_VOICE_MODEL` | `capable` | AI model tier for voice responses |
@@ -89,5 +115,7 @@ When `voiceEnabled=true`, the post-connect block in `src/index.ts` initializes t
89
115
  | `DEEPGRAM_TTS_VOICE` | `aura-2-asteria-en` | Deepgram TTS voice name |
90
116
  | `DEEPGRAM_TTS_SPEED` | `1.3` | Deepgram TTS playback speed (range 0.5–1.5) |
91
117
  | `CARTESIA_API_KEY` | — | Required for cartesia TTS |
118
+ | `DISCOCLAW_GEMINI_SESSION_ROTATION_MS` | `780000` (13 min) | Time before proactive session rotation in `gemini-live` mode. Must be less than Gemini's ~15 min server-side limit. Set to `0` to disable. |
119
+ | `GEMINI_API_KEY` | — | Required when `DISCOCLAW_VOICE_PIPELINE_PROVIDER=gemini-live`. Authenticates the Gemini Multimodal Live WebSocket session. Also used by the `gemini-api` runtime adapter (see `runtime.md`). |
92
120
  | `ANTHROPIC_API_KEY` | — | Enables the Anthropic REST adapter; when set and voice is enabled, voice auto-wires to the direct Messages API path (zero CLI cold-start). See `runtime.md § Anthropic REST Runtime`. |
93
121
  | *(built-in)* | — | Telegraphic style instruction hardcoded into every voice AI invocation — front-loads the answer, strips preambles/markdown/filler, keeps responses short for TTS latency. Not an env var; not overridable by `DISCOCLAW_VOICE_SYSTEM_PROMPT`. |
package/.env.example CHANGED
@@ -90,7 +90,7 @@ DISCORD_GUILD_ID=
90
90
  # - PRIMARY_RUNTIME sets the default adapter the instance boots with.
91
91
  # - `!models set chat <runtime>` can live-switch the main runtime in memory, but chat runtime swaps do not persist.
92
92
  # - Persistent model-role defaults live in data/models.json.
93
- # - Persistent runtime-only overlays live in data/runtime-overrides.json (`voiceRuntime`, `fastRuntime`, `ttsVoice`).
93
+ # - Persistent runtime-only overlays live in data/runtime-overrides.json (`voiceRuntime`, `fastRuntime`).
94
94
  # - `!models reset` writes startup-default model strings back into models.json and clears fast/voice runtime overlays;
95
95
  # it does not rewrite PRIMARY_RUNTIME in .env.
96
96
  # Supported runtime-path notes:
@@ -191,8 +191,12 @@ DISCORD_GUILD_ID=
191
191
  # Voice — configure via `pnpm setup` or `discoclaw init`
192
192
  # ----------------------------------------------------------
193
193
  # Run `pnpm setup` or `discoclaw init` to enable voice interactively,
194
- # or set these vars manually to enable voice chat (STT/TTS via Deepgram).
194
+ # or set these vars manually to enable Gemini Live voice chat.
195
195
  #DISCOCLAW_VOICE_ENABLED=0
196
+ # Gemini Live requires GEMINI_API_KEY.
197
+ # Gemini Live session rotation threshold (ms). The provider proactively reconnects
198
+ # before Gemini's ~15 min session limit to minimize audio gap. Default: 780000 (13 min).
199
+ #DISCOCLAW_GEMINI_SESSION_ROTATION_MS=780000
196
200
  # Text channel used for voice prompt context and actions (e.g. posting action results,
197
201
  # reading pinned notes). Required for full voice functionality when voice is enabled.
198
202
  #DISCOCLAW_VOICE_HOME_CHANNEL= # e.g. "voice"
@@ -200,7 +204,7 @@ DISCORD_GUILD_ID=
200
204
  # (the bot creates a "voice-log" text channel and stores its ID in system-scaffold.json).
201
205
  # Only set this to override the auto-discovered channel.
202
206
  #DISCOCLAW_VOICE_LOG_CHANNEL=
203
- #DEEPGRAM_API_KEY=
207
+ #GEMINI_API_KEY=
204
208
  # Optional voice-only Anthropic runtime for voice responses.
205
209
  # `claude-api` is not a valid PRIMARY_RUNTIME and does not persist in models.json.
206
210
  # Use `!models set voice claude-api` to persist the voice runtime path in runtime-overrides.json.
package/.env.example.full CHANGED
@@ -703,32 +703,22 @@ DISCOCLAW_DISCORD_ACTIONS_IMAGEGEN=0
703
703
  # IMAGEGEN_GEMINI_API_KEY is set.
704
704
 
705
705
  # ----------------------------------------------------------
706
- # Voice (STT/TTS) join voice channels, listen and respond
706
+ # Voice — Gemini Live voice chat
707
707
  # ----------------------------------------------------------
708
- # Master switch — enables voice channel interaction (default: off).
709
- # When enabled, the bot can join Discord voice channels, transcribe speech via STT,
710
- # and respond with synthesized speech via TTS.
708
+ # Master switch — enables Discord voice interaction (default: off).
709
+ # Gemini Live handles speech recognition, reasoning, and speech synthesis in one session.
711
710
  #DISCOCLAW_VOICE_ENABLED=0
712
711
  # Enable voice Discord action category (voiceJoin, voiceLeave, voiceStatus, voiceMute, voiceDeafen).
713
712
  # Requires DISCOCLAW_VOICE_ENABLED=1 to take effect (default: off).
714
713
  #DISCOCLAW_DISCORD_ACTIONS_VOICE=0
715
714
  # Auto-join voice channels when a non-bot user joins, and auto-leave when the last
716
- # non-bot user leaves. Starts/tears down the audio pipeline (STT receiver) automatically.
717
- # Requires DISCOCLAW_VOICE_ENABLED=1 (default: off).
715
+ # non-bot user leaves. Requires DISCOCLAW_VOICE_ENABLED=1 (default: off).
718
716
  #DISCOCLAW_VOICE_AUTO_JOIN=0
719
- # Speech-to-text provider: deepgram (Deepgram Nova-3 API) or whisper (whisper.cpp local).
720
- # deepgram requires DEEPGRAM_API_KEY; whisper runs locally with no API key.
721
- #DISCOCLAW_STT_PROVIDER=deepgram
722
- # Text-to-speech provider: cartesia | deepgram | kokoro | openai.
723
- # cartesia requires CARTESIA_API_KEY; deepgram reuses DEEPGRAM_API_KEY;
724
- # openai requires OPENAI_API_KEY; kokoro runs locally with no API key.
725
- #DISCOCLAW_TTS_PROVIDER=cartesia
726
- # Voice audio channel name or ID — the channel the bot joins for voice interaction.
727
- # Used as the prompt context source (root policy, PA files, channel context, durable memory).
717
+ # Voice text channel name or ID used for prompt context and voice-triggered actions.
728
718
  # The old env var DISCOCLAW_VOICE_TRANSCRIPT_CHANNEL is still accepted as a fallback.
729
719
  # Leave unset to skip voice channel context in prompts.
730
720
  #DISCOCLAW_VOICE_HOME_CHANNEL= # e.g. "voice" if using the default scaffold
731
- # Text channel name or ID for posting voice transcripts (user STT and bot TTS responses).
721
+ # Text channel name or ID for posting voice transcripts.
732
722
  # Optional — auto-discovered via bootstrap (the bot creates "voice-log" and stores its ID
733
723
  # in system-scaffold.json). Only set this to override the auto-discovered channel.
734
724
  # Leave unset to disable transcript mirroring.
@@ -741,20 +731,11 @@ DISCOCLAW_DISCORD_ACTIONS_IMAGEGEN=0
741
731
  # Custom system prompt prepended to voice AI invocations. Max 4000 chars.
742
732
  # Use this to set a conversational tone, brevity instructions, or persona for voice responses.
743
733
  #DISCOCLAW_VOICE_SYSTEM_PROMPT=
744
- # Anthropic API key for direct Messages API access (bypasses Claude CLI cold-start).
745
- # When set and voice is enabled, voice invocations use the Anthropic REST adapter
746
- # instead of the CLI subprocess, eliminating ~2-5s cold-start latency per response.
734
+ # Gemini Live session rotation threshold (ms). The provider proactively reconnects
735
+ # before Gemini's ~15 min session limit to minimize audio gap. Default: 780000 (13 min).
736
+ #DISCOCLAW_GEMINI_SESSION_ROTATION_MS=780000
737
+ # Google Gemini API key. Required when voice is enabled.
738
+ #GEMINI_API_KEY=
739
+ # Anthropic API key for the optional direct Messages API voice runtime.
740
+ # When set, `!models set voice claude-api` can bypass CLI cold-start for voice responses.
747
741
  #ANTHROPIC_API_KEY=
748
- # API key for Deepgram Nova-3 STT. Required when DISCOCLAW_STT_PROVIDER=deepgram.
749
- #DEEPGRAM_API_KEY=
750
- # Deepgram STT model for voice transcription (default: nova-3-conversationalai).
751
- # See https://developers.deepgram.com/docs/models-languages-overview for available models.
752
- #DEEPGRAM_STT_MODEL=nova-3-conversationalai
753
- # Deepgram TTS voice for speech synthesis (default: aura-2-asteria-en).
754
- # See https://developers.deepgram.com/docs/tts-models for available voices.
755
- #DEEPGRAM_TTS_VOICE=aura-2-asteria-en
756
- # Deepgram TTS playback speed (range: 0.5–1.5, default: 1.3).
757
- # Values below 1.0 slow down speech; values above 1.0 speed it up.
758
- #DEEPGRAM_TTS_SPEED=1.3
759
- # API key for Cartesia Sonic-3 TTS. Required when DISCOCLAW_TTS_PROVIDER=cartesia.
760
- #CARTESIA_API_KEY=
package/README.md CHANGED
@@ -39,7 +39,7 @@ No gateways, no proxies, no web UI. Discord *is* the interface.
39
39
 
40
40
  ## Voice — the bot talks back
41
41
 
42
- Real-time voice with STT (Deepgram), TTS (Cartesia), barge-in, and transcript mirroring. Off by default. [Setup guide →](docs/voice.md)
42
+ Real-time voice with Gemini Live, barge-in, tool calls, and transcript mirroring. Off by default. [Setup guide →](docs/voice.md)
43
43
 
44
44
  ## Self-management
45
45
 
@@ -146,7 +146,13 @@ function normalizeRuntimeName(value) {
146
146
  const trimmed = value?.trim().toLowerCase();
147
147
  if (!trimmed)
148
148
  return undefined;
149
- const normalized = trimmed === 'claude_code' ? 'claude' : trimmed;
149
+ let normalized = trimmed === 'claude_code' ? 'claude' : trimmed;
150
+ if (normalized === 'claude-cli')
151
+ normalized = 'claude';
152
+ if (normalized === 'codex-cli')
153
+ normalized = 'codex';
154
+ if (normalized === 'claude' || normalized === 'codex')
155
+ return normalized;
150
156
  return KNOWN_RUNTIMES.has(normalized) ? normalized : undefined;
151
157
  }
152
158
  function trimEnvValue(value) {
@@ -591,7 +591,6 @@ describe('runDashboard', () => {
591
591
  runtimeOverrides: {
592
592
  fastRuntime: 'openrouter',
593
593
  voiceRuntime: 'anthropic',
594
- ttsVoice: 'alloy',
595
594
  },
596
595
  envDefaults: {
597
596
  ...makeDoctorContext().envDefaults,
@@ -628,7 +627,6 @@ describe('runDashboard', () => {
628
627
  expect(saveModelConfigMock).toHaveBeenCalledWith('/repo/data/models.json', {});
629
628
  expect(saveOverridesMock).toHaveBeenCalledWith('/repo/data/runtime-overrides.json', {
630
629
  voiceRuntime: 'anthropic',
631
- ttsVoice: 'alloy',
632
630
  });
633
631
  expect(frames.some((frame) => frame.includes('Reset fast to default: capable. Cleared fastRuntime override. Changes take effect on next service restart.'))).toBe(true);
634
632
  });
@@ -638,7 +636,6 @@ describe('runDashboard', () => {
638
636
  runtimeOverrides: {
639
637
  fastRuntime: 'openrouter',
640
638
  voiceRuntime: 'anthropic',
641
- ttsVoice: 'alloy',
642
639
  },
643
640
  envDefaults: {
644
641
  ...makeDoctorContext().envDefaults,
@@ -675,7 +672,6 @@ describe('runDashboard', () => {
675
672
  expect(saveModelConfigMock).toHaveBeenCalledWith('/repo/data/models.json', {});
676
673
  expect(saveOverridesMock).toHaveBeenCalledWith('/repo/data/runtime-overrides.json', {
677
674
  fastRuntime: 'openrouter',
678
- ttsVoice: 'alloy',
679
675
  });
680
676
  expect(frames.some((frame) => frame.includes('Reset voice to default: capable. Cleared voiceRuntime override. Changes take effect on next service restart.'))).toBe(true);
681
677
  });
@@ -88,10 +88,8 @@ export function buildEnvContent(vals, now = new Date()) {
88
88
  // Voice
89
89
  const voiceKeys = [
90
90
  'DISCOCLAW_VOICE_ENABLED',
91
- 'DEEPGRAM_API_KEY',
91
+ 'GEMINI_API_KEY',
92
92
  'DISCOCLAW_DISCORD_ACTIONS_VOICE',
93
- 'DISCOCLAW_STT_PROVIDER',
94
- 'DISCOCLAW_TTS_PROVIDER',
95
93
  ];
96
94
  const hasVoice = voiceKeys.some((k) => vals[k]);
97
95
  if (hasVoice) {
@@ -347,14 +345,12 @@ export async function runInitWizard() {
347
345
  }
348
346
  values.DISCOCLAW_DISCORD_ACTIONS = '1';
349
347
  // ── Voice setup ───────────────────────────────────────────────────────────
350
- const enableVoice = await ask('\nEnable voice chat? (requires a Deepgram API key — you can skip this and enable later) [y/N] ');
348
+ const enableVoice = await ask('\nEnable voice chat? (requires a Gemini API key — you can skip this and enable later) [y/N] ');
351
349
  if (enableVoice.toLowerCase() === 'y') {
352
- const deepgramKey = await askValidated('Deepgram API key: ', (val) => (val ? null : 'Deepgram API key is required'));
350
+ const geminiKey = await askValidated('Gemini API key: ', (val) => (val ? null : 'Gemini API key is required'));
353
351
  values.DISCOCLAW_VOICE_ENABLED = '1';
354
- values.DEEPGRAM_API_KEY = deepgramKey;
352
+ values.GEMINI_API_KEY = geminiKey;
355
353
  values.DISCOCLAW_DISCORD_ACTIONS_VOICE = '1';
356
- values.DISCOCLAW_STT_PROVIDER = 'deepgram';
357
- values.DISCOCLAW_TTS_PROVIDER = 'deepgram';
358
354
  }
359
355
  // ── Write .env ────────────────────────────────────────────────────────────
360
356
  const envContent = buildEnvContent(values);
@@ -155,16 +155,12 @@ describe('init wizard helpers', () => {
155
155
  DISCORD_TOKEN: 'a.b.c',
156
156
  DISCORD_ALLOW_USER_IDS: '1000000000000000001',
157
157
  DISCOCLAW_VOICE_ENABLED: '1',
158
- DEEPGRAM_API_KEY: 'dg-key',
158
+ GEMINI_API_KEY: 'gm-key',
159
159
  DISCOCLAW_DISCORD_ACTIONS_VOICE: '1',
160
- DISCOCLAW_STT_PROVIDER: 'deepgram',
161
- DISCOCLAW_TTS_PROVIDER: 'deepgram',
162
160
  }, new Date('2026-02-26T00:00:00.000Z'));
163
161
  expect(content).toContain('# VOICE');
164
162
  expect(content).toContain('DISCOCLAW_VOICE_ENABLED=1');
165
- expect(content).toContain('DEEPGRAM_API_KEY=dg-key');
166
- expect(content).toContain('DISCOCLAW_STT_PROVIDER=deepgram');
167
- expect(content).toContain('DISCOCLAW_TTS_PROVIDER=deepgram');
163
+ expect(content).toContain('GEMINI_API_KEY=gm-key');
168
164
  });
169
165
  it('omits voice section when no voice vars are provided', () => {
170
166
  const content = buildEnvContent({
@@ -555,7 +551,7 @@ describe('runInitWizard', () => {
555
551
  '5000000000000000001', // DISCORD_GUILD_ID
556
552
  '', // provider selection -> default (Claude)
557
553
  'y', // enable voice -> yes
558
- 'dg-test-key', // Deepgram API key
554
+ 'gemini-test-key', // Gemini API key
559
555
  ];
560
556
  process.chdir(tmpDir);
561
557
  vi.mocked(createInterface).mockReturnValue(makeReadline(answers));
@@ -573,9 +569,7 @@ describe('runInitWizard', () => {
573
569
  const newEnv = fs.readFileSync(path.join(tmpDir, '.env'), 'utf8');
574
570
  expect(newEnv).toContain('# VOICE');
575
571
  expect(newEnv).toContain('DISCOCLAW_VOICE_ENABLED=1');
576
- expect(newEnv).toContain('DEEPGRAM_API_KEY=dg-test-key');
577
- expect(newEnv).toContain('DISCOCLAW_STT_PROVIDER=deepgram');
578
- expect(newEnv).toContain('DISCOCLAW_TTS_PROVIDER=deepgram');
572
+ expect(newEnv).toContain('GEMINI_API_KEY=gemini-test-key');
579
573
  expect(newEnv).toContain('DISCOCLAW_DISCORD_ACTIONS_VOICE=1');
580
574
  });
581
575
  });
package/dist/config.js CHANGED
@@ -510,8 +510,7 @@ export function parseConfig(env) {
510
510
  const anthropicApiKey = parseTrimmedString(env, 'ANTHROPIC_API_KEY');
511
511
  const voiceEnabled = parseBoolean(env, 'DISCOCLAW_VOICE_ENABLED', false);
512
512
  const voiceAutoJoin = parseBoolean(env, 'DISCOCLAW_VOICE_AUTO_JOIN', false);
513
- const voiceSttProvider = parseEnum(env, 'DISCOCLAW_STT_PROVIDER', ['deepgram', 'whisper', 'openai'], 'deepgram');
514
- const voiceTtsProvider = parseEnum(env, 'DISCOCLAW_TTS_PROVIDER', ['cartesia', 'deepgram', 'kokoro', 'openai'], 'cartesia');
513
+ const geminiSessionRotationMs = parseNonNegativeInt(env, 'DISCOCLAW_GEMINI_SESSION_ROTATION_MS', 780_000);
515
514
  let voiceHomeChannel = parseTrimmedString(env, 'DISCOCLAW_VOICE_HOME_CHANNEL');
516
515
  if (!voiceHomeChannel) {
517
516
  const legacy = parseTrimmedString(env, 'DISCOCLAW_VOICE_TRANSCRIPT_CHANNEL');
@@ -521,20 +520,6 @@ export function parseConfig(env) {
521
520
  }
522
521
  }
523
522
  const voiceLogChannel = parseTrimmedString(env, 'DISCOCLAW_VOICE_LOG_CHANNEL');
524
- const deepgramApiKey = parseTrimmedString(env, 'DEEPGRAM_API_KEY');
525
- const deepgramSttModel = parseTrimmedString(env, 'DEEPGRAM_STT_MODEL') ?? 'nova-3-general';
526
- const deepgramTtsVoice = parseTrimmedString(env, 'DEEPGRAM_TTS_VOICE') ?? 'aura-2-asteria-en';
527
- const deepgramTtsSpeed = (() => {
528
- const raw = parseTrimmedString(env, 'DEEPGRAM_TTS_SPEED');
529
- if (raw == null)
530
- return 1.3;
531
- const n = parseFloat(raw);
532
- if (!Number.isFinite(n) || n < 0.5 || n > 1.5) {
533
- throw new Error(`DEEPGRAM_TTS_SPEED must be a number between 0.5 and 1.5, got "${raw}"`);
534
- }
535
- return n;
536
- })();
537
- const cartesiaApiKey = parseTrimmedString(env, 'CARTESIA_API_KEY');
538
523
  const voiceModelRaw = parseTrimmedString(env, 'DISCOCLAW_VOICE_MODEL');
539
524
  const voiceSystemPrompt = (() => {
540
525
  const raw = parseTrimmedString(env, 'DISCOCLAW_VOICE_SYSTEM_PROMPT');
@@ -545,24 +530,12 @@ export function parseConfig(env) {
545
530
  }
546
531
  return raw;
547
532
  })();
548
- if (voiceEnabled && voiceSttProvider === 'deepgram' && !deepgramApiKey) {
549
- warnings.push('DISCOCLAW_VOICE_ENABLED=1 with STT provider "deepgram" but DEEPGRAM_API_KEY is not set; voice STT will fail at runtime.');
550
- }
551
- if (voiceEnabled && voiceSttProvider === 'openai' && !openaiApiKey) {
552
- warnings.push('DISCOCLAW_VOICE_ENABLED=1 with STT provider "openai" but OPENAI_API_KEY is not set; voice STT will fail at runtime.');
553
- }
554
- if (voiceEnabled && voiceTtsProvider === 'cartesia' && !cartesiaApiKey) {
555
- warnings.push('DISCOCLAW_VOICE_ENABLED=1 with TTS provider "cartesia" but CARTESIA_API_KEY is not set; voice TTS will fail at runtime.');
556
- }
557
- if (voiceEnabled && voiceTtsProvider === 'deepgram' && !deepgramApiKey) {
558
- warnings.push('DISCOCLAW_VOICE_ENABLED=1 with TTS provider "deepgram" but DEEPGRAM_API_KEY is not set; voice TTS will fail at runtime.');
559
- }
560
- if (voiceEnabled && voiceTtsProvider === 'openai' && !openaiApiKey) {
561
- warnings.push('DISCOCLAW_VOICE_ENABLED=1 with TTS provider "openai" but OPENAI_API_KEY is not set; voice TTS will fail at runtime.');
562
- }
563
533
  if (voiceEnabled && !voiceHomeChannel) {
564
534
  warnings.push('DISCOCLAW_VOICE_ENABLED=1 but DISCOCLAW_VOICE_HOME_CHANNEL is not set; voice actions will be disabled (no target channel for action execution).');
565
535
  }
536
+ if (voiceEnabled && !geminiApiKey) {
537
+ warnings.push('DISCOCLAW_VOICE_ENABLED=1 but GEMINI_API_KEY is not set; Gemini Live voice will fail at runtime.');
538
+ }
566
539
  const coldStorageEnabled = parseBoolean(env, 'DISCOCLAW_COLD_STORAGE_ENABLED', false);
567
540
  const coldStorageApiKey = parseTrimmedString(env, 'COLD_STORAGE_API_KEY') ?? openaiApiKey;
568
541
  const coldStorageProvider = parseEnum(env, 'COLD_STORAGE_PROVIDER', ['openai', 'openai-compat'], 'openai');
@@ -741,15 +714,9 @@ export function parseConfig(env) {
741
714
  voiceAutoJoin,
742
715
  voiceModel,
743
716
  voiceSystemPrompt,
744
- voiceSttProvider,
745
- voiceTtsProvider,
717
+ geminiSessionRotationMs,
746
718
  voiceHomeChannel,
747
719
  voiceLogChannel,
748
- deepgramApiKey,
749
- deepgramSttModel,
750
- deepgramTtsVoice,
751
- deepgramTtsSpeed,
752
- cartesiaApiKey,
753
720
  forgeDrafterRuntime,
754
721
  forgeAuditorRuntime,
755
722
  openrouterApiKey,
@@ -1021,41 +1021,9 @@ describe('parseConfig', () => {
1021
1021
  expect(config.voiceEnabled).toBe(false);
1022
1022
  });
1023
1023
  it('parses DISCOCLAW_VOICE_ENABLED=1 as true', () => {
1024
- const { config } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1', DEEPGRAM_API_KEY: 'dg-key', CARTESIA_API_KEY: 'ca-key' }));
1024
+ const { config } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1', GEMINI_API_KEY: 'gm-key' }));
1025
1025
  expect(config.voiceEnabled).toBe(true);
1026
1026
  });
1027
- it('defaults voiceSttProvider to "deepgram"', () => {
1028
- const { config } = parseConfig(env());
1029
- expect(config.voiceSttProvider).toBe('deepgram');
1030
- });
1031
- it('parses DISCOCLAW_STT_PROVIDER=whisper', () => {
1032
- const { config } = parseConfig(env({ DISCOCLAW_STT_PROVIDER: 'whisper' }));
1033
- expect(config.voiceSttProvider).toBe('whisper');
1034
- });
1035
- it('parses STT provider case-insensitively', () => {
1036
- const { config } = parseConfig(env({ DISCOCLAW_STT_PROVIDER: 'Deepgram' }));
1037
- expect(config.voiceSttProvider).toBe('deepgram');
1038
- });
1039
- it('throws on invalid STT provider', () => {
1040
- expect(() => parseConfig(env({ DISCOCLAW_STT_PROVIDER: 'invalid' })))
1041
- .toThrow(/DISCOCLAW_STT_PROVIDER must be one of deepgram\|whisper/);
1042
- });
1043
- it('defaults voiceTtsProvider to "cartesia"', () => {
1044
- const { config } = parseConfig(env());
1045
- expect(config.voiceTtsProvider).toBe('cartesia');
1046
- });
1047
- it('parses DISCOCLAW_TTS_PROVIDER=kokoro', () => {
1048
- const { config } = parseConfig(env({ DISCOCLAW_TTS_PROVIDER: 'kokoro' }));
1049
- expect(config.voiceTtsProvider).toBe('kokoro');
1050
- });
1051
- it('parses TTS provider case-insensitively', () => {
1052
- const { config } = parseConfig(env({ DISCOCLAW_TTS_PROVIDER: 'Cartesia' }));
1053
- expect(config.voiceTtsProvider).toBe('cartesia');
1054
- });
1055
- it('throws on invalid TTS provider', () => {
1056
- expect(() => parseConfig(env({ DISCOCLAW_TTS_PROVIDER: 'elevenlabs' })))
1057
- .toThrow(/DISCOCLAW_TTS_PROVIDER must be one of cartesia\|deepgram\|kokoro\|openai/);
1058
- });
1059
1027
  it('parses DISCOCLAW_VOICE_HOME_CHANNEL when set', () => {
1060
1028
  const { config } = parseConfig(env({ DISCOCLAW_VOICE_HOME_CHANNEL: 'voice-log' }));
1061
1029
  expect(config.voiceHomeChannel).toBe('voice-log');
@@ -1085,33 +1053,13 @@ describe('parseConfig', () => {
1085
1053
  const { config } = parseConfig(env());
1086
1054
  expect(config.voiceLogChannel).toBeUndefined();
1087
1055
  });
1088
- it('parses DEEPGRAM_API_KEY when set', () => {
1089
- const { config } = parseConfig(env({ DEEPGRAM_API_KEY: 'dg-key' }));
1090
- expect(config.deepgramApiKey).toBe('dg-key');
1056
+ it('warns when voice enabled but GEMINI_API_KEY is missing', () => {
1057
+ const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1' }));
1058
+ expect(warnings.some((w) => w.includes('GEMINI_API_KEY'))).toBe(true);
1091
1059
  });
1092
- it('returns undefined for deepgramApiKey when unset', () => {
1093
- const { config } = parseConfig(env());
1094
- expect(config.deepgramApiKey).toBeUndefined();
1095
- });
1096
- it('parses CARTESIA_API_KEY when set', () => {
1097
- const { config } = parseConfig(env({ CARTESIA_API_KEY: 'ca-key' }));
1098
- expect(config.cartesiaApiKey).toBe('ca-key');
1099
- });
1100
- it('returns undefined for cartesiaApiKey when unset', () => {
1101
- const { config } = parseConfig(env());
1102
- expect(config.cartesiaApiKey).toBeUndefined();
1103
- });
1104
- it('warns when voice enabled with deepgram STT but DEEPGRAM_API_KEY missing', () => {
1105
- const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1', CARTESIA_API_KEY: 'ca-key' }));
1106
- expect(warnings.some((w) => w.includes('DEEPGRAM_API_KEY'))).toBe(true);
1107
- });
1108
- it('does not warn about DEEPGRAM_API_KEY when voice disabled', () => {
1060
+ it('does not warn about GEMINI_API_KEY when voice disabled', () => {
1109
1061
  const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '0' }));
1110
- expect(warnings.some((w) => w.includes('DEEPGRAM_API_KEY'))).toBe(false);
1111
- });
1112
- it('does not warn about DEEPGRAM_API_KEY when STT provider is whisper', () => {
1113
- const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1', DISCOCLAW_STT_PROVIDER: 'whisper', CARTESIA_API_KEY: 'ca-key' }));
1114
- expect(warnings.some((w) => w.includes('DEEPGRAM_API_KEY'))).toBe(false);
1062
+ expect(warnings.some((w) => w.includes('GEMINI_API_KEY'))).toBe(false);
1115
1063
  });
1116
1064
  // --- voiceAutoJoin ---
1117
1065
  it('defaults voiceAutoJoin to false', () => {
@@ -1130,24 +1078,12 @@ describe('parseConfig', () => {
1130
1078
  const { config } = parseConfig(env({ DISCOCLAW_VOICE_AUTO_JOIN: '0' }));
1131
1079
  expect(config.voiceAutoJoin).toBe(false);
1132
1080
  });
1133
- it('warns when voice enabled with cartesia TTS but CARTESIA_API_KEY missing', () => {
1134
- const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1', DEEPGRAM_API_KEY: 'dg-key' }));
1135
- expect(warnings.some((w) => w.includes('CARTESIA_API_KEY'))).toBe(true);
1136
- });
1137
- it('does not warn about CARTESIA_API_KEY when voice disabled', () => {
1138
- const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '0' }));
1139
- expect(warnings.some((w) => w.includes('CARTESIA_API_KEY'))).toBe(false);
1140
- });
1141
- it('does not warn about CARTESIA_API_KEY when TTS provider is kokoro', () => {
1142
- const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1', DISCOCLAW_TTS_PROVIDER: 'kokoro', DEEPGRAM_API_KEY: 'dg-key' }));
1143
- expect(warnings.some((w) => w.includes('CARTESIA_API_KEY'))).toBe(false);
1144
- });
1145
1081
  it('warns when voice enabled but DISCOCLAW_VOICE_HOME_CHANNEL is unset', () => {
1146
- const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1', DEEPGRAM_API_KEY: 'dg-key', CARTESIA_API_KEY: 'ca-key' }));
1082
+ const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1', GEMINI_API_KEY: 'gm-key' }));
1147
1083
  expect(warnings.some((w) => w.includes('DISCOCLAW_VOICE_HOME_CHANNEL'))).toBe(true);
1148
1084
  });
1149
1085
  it('does not warn about DISCOCLAW_VOICE_HOME_CHANNEL when both voice and home channel are set', () => {
1150
- const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1', DISCOCLAW_VOICE_HOME_CHANNEL: '1000000000000000003', DEEPGRAM_API_KEY: 'dg-key', CARTESIA_API_KEY: 'ca-key' }));
1086
+ const { warnings } = parseConfig(env({ DISCOCLAW_VOICE_ENABLED: '1', DISCOCLAW_VOICE_HOME_CHANNEL: '1000000000000000003', GEMINI_API_KEY: 'gm-key' }));
1151
1087
  expect(warnings.some((w) => w.includes('DISCOCLAW_VOICE_HOME_CHANNEL'))).toBe(false);
1152
1088
  });
1153
1089
  // --- cold storage ---