npm - discoclaw - Versions diffs - 0.3.0 → 0.4.0 - Mend

discoclaw 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

package/.context/README.md +2 -0
package/.context/architecture.md +1 -0
package/.context/memory.md +26 -1
package/.context/project.md +2 -2
package/.context/voice.md +87 -0
package/.env.example +30 -0
package/.env.example.full +76 -5
package/README.md +89 -0
package/dist/cli/init-wizard.js +28 -0
package/dist/cli/init-wizard.test.js +66 -0
package/dist/config.js +68 -2
package/dist/config.test.js +153 -0
package/dist/cron/cron-prompt.js +72 -0
package/dist/cron/cron-prompt.test.js +244 -0
package/dist/cron/executor.js +66 -14
package/dist/cron/executor.test.js +116 -0
package/dist/cron/json-router.js +110 -0
package/dist/cron/json-router.test.js +303 -0
package/dist/cron/run-stats.js +14 -2
package/dist/cron/run-stats.test.js +70 -5
package/dist/discoclaw-recipe-format.test.js +87 -53
package/dist/discord/action-categories.js +2 -0
package/dist/discord/actions-config.js +130 -17
package/dist/discord/actions-config.test.js +47 -7
package/dist/discord/actions-crons.js +89 -1
package/dist/discord/actions-crons.test.js +110 -0
package/dist/discord/actions-forge.js +7 -3
package/dist/discord/actions-forge.test.js +36 -1
package/dist/discord/actions-imagegen.js +58 -5
package/dist/discord/actions-imagegen.test.js +151 -0
package/dist/discord/actions-voice.js +130 -0
package/dist/discord/actions-voice.test.js +283 -0
package/dist/discord/actions.js +17 -1
package/dist/discord/actions.test.js +50 -0
package/dist/discord/channel-context.js +31 -0
package/dist/discord/deferred-runner.js +2 -0
package/dist/discord/durable-consolidation.js +130 -0
package/dist/discord/durable-consolidation.test.js +367 -0
package/dist/discord/durable-memory.js +3 -0
package/dist/discord/file-download.js +4 -6
package/dist/discord/file-download.test.js +5 -3
package/dist/discord/forge-plan-registry.js +20 -0
package/dist/discord/forge-plan-registry.test.js +36 -1
package/dist/discord/help-command.js +2 -0
package/dist/discord/message-coordinator.js +81 -1
package/dist/discord/models-command.js +26 -4
package/dist/discord/output-utils.js +1 -1
package/dist/discord/plan-manager.test.js +6 -1
package/dist/discord/prompt-common.js +20 -0
package/dist/discord/prompt-common.test.js +66 -0
package/dist/discord/reaction-handler.js +2 -0
package/dist/discord/runtime-utils.js +61 -24
package/dist/discord/runtime-utils.test.js +20 -1
package/dist/discord/secret-commands.js +155 -0
package/dist/discord/secret-commands.test.js +264 -0
package/dist/discord/system-bootstrap.js +18 -0
package/dist/discord/system-bootstrap.test.js +33 -0
package/dist/discord/tool-aware-queue.js +1 -1
package/dist/discord/tool-aware-queue.test.js +13 -13
package/dist/discord/user-errors.js +4 -0
package/dist/discord/user-errors.test.js +10 -0
package/dist/discord/user-turn-to-durable.js +40 -4
package/dist/discord/user-turn-to-durable.test.js +134 -1
package/dist/discord/voice-command.js +76 -0
package/dist/discord/voice-command.test.js +285 -0
package/dist/discord/voice-status-command.js +56 -0
package/dist/discord/voice-status-command.test.js +149 -0
package/dist/discord/youtube-transcript.js +101 -0
package/dist/discord/youtube-transcript.test.js +196 -0
package/dist/discord.js +2 -1
package/dist/discord.render.test.js +18 -6
package/dist/index.js +376 -3
package/dist/index.post-connect.js +2 -0
package/dist/mcp-detect.js +15 -0
package/dist/mcp-detect.test.js +18 -1
package/dist/pipeline/engine.js +53 -16
package/dist/runtime/claude-code-cli.test.js +40 -0
package/dist/runtime/loop-detector.js +154 -0
package/dist/runtime/loop-detector.test.js +163 -0
package/dist/runtime/model-tiers.js +1 -1
package/dist/runtime/model-tiers.test.js +4 -4
package/dist/runtime/openai-tool-exec.js +2 -1
package/dist/runtime/openai-tool-exec.test.js +2 -1
package/dist/runtime/strategies/claude-strategy.js +28 -1
package/dist/runtime/tool-tiers.js +106 -0
package/dist/runtime/tool-tiers.test.js +120 -0
package/dist/runtime-overrides.js +87 -0
package/dist/runtime-overrides.test.js +239 -0
package/dist/sanitize-external.js +51 -0
package/dist/sanitize-external.test.js +95 -0
package/dist/voice/audio-pipeline.js +205 -0
package/dist/voice/audio-pipeline.test.js +619 -0
package/dist/voice/audio-receiver.js +165 -0
package/dist/voice/audio-receiver.test.js +497 -0
package/dist/voice/connection-manager.js +138 -0
package/dist/voice/connection-manager.test.js +317 -0
package/dist/voice/opus.js +37 -0
package/dist/voice/opus.test.js +81 -0
package/dist/voice/presence-handler.js +123 -0
package/dist/voice/presence-handler.test.js +545 -0
package/dist/voice/stt-deepgram.js +138 -0
package/dist/voice/stt-deepgram.test.js +202 -0
package/dist/voice/stt-factory.js +39 -0
package/dist/voice/stt-factory.test.js +45 -0
package/dist/voice/stt-openai.js +154 -0
package/dist/voice/stt-openai.test.js +281 -0
package/dist/voice/transcript-mirror.js +130 -0
package/dist/voice/transcript-mirror.test.js +257 -0
package/dist/voice/tts-cartesia.js +169 -0
package/dist/voice/tts-cartesia.test.js +228 -0
package/dist/voice/tts-deepgram.js +76 -0
package/dist/voice/tts-deepgram.test.js +197 -0
package/dist/voice/tts-factory.js +48 -0
package/dist/voice/tts-factory.test.js +53 -0
package/dist/voice/tts-openai.js +69 -0
package/dist/voice/tts-openai.test.js +138 -0
package/dist/voice/types.js +7 -0
package/dist/voice/types.test.js +84 -0
package/dist/voice/voice-action-flags.js +41 -0
package/dist/voice/voice-action-flags.test.js +80 -0
package/dist/voice/voice-responder.js +192 -0
package/dist/voice/voice-responder.test.js +468 -0
package/dist/voice/voice-style-prompt.js +13 -0
package/dist/voice/voice-style-prompt.test.js +20 -0
package/dist/workspace-bootstrap.test.js +1 -1
package/package.json +17 -3
package/templates/workspace/AGENTS.md +1 -2

package/.context/README.md CHANGED Viewed

@@ -18,6 +18,7 @@ Core instructions live in `CLAUDE.md` at the repo root.
 | **Task tracking** | `tasks.md` |
 | **Architecture / system overview** | `architecture.md` |
 | **Tool capabilities / browser automation** | `tools.md` |
+| **Voice system (STT/TTS, audio pipeline, actions)** | `voice.md` |
 | **Forge/plan standing constraints** | `project.md` *(auto-loaded by forge)* |
 | **Plan & Forge commands** | `plan-and-forge.md` *(in docs/, not .context/)* |
@@ -38,5 +39,6 @@ Core instructions live in `CLAUDE.md` at the repo root.
 - **architecture.md** — System overview, data flow, directory layout, key concepts
 - **bot-setup.md** — One-time bot creation and invite guide
 - **tools.md** — Available tools: browser automation (agent-browser), escalation ladder, CDP connect, security guardrails
+- **voice.md** — Voice subsystem: module map, audio data flow, key patterns (barge-in, allowlist gating), wiring sequence, dependencies, config reference
 - **project.md** — Standing constraints auto-loaded by forge drafter and auditor
 - **docs/plan-and-forge.md** — Canonical reference for `!plan` and `!forge` commands (lives in `docs/`, not `.context/` — human/developer reference, not auto-loaded into agent context)

package/.context/architecture.md CHANGED Viewed

@@ -25,6 +25,7 @@ Discord message
 | `src/discord/` | Discord subsystems: actions, allowlist, channel context, memory, output |
 | `src/runtime/` | Runtime adapters (Claude CLI), concurrency, process pool |
 | `src/tasks/` | In-process task data model + store + migration helpers |
+| `src/voice/` | Voice chat: STT/TTS providers, audio pipeline, connection manager |
 | `src/cron/` | Cron scheduler, executor, forum sync, run stats |
 | `src/observability/` | Metrics registry |
 | `src/sessions.ts` | Session manager (maps session keys to runtime session IDs) |

package/.context/memory.md CHANGED Viewed

@@ -51,6 +51,19 @@ Bot:   Given your preference for Rust in systems work, I'd lean that way —
        especially since this is a low-level networking tool.
 ```
+#### Consolidation
+When the active item count for a user crosses a threshold (`DISCOCLAW_DURABLE_CONSOLIDATION_THRESHOLD`, default `100`), consolidation can be triggered to prune and merge the list. A single `fast`-tier model call receives all active items and is asked to return a revised list — removing exact duplicates, merging near-duplicates, dropping clearly stale items, and preserving everything that is still plausibly useful. The model must not invent new facts or change the meaning of existing ones.
+The revised list is applied atomically: items absent from the model's output are deprecated via `deprecateItems()`; new or rewritten items are written via `addItem()`. Items present verbatim in the output are left untouched (no unnecessary writes).
+**Safety guards:**
+- The revised list must contain at least 50% of the original count. If the model returns fewer items than that floor, consolidation is aborted and a warning is logged — no writes occur.
+- Consolidation runs at most once per session per user, regardless of how many writes happen. This prevents runaway API calls.
+- All mutations flow through the existing durable write queue, so consolidation is serialized with concurrent `!memory remember` / auto-extraction writes.
+**Config:** `DISCOCLAW_DURABLE_CONSOLIDATION_THRESHOLD=100` sets the item count at which consolidation becomes eligible. `DISCOCLAW_DURABLE_CONSOLIDATION_MODEL=fast` selects the model tier used for the consolidation call.
 ### 3. Memory Commands — user-facing control surface
 `src/discord/memory-commands.ts`
@@ -103,7 +116,16 @@ Bot:   Cool, platform work! What's your first project?
   [tool]  Works with Kubernetes and Terraform
 ```
-**Config:** `DISCOCLAW_SUMMARY_TO_DURABLE_ENABLED=false` to disable.
+**Supersession:** When extraction runs, active durable items for the user are appended
+to the prompt. The model may return a `supersedes` field on any extracted item, containing
+a substring that uniquely identifies the old item's text. The old item is then deprecated
+atomically before the new item is written — no additional API call required. This prevents
+stale preferences from accumulating (e.g. "I prefer Vim" is deprecated when "I switched to
+Neovim" is later extracted).
+**Config:** `DISCOCLAW_SUMMARY_TO_DURABLE_ENABLED=false` to disable extraction entirely.
+`DISCOCLAW_DURABLE_SUPERSESSION_SHADOW=1` to observe what the model would supersede without
+actually deprecating (shadow mode logs matches to stdout). Live supersession is on by default.
 ### 5. Short-Term Memory — cross-channel awareness
@@ -248,6 +270,9 @@ Short-term entries also store `channelId` alongside the existing `channelName`.
 | `DISCOCLAW_DURABLE_MAX_ITEMS` | `200` | Durable memory |
 | `DISCOCLAW_MEMORY_COMMANDS_ENABLED` | `true` | Memory commands |
 | `DISCOCLAW_SUMMARY_TO_DURABLE_ENABLED` | `true` | Auto-extraction |
+| `DISCOCLAW_DURABLE_SUPERSESSION_SHADOW` | `false` | Auto-extraction |
+| `DISCOCLAW_DURABLE_CONSOLIDATION_THRESHOLD` | `100` | Durable memory |
+| `DISCOCLAW_DURABLE_CONSOLIDATION_MODEL` | `fast` | Durable memory |
 | `DISCOCLAW_SHORTTERM_MEMORY_ENABLED` | `true` | Short-term memory |
 | `DISCOCLAW_SHORTTERM_MAX_ENTRIES` | `20` | Short-term memory |
 | `DISCOCLAW_SHORTTERM_MAX_AGE_HOURS` | `6` | Short-term memory |

package/.context/project.md CHANGED Viewed

@@ -26,7 +26,7 @@ Standing constraints for planning and auditing. These apply to all forge/plan op
 Plans that grow too large fail — they blow past token limits, cause audit/revise loops to diverge, and produce specs no human will review. These constraints prevent that.
 ### Scope limits
-- A plan should target **3–5 files** max. If a feature touches more, decompose into multiple sequential plans before drafting.
+- A plan should target **3–5 source modules** max (not counting co-located test files, lockfiles, or other generated artifacts). If a feature touches more source modules than that, decompose into multiple sequential plans before drafting.
 ### Size limits
 - Plan content (excluding the Audit Log section) should not exceed **200 lines**. If the draft exceeds this, the scope is too large.
@@ -40,4 +40,4 @@ Plans that grow too large fail — they blow past token limits, cause audit/revi
 ### Auditor guidance
 - Do NOT flag "underspecified implementation details" as medium/high. The plan describes intent and scope — the implementing agent fills in the details.
 - DO flag: missing scope items, incorrect assumptions about existing code, safety/correctness issues, missing error handling for external boundaries.
-- If a plan is too large or touches too many files, flag that as **high severity** with recommendation to split.
+- If a plan is too large or touches too many source modules, flag that as **high severity** with recommendation to split. Co-located test files, lockfile changes from dependency additions, and doc/inventory updates do not count toward the scope cap.

package/.context/voice.md ADDED Viewed

@@ -0,0 +1,87 @@
+# Voice System
+Real-time voice chat: STT transcription, AI response generation, TTS synthesis, and Discord voice playback. For operator setup, see `docs/voice.md`.
+## Dependencies
+Three npm packages power the Discord voice integration:
+- **`@discordjs/voice`** — voice connection management, audio player/receiver, gateway adapter. Used in `connection-manager.ts`, `audio-receiver.ts`, `audio-pipeline.ts`, `voice-responder.ts`.
+- **`@discordjs/opus`** — native Opus codec binding (C++ addon, requires build tools). Wrapped by `opus.ts` to decode Discord's 48 kHz stereo Opus packets to PCM s16le.
+- **`sodium-native`** — encryption for Discord voice (auto-detected by discord.js).
+## Module Map
+| Module | Role |
+|--------|------|
+| `src/voice/types.ts` | Core interfaces: `VoiceConfig`, `AudioFrame`, `SttProvider`, `TtsProvider`, `TranscriptionResult` |
+| `src/voice/connection-manager.ts` | Per-guild `VoiceConnection` lifecycle (join/leave/mute/deafen), reconnect retries (default 5), error-to-destroy safety net |
+| `src/voice/audio-pipeline.ts` | Per-guild orchestrator — auto-starts STT + receiver + responder on connection Ready, auto-stops on Destroyed |
+| `src/voice/audio-receiver.ts` | Subscribes to allowlisted users' Opus streams, decodes via `OpusDecoderFactory`, downsamples 48→16 kHz mono, feeds `SttProvider` |
+| `src/voice/opus.ts` | `OpusDecoderFactory` implementation wrapping `@discordjs/opus` |
+| `src/voice/voice-responder.ts` | AI invoke → TTS synthesis → `AudioPlayer` playback; generation-based cancellation for barge-in |
+| `src/voice/stt-deepgram.ts` | Deepgram Nova-3 streaming STT via WebSocket |
+| `src/voice/tts-cartesia.ts` | Cartesia Sonic-3 TTS via WebSocket, outputs PCM s16le at 24 kHz |
+| `src/voice/tts-deepgram.ts` | Deepgram Aura TTS via REST, outputs PCM s16le at 24 kHz |
+| `src/voice/stt-factory.ts` | STT provider factory (deepgram or whisper stub) |
+| `src/voice/tts-factory.ts` | TTS provider factory (cartesia, deepgram, openai, or kokoro stub) |
+| `src/voice/presence-handler.ts` | Auto-join/leave on `voiceStateUpdate` (allowlisted users only) |
+| `src/voice/transcript-mirror.ts` | Posts user transcriptions and bot responses to a text channel |
+| `src/voice/voice-action-flags.ts` | Restricted action subset for voice invocations (messaging + tasks + memory only) |
+| `src/discord/actions-voice.ts` | Discord action types: `voiceJoin`, `voiceLeave`, `voiceStatus`, `voiceMute`, `voiceDeafen` |
+## Audio Data Flow
+```
+User speaks in Discord voice channel
+  → @discordjs/voice receiver emits Opus packets per user
+    → AudioReceiver: allowlist gate → OpusDecoder (48 kHz stereo PCM)
+      → downsample to 16 kHz mono
+        → SttProvider.feedAudio() (Deepgram WebSocket)
+          → TranscriptionResult (final transcript)
+            → VoiceResponder.handleTranscription()
+              → InvokeAiFn (AI runtime) → response text
+                → TtsProvider.synthesize() (Cartesia WebSocket → 24 kHz mono PCM)
+                  → upsampleToDiscord (48 kHz stereo)
+                    → AudioPlayer → Discord voice connection
+```
+## Key Patterns
+- **Allowlist gating** — `AudioReceiver` only subscribes to users in `DISCORD_ALLOW_USER_IDS`. Empty allowlist = ignore everyone (fail-closed).
+- **Dual-flag voice actions** — Voice action execution requires both `VOICE_ENABLED` and `DISCORD_ACTIONS_VOICE`. The `buildVoiceActionFlags()` function intersects a voice-specific allowlist (messaging, tasks, memory) with env config; all other action categories are hard-disabled.
+- **Generation-based cancellation** — `VoiceResponder` increments a generation counter on each new transcription. If a newer transcription arrives mid-pipeline, the older one is silently abandoned.
+- **Barge-in** — Gated on a non-empty STT transcription result, not the raw VAD `speaking.start` event. Echo from the bot's own TTS leaking through the user's mic produces empty transcriptions and is ignored. Only when `VoiceResponder.handleTranscription()` receives a non-empty transcript while the player is active does it stop playback and advance the generation counter. This eliminates false positives from echo without relying on a static grace-period timeout.
+- **Re-entrancy guard** — `AudioPipelineManager.startPipeline` uses a `starting` set because `VoiceConnection.subscribe()` synchronously fires a Ready state change.
+- **Error containment** — `VoiceConnectionManager` catches connection errors and destroys the connection to prevent process crashes (e.g. DAVE handshake failures).
+- **Deepgram TTS 2000-char limit** — Deepgram Aura REST TTS returns HTTP 413 (silent failure) for inputs exceeding ~2000 characters. `tts-deepgram.ts` truncates the input to 2000 chars before sending to prevent silent audio dropouts. If the AI response is unexpectedly long (e.g. from a missing `VOICE_STYLE_INSTRUCTION`), the user will still hear a truncated response rather than silence.
+## Wiring (`src/index.ts`)
+When `voiceEnabled=true`, the post-connect block in `src/index.ts` initializes the voice subsystem in order:
+1. **`TranscriptMirror.resolve()`** — resolves the voice log channel used for text mirroring (may be `null` if unconfigured).
+2. **`voiceInvokeAi`** closure — builds the AI invocation function that prepends channel context, PA prompt, durable memory, voice system prompt, and action instructions to user speech. Supports up to 1 follow-up round for action results. `runtimeTimeoutMs` is applied to each invocation as a safety net against runaway queries.
+3. **`AudioPipelineManager`** — instantiated with voice config, allowlist, decoder factory, `voiceInvokeAi`, transcript mirror, and a transcription logging callback.
+4. **`VoiceConnectionManager`** — instantiated with `onReady` → `audioPipeline.startPipeline()` and `onDestroyed` → `audioPipeline.stopPipeline()` callbacks.
+5. **`botParams.voiceCtx`** — set when `DISCORD_ACTIONS_VOICE` is enabled, exposing `voiceManager` to Discord action handlers (`voiceJoin`, `voiceLeave`, etc.).
+6. **`VoicePresenceHandler`** — created and registered on the Discord client only when `VOICE_AUTO_JOIN` is enabled.
+## Config (env vars)
+| Variable | Default | Purpose |
+|----------|---------|---------|
+| `DISCOCLAW_VOICE_ENABLED` | `0` | Master switch |
+| `DISCOCLAW_DISCORD_ACTIONS_VOICE` | `0` | Enable voice action types |
+| `DISCOCLAW_VOICE_AUTO_JOIN` | `0` | Auto-join when allowlisted user enters |
+| `DISCOCLAW_STT_PROVIDER` | `deepgram` | STT backend |
+| `DISCOCLAW_TTS_PROVIDER` | `cartesia` | TTS backend (`cartesia`, `deepgram`, `openai`, `kokoro`) |
+| `DISCOCLAW_VOICE_HOME_CHANNEL` | — | Voice audio channel name/ID used for prompt context (not transcript mirroring) |
+| `DISCOCLAW_VOICE_LOG_CHANNEL` | — | Text channel name/ID where `TranscriptMirror` posts user transcriptions and bot responses; falls back to bootstrap-provided `voiceLogChannelId` if unset |
+| `DISCOCLAW_VOICE_MODEL` | `capable` | AI model tier for voice responses |
+| `DISCOCLAW_VOICE_SYSTEM_PROMPT` | — | Custom system prompt for voice invocations (max 4000 chars) |
+| `DEEPGRAM_API_KEY` | — | Required for deepgram STT and TTS |
+| `DEEPGRAM_STT_MODEL` | `nova-3-conversationalai` | Deepgram STT model name |
+| `DEEPGRAM_TTS_VOICE` | `aura-2-asteria-en` | Deepgram TTS voice name |
+| `CARTESIA_API_KEY` | — | Required for cartesia TTS |
+| *(built-in)* | — | Telegraphic style instruction hardcoded into every voice AI invocation — front-loads the answer, strips preambles/markdown/filler, keeps responses short for TTS latency. Not an env var; not overridable by `DISCOCLAW_VOICE_SYSTEM_PROMPT`. |

package/.env.example CHANGED Viewed

@@ -32,6 +32,11 @@ DISCORD_ALLOW_USER_IDS=
 # connect and persisted to system-scaffold.json. Only set this to override the auto-created channel.
 #DISCOCLAW_CRON_FORUM=
+# Default model for cron job execution: fast | capable (or concrete model names).
+# Defaults to capable (resolves to Sonnet on Claude Code) — avoids using Opus on routine cron work.
+# Override at runtime via `!models set cron-exec <model>`.
+#DISCOCLAW_CRON_EXEC_MODEL=capable
 # ----------------------------------------------------------
 # CORE — most users will want to review these
 # ----------------------------------------------------------
@@ -95,6 +100,31 @@ DISCORD_GUILD_ID=
 # auto-selected: systemctl --user on Linux, launchctl on macOS.
 #DC_RESTART_CMD=
+# ----------------------------------------------------------
+# Voice — configure via `pnpm setup` or `discoclaw init`
+# ----------------------------------------------------------
+# Run `pnpm setup` or `discoclaw init` to enable voice interactively,
+# or set these vars manually to enable voice chat (STT/TTS via Deepgram).
+#DISCOCLAW_VOICE_ENABLED=0
+# Text channel used for voice prompt context and actions (e.g. posting action results,
+# reading pinned notes). Required for full voice functionality when voice is enabled.
+#DISCOCLAW_VOICE_HOME_CHANNEL= # e.g. "voice"
+# Text channel for posting voice transcripts. Optional — auto-discovered via bootstrap
+# (the bot creates a "voice-log" text channel and stores its ID in system-scaffold.json).
+# Only set this to override the auto-discovered channel.
+#DISCOCLAW_VOICE_LOG_CHANNEL=
+#DEEPGRAM_API_KEY=
+# ----------------------------------------------------------
+# Secret management via Discord DM
+# ----------------------------------------------------------
+# Use !secret in a DM to the bot to add or update .env entries without
+# restarting or editing files directly. Values are never echoed back.
+#   !secret set KEY=value   — add/update an entry
+#   !secret unset KEY       — remove an entry
+#   !secret list            — list key names (values hidden)
+# The bot must be restarted (!restart) for changes to take effect.
 # ----------------------------------------------------------
 # For all ~90 options (subsystems, actions, memory, identity,
 # observability, advanced/debug), see .env.example.full

package/.env.example.full CHANGED Viewed

@@ -64,9 +64,9 @@ DISCORD_ALLOW_USER_IDS=
 # Unset = use the built-in default shown in the comments.
 # Concrete model names (e.g. sonnet, gpt-4o-mini) are passed through unchanged.
 #
-# Claude Code adapter (default: fast=haiku, capable=opus):
+# Claude Code adapter (default: fast=haiku, capable=sonnet):
 #DISCOCLAW_TIER_CLAUDE_CODE_FAST=haiku
-#DISCOCLAW_TIER_CLAUDE_CODE_CAPABLE=opus
+#DISCOCLAW_TIER_CLAUDE_CODE_CAPABLE=sonnet
 #
 # Gemini CLI adapter (default: fast=gemini-2.5-flash, capable=gemini-2.5-pro):
 #DISCOCLAW_TIER_GEMINI_FAST=gemini-2.5-flash
@@ -79,6 +79,12 @@ DISCORD_ALLOW_USER_IDS=
 # Codex CLI adapter (default: adapter-default for both tiers):
 #DISCOCLAW_TIER_CODEX_FAST=
 #DISCOCLAW_TIER_CODEX_CAPABLE=
+#
+# Tool-tier map — override which tool tier a model resolves to.
+# Comma-separated model=tier pairs. Tiers: basic, standard, full.
+# Format: haiku=basic,sonnet=standard,opus=full
+# Unset = built-in pattern matching (haiku/flash→basic, sonnet→standard, opus/unknown→full).
+#DISCOCLAW_TOOL_TIER_MAP=
 # Output format for the Claude CLI. stream-json gives smoother streaming.
 #CLAUDE_OUTPUT_FORMAT=stream-json
@@ -107,6 +113,11 @@ DISCORD_ALLOW_USER_IDS=
 # Forum channel ID is auto-created on first connect (see AUTO-DETECTED above).
 # Model tier for cron execution: fast | capable (concrete names accepted as passthrough).
 #DISCOCLAW_CRON_MODEL=fast
+# Default model tier for cron job execution (fast | capable, concrete names accepted as passthrough).
+# Defaults to capable (resolves to Sonnet on Claude Code) — avoids using Opus on routine cron work.
+# Per-job overrides and AI-classified model still win when set. Override at runtime via
+# `!models set cron-exec <model>`.
+#DISCOCLAW_CRON_EXEC_MODEL=capable
 # Enable cron Discord actions (CRUD via Discord action blocks).
 #DISCOCLAW_DISCORD_ACTIONS_CRONS=1
 # Persistent stats directory (run counts, last run time, status).
@@ -232,6 +243,9 @@ DISCOCLAW_DISCORD_ACTIONS_DEFER=1
 #DISCOCLAW_DURABLE_MEMORY_ENABLED=1
 #DISCOCLAW_DURABLE_INJECT_MAX_CHARS=2000
 #DISCOCLAW_DURABLE_MAX_ITEMS=200
+# Shadow mode for durable memory supersession: log supersession matches without actually deprecating.
+# Set to 1 to observe what the model would supersede before enabling live deprecation.
+#DISCOCLAW_DURABLE_SUPERSESSION_SHADOW=0
 #DISCOCLAW_MEMORY_COMMANDS_ENABLED=1
 # Override storage directory for durable memory.
 #DISCOCLAW_DURABLE_DATA_DIR=
@@ -246,6 +260,12 @@ DISCOCLAW_DISCORD_ACTIONS_DEFER=1
 # Auto-extract notable facts from user messages into durable memory.
 # Runs after rolling summary generation. Default on.
 #DISCOCLAW_SUMMARY_TO_DURABLE_ENABLED=1
+# Consolidate durable memory when active item count reaches this threshold.
+# Merges near-duplicates and removes stale items via a fast-tier model call.
+# Set to 0 to disable automatic consolidation (default: 50).
+#DISCOCLAW_MEMORY_CONSOLIDATION_THRESHOLD=50
+# Model tier or name used for consolidation (default: fast).
+#DISCOCLAW_MEMORY_CONSOLIDATION_MODEL=fast
 # Character budget for recent conversation history in prompts (0 = disabled).
 #DISCOCLAW_MESSAGE_HISTORY_BUDGET=3000
@@ -455,16 +475,67 @@ DISCOCLAW_DISCORD_ACTIONS_DEFER=1
 # Image generation
 # ----------------------------------------------------------
 # Master switch — enables the imagegen Discord action category (default: off).
-# When enabled, the AI can generate images via action blocks using OpenAI or Gemini Imagen.
+# When enabled, the AI can generate images via action blocks using OpenAI or Gemini.
 DISCOCLAW_DISCORD_ACTIONS_IMAGEGEN=0
-# API key for Gemini Imagen models (imagen-4.0-generate-001 and similar).
+# API key for Gemini image generation (Imagen and native Gemini models).
 # Leave unset to use OpenAI only.
 #IMAGEGEN_GEMINI_API_KEY=
 # Override the default image generation model. If unset, auto-detected:
 # only IMAGEGEN_GEMINI_API_KEY set → imagen-4.0-generate-001; otherwise → dall-e-3.
 # OpenAI models: dall-e-3, gpt-image-1
-# Gemini models: imagen-4.0-generate-001, imagen-4.0-fast-generate-001, imagen-4.0-ultra-generate-001
+# Gemini Imagen models: imagen-4.0-generate-001, imagen-4.0-fast-generate-001, imagen-4.0-ultra-generate-001
+# Gemini native models (text+image in one call): gemini-3.1-flash-image-preview, gemini-3-pro-image-preview
 #IMAGEGEN_DEFAULT_MODEL=
 # Note: OpenAI image generation reuses OPENAI_API_KEY (documented above in the
 # OpenAI-compatible HTTP adapter section). When DISCOCLAW_DISCORD_ACTIONS_IMAGEGEN=1,
 # at least one of OPENAI_API_KEY or IMAGEGEN_GEMINI_API_KEY must be set.
+# ----------------------------------------------------------
+# Voice (STT/TTS) — join voice channels, listen and respond
+# ----------------------------------------------------------
+# Master switch — enables voice channel interaction (default: off).
+# When enabled, the bot can join Discord voice channels, transcribe speech via STT,
+# and respond with synthesized speech via TTS.
+#DISCOCLAW_VOICE_ENABLED=0
+# Enable voice Discord action category (voiceJoin, voiceLeave, voiceStatus, voiceMute, voiceDeafen).
+# Requires DISCOCLAW_VOICE_ENABLED=1 to take effect (default: off).
+#DISCOCLAW_DISCORD_ACTIONS_VOICE=0
+# Auto-join voice channels when a non-bot user joins, and auto-leave when the last
+# non-bot user leaves. Starts/tears down the audio pipeline (STT receiver) automatically.
+# Requires DISCOCLAW_VOICE_ENABLED=1 (default: off).
+#DISCOCLAW_VOICE_AUTO_JOIN=0
+# Speech-to-text provider: deepgram (Deepgram Nova-3 API) or whisper (whisper.cpp local).
+# deepgram requires DEEPGRAM_API_KEY; whisper runs locally with no API key.
+#DISCOCLAW_STT_PROVIDER=deepgram
+# Text-to-speech provider: cartesia | deepgram | kokoro | openai.
+# cartesia requires CARTESIA_API_KEY; deepgram reuses DEEPGRAM_API_KEY;
+# openai requires OPENAI_API_KEY; kokoro runs locally with no API key.
+#DISCOCLAW_TTS_PROVIDER=cartesia
+# Voice audio channel name or ID — the channel the bot joins for voice interaction.
+# Used as the prompt context source (root policy, PA files, channel context, durable memory).
+# The old env var DISCOCLAW_VOICE_TRANSCRIPT_CHANNEL is still accepted as a fallback.
+# Leave unset to skip voice channel context in prompts.
+#DISCOCLAW_VOICE_HOME_CHANNEL=  # e.g. "voice" if using the default scaffold
+# Text channel name or ID for posting voice transcripts (user STT and bot TTS responses).
+# Optional — auto-discovered via bootstrap (the bot creates "voice-log" and stores its ID
+# in system-scaffold.json). Only set this to override the auto-discovered channel.
+# Leave unset to disable transcript mirroring.
+#DISCOCLAW_VOICE_LOG_CHANNEL=  # e.g. "voice-log" if using the default scaffold
+# Model for voice AI responses: tier (fast | capable) or concrete name (sonnet, opus, haiku).
+# Independent of RUNTIME_MODEL — allows tuning voice latency vs quality separately from chat.
+# Switchable at runtime via `modelSet voice <model>`.
+# Default: follows DISCOCLAW_FAST_MODEL (override here for voice-specific tuning).
+#DISCOCLAW_VOICE_MODEL=sonnet
+# Custom system prompt prepended to voice AI invocations. Max 4000 chars.
+# Use this to set a conversational tone, brevity instructions, or persona for voice responses.
+#DISCOCLAW_VOICE_SYSTEM_PROMPT=
+# API key for Deepgram Nova-3 STT. Required when DISCOCLAW_STT_PROVIDER=deepgram.
+#DEEPGRAM_API_KEY=
+# Deepgram STT model for voice transcription (default: nova-3-conversationalai).
+# See https://developers.deepgram.com/docs/models-languages-overview for available models.
+#DEEPGRAM_STT_MODEL=nova-3-conversationalai
+# Deepgram TTS voice for speech synthesis (default: aura-2-asteria-en).
+# See https://developers.deepgram.com/docs/tts-models for available voices.
+#DEEPGRAM_TTS_VOICE=aura-2-asteria-en
+# API key for Cartesia Sonic-3 TTS. Required when DISCOCLAW_TTS_PROVIDER=cartesia.
+#CARTESIA_API_KEY=

package/README.md CHANGED Viewed

@@ -32,6 +32,10 @@ Your assistant carries context across every conversation, channel, and restart.
 **Why Discord fits:** channels = context boundaries, DMs = private deep context, conversation history is the raw material.
+### YouTube transcripts
+When you share a YouTube link in a message, DiscoClaw automatically fetches the video's transcript and injects it into the AI's context. This lets the bot answer questions about video content, summarize talks, or reference specific points — without you needing to copy-paste anything. Up to 3 videos per message are processed, with a 15-second timeout per fetch. Transcripts are sanitized before injection to prevent prompt manipulation.
 ## Tasks — the bot tracks your work
 A lightweight in-process task store that syncs bidirectionally with Discord forum threads.
@@ -54,6 +58,22 @@ Recurring tasks defined as forum threads in plain language — no crontab, no se
 **Why Discord fits:** forum threads = job definitions, archive/unarchive = pause/resume, no separate scheduler UI needed.
+<!-- source-of-truth: docs/voice.md -->
+## Voice — the bot talks back
+DiscoClaw can join Discord voice channels for real-time conversation: listen via speech-to-text, think with the AI runtime, and speak the response via text-to-speech.
+- **STT** — Deepgram Nova-3 streaming transcription (WebSocket)
+- **TTS** — Cartesia Sonic-3 (WebSocket, 24 kHz PCM) or Deepgram Aura-2 speech synthesis, selected via `DISCOCLAW_TTS_PROVIDER`
+- **Barge-in** — interrupt the bot mid-sentence by speaking; playback stops immediately
+- **Auto-join** — optionally join/leave channels automatically when you enter or leave
+- **Transcript mirror** — voice conversations are mirrored to a text channel for persistence
+- **Voice actions** — the AI can execute a restricted subset of actions (messaging, tasks, memory) during voice sessions
+Voice is **off by default**. Enable with `DISCOCLAW_VOICE_ENABLED=1` plus API keys for your STT/TTS providers. Requires Node 22+ (for native WebSocket used by Cartesia TTS) and C++ build tools (for the `@discordjs/opus` native addon).
+Full setup guide: [docs/voice.md](docs/voice.md)
 ## How it works
 DiscoClaw orchestrates the flow between Discord and AI runtimes (Claude Code by default, with Gemini, OpenAI, Codex, and OpenRouter adapters available via `PRIMARY_RUNTIME`). The OpenAI-compatible and OpenRouter adapters support optional tool use (function calling) when `OPENAI_COMPAT_TOOLS_ENABLED=1` is set. It doesn't contain intelligence itself — it decides *when* to call the AI, *what context* to give it, and *what to do* with the output. When you send a message, the orchestrator:
@@ -64,12 +84,51 @@ DiscoClaw orchestrates the flow between Discord and AI runtimes (Claude Code by
 4. Streams the response back, chunked to fit Discord's message limits
 5. Parses and executes any Discord actions the assistant emitted
+### Message batching
+When multiple messages arrive while the bot is thinking (i.e., an AI invocation is already active for that session), they're automatically combined into a single prompt rather than queued individually. This means rapid follow-up messages are processed together, giving the bot full context in one shot. Commands (`!`-prefixed messages) bypass batching and are always processed individually.
 ### OpenRouter
 Set `PRIMARY_RUNTIME=openrouter` to route requests through [OpenRouter](https://openrouter.ai), which provides access to models from Anthropic, OpenAI, Google, and others via a single API key — useful if you want to switch models without managing multiple provider accounts.
 Required: `OPENROUTER_API_KEY`. Optional overrides: `OPENROUTER_BASE_URL` (default: `https://openrouter.ai/api/v1`) and `OPENROUTER_MODEL` (default: `anthropic/claude-sonnet-4`). See `.env.example` for the full reference.
+## Model Overrides
+The `!models` command lets you view and swap AI models per role at runtime — no restart needed, and changes persist across restarts.
+**Roles:** `chat`, `fast`, `forge-drafter`, `forge-auditor`, `summary`, `cron`, `cron-exec`, `voice`
+| Command | Description |
+|---------|-------------|
+| `!models` | Show current model assignments |
+| `!models set <role> <model>` | Change the model for a role |
+| `!models reset` | Revert all roles to env-var defaults |
+| `!models reset <role>` | Revert a specific role |
+**Examples:**
+- `!models set chat claude-sonnet-4` — use Sonnet for chat
+- `!models set chat openrouter` — switch chat to the OpenRouter runtime
+- `!models set cron-exec haiku` — run crons on a cheaper model
+- `!models set voice sonnet` — use a specific model for voice
+- `!models reset` — clear all overrides
+Setting the `chat` role to a runtime name (`openrouter`, `openai`, `gemini`, `codex`, `claude`) switches the active runtime adapter for that role.
+## Secret Management
+The `!secret` command lets you manage `.env` entries from Discord without touching the file directly. It works in DMs only — values are never echoed back.
+| Command | Description |
+|---------|-------------|
+| `!secret set KEY=value` | Add or update a `.env` entry |
+| `!secret unset KEY` | Remove a `.env` entry |
+| `!secret list` | List key names in `.env` (values hidden) |
+| `!secret help` | Show usage |
+Changes take effect after a restart (`!restart`). Writes are atomic — a partial write can't corrupt your `.env`.
 ## Customization
 ### Shareable integration recipes
@@ -106,6 +165,7 @@ When using the Claude runtime, you can connect external tool servers via MCP. Pl
 **Contributors (from source):**
 - Everything above, plus **pnpm** — enable via Corepack (`corepack enable`) or install separately
+<!-- source-of-truth: docs/discord-bot-setup.md -->
 ## Quick start
 ### Discord setup (private server + bot)
@@ -125,6 +185,35 @@ When using the Claude runtime, you can connect external tool servers via MCP. Pl
 Full step-by-step guide: [docs/discord-bot-setup.md](docs/discord-bot-setup.md)
+## Documentation
+### Getting Started
+- [Discord bot setup](docs/discord-bot-setup.md) — create a bot, invite it, configure permissions
+- [MCP (Model Context Protocol)](docs/mcp.md) — connect external tool servers
+### Features & Usage
+- [Memory system](docs/memory.md) — five-layer memory architecture, tuning, and troubleshooting
+- [Plan & Forge](docs/plan-and-forge.md) — autonomous planning and code generation
+- [Discord actions](docs/discord-actions.md) — channels, messaging, moderation, tasks, crons
+- [Cron / automations](docs/cron.md) — recurring task setup, advanced options, debugging
+- [Tasks](docs/tasks.md) — task lifecycle, bidirectional sync, tag maps
+- [Voice](docs/voice.md) — real-time voice chat setup (STT/TTS)
+- [Shareable recipes](docs/discoclaw-recipe-spec.md) — integration recipe format spec
+### Development
+- [Philosophy](docs/philosophy.md) — design principles and trade-offs
+- [Releasing](docs/releasing.md) — npm publish workflow and versioning
+- [Inventory](docs/INVENTORY.md) — full component inventory and MVP status
+### Operations
+- [Configuration reference](docs/configuration.md) — all environment variables indexed by category
+- [Webhook exposure](docs/webhook-exposure.md) — tunnel/proxy setup and webhook security
+- [Data migration](docs/data-migration.md) — migrating task data between formats
 ### Install and run
 1. **Install globally:**

package/dist/cli/init-wizard.js CHANGED Viewed

@@ -69,6 +69,23 @@ export function buildEnvContent(vals, now = new Date()) {
     lines.push('# DEFAULTS');
     lines.push(`DISCOCLAW_DISCORD_ACTIONS=${vals.DISCOCLAW_DISCORD_ACTIONS ?? '1'}`);
     lines.push('');
+    // Voice
+    const voiceKeys = [
+        'DISCOCLAW_VOICE_ENABLED',
+        'DEEPGRAM_API_KEY',
+        'DISCOCLAW_DISCORD_ACTIONS_VOICE',
+        'DISCOCLAW_STT_PROVIDER',
+        'DISCOCLAW_TTS_PROVIDER',
+    ];
+    const hasVoice = voiceKeys.some((k) => vals[k]);
+    if (hasVoice) {
+        lines.push('# VOICE');
+        for (const k of voiceKeys) {
+            if (vals[k])
+                lines.push(`${k}=${vals[k]}`);
+        }
+        lines.push('');
+    }
     const autoDetectedKeys = ['DISCOCLAW_TASKS_FORUM', 'DISCOCLAW_CRON_FORUM'];
     const hasAutoDetected = autoDetectedKeys.some((k) => vals[k]);
     if (hasAutoDetected) {
@@ -298,6 +315,16 @@ export async function runInitWizard() {
         values.OPENROUTER_MODEL = 'anthropic/claude-sonnet-4';
     }
     values.DISCOCLAW_DISCORD_ACTIONS = '1';
+    // ── Voice setup ───────────────────────────────────────────────────────────
+    const enableVoice = await ask('\nEnable voice chat? (requires a Deepgram API key — you can skip this and enable later) [y/N] ');
+    if (enableVoice.toLowerCase() === 'y') {
+        const deepgramKey = await askValidated('Deepgram API key: ', (val) => (val ? null : 'Deepgram API key is required'));
+        values.DISCOCLAW_VOICE_ENABLED = '1';
+        values.DEEPGRAM_API_KEY = deepgramKey;
+        values.DISCOCLAW_DISCORD_ACTIONS_VOICE = '1';
+        values.DISCOCLAW_STT_PROVIDER = 'deepgram';
+        values.DISCOCLAW_TTS_PROVIDER = 'deepgram';
+    }
     // ── Write .env ────────────────────────────────────────────────────────────
     const envContent = buildEnvContent(values);
     const tmpPath = path.join(cwd, '.env.tmp');
@@ -329,6 +356,7 @@ export async function runInitWizard() {
     console.log('Configuration complete!\n');
     console.log('Next steps:');
     console.log('  Note: The bot will auto-create its forum channels on first connect.');
+    console.log('  Tip: To add API keys or secrets later, DM the bot: !secret set KEY=value');
     if (values.PRIMARY_RUNTIME === 'claude') {
         console.log(`  ${daemonHint}`);
     }