discoclaw 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/.context/README.md +2 -0
  2. package/.context/architecture.md +1 -0
  3. package/.context/memory.md +26 -1
  4. package/.context/project.md +2 -2
  5. package/.context/voice.md +87 -0
  6. package/.env.example +30 -0
  7. package/.env.example.full +76 -5
  8. package/README.md +89 -0
  9. package/dist/cli/init-wizard.js +28 -0
  10. package/dist/cli/init-wizard.test.js +66 -0
  11. package/dist/config.js +68 -2
  12. package/dist/config.test.js +153 -0
  13. package/dist/cron/cron-prompt.js +72 -0
  14. package/dist/cron/cron-prompt.test.js +244 -0
  15. package/dist/cron/executor.js +66 -14
  16. package/dist/cron/executor.test.js +116 -0
  17. package/dist/cron/json-router.js +110 -0
  18. package/dist/cron/json-router.test.js +303 -0
  19. package/dist/cron/run-stats.js +14 -2
  20. package/dist/cron/run-stats.test.js +70 -5
  21. package/dist/discoclaw-recipe-format.test.js +87 -53
  22. package/dist/discord/action-categories.js +2 -0
  23. package/dist/discord/actions-config.js +130 -17
  24. package/dist/discord/actions-config.test.js +47 -7
  25. package/dist/discord/actions-crons.js +89 -1
  26. package/dist/discord/actions-crons.test.js +110 -0
  27. package/dist/discord/actions-forge.js +7 -3
  28. package/dist/discord/actions-forge.test.js +36 -1
  29. package/dist/discord/actions-imagegen.js +58 -5
  30. package/dist/discord/actions-imagegen.test.js +151 -0
  31. package/dist/discord/actions-voice.js +130 -0
  32. package/dist/discord/actions-voice.test.js +283 -0
  33. package/dist/discord/actions.js +17 -1
  34. package/dist/discord/actions.test.js +50 -0
  35. package/dist/discord/channel-context.js +31 -0
  36. package/dist/discord/deferred-runner.js +2 -0
  37. package/dist/discord/durable-consolidation.js +130 -0
  38. package/dist/discord/durable-consolidation.test.js +367 -0
  39. package/dist/discord/durable-memory.js +3 -0
  40. package/dist/discord/file-download.js +4 -6
  41. package/dist/discord/file-download.test.js +5 -3
  42. package/dist/discord/forge-plan-registry.js +20 -0
  43. package/dist/discord/forge-plan-registry.test.js +36 -1
  44. package/dist/discord/help-command.js +2 -0
  45. package/dist/discord/message-coordinator.js +81 -1
  46. package/dist/discord/models-command.js +26 -4
  47. package/dist/discord/output-utils.js +1 -1
  48. package/dist/discord/plan-manager.test.js +6 -1
  49. package/dist/discord/prompt-common.js +20 -0
  50. package/dist/discord/prompt-common.test.js +66 -0
  51. package/dist/discord/reaction-handler.js +2 -0
  52. package/dist/discord/runtime-utils.js +61 -24
  53. package/dist/discord/runtime-utils.test.js +20 -1
  54. package/dist/discord/secret-commands.js +155 -0
  55. package/dist/discord/secret-commands.test.js +264 -0
  56. package/dist/discord/system-bootstrap.js +18 -0
  57. package/dist/discord/system-bootstrap.test.js +33 -0
  58. package/dist/discord/tool-aware-queue.js +1 -1
  59. package/dist/discord/tool-aware-queue.test.js +13 -13
  60. package/dist/discord/user-errors.js +4 -0
  61. package/dist/discord/user-errors.test.js +10 -0
  62. package/dist/discord/user-turn-to-durable.js +40 -4
  63. package/dist/discord/user-turn-to-durable.test.js +134 -1
  64. package/dist/discord/voice-command.js +76 -0
  65. package/dist/discord/voice-command.test.js +285 -0
  66. package/dist/discord/voice-status-command.js +56 -0
  67. package/dist/discord/voice-status-command.test.js +149 -0
  68. package/dist/discord/youtube-transcript.js +101 -0
  69. package/dist/discord/youtube-transcript.test.js +196 -0
  70. package/dist/discord.js +2 -1
  71. package/dist/discord.render.test.js +18 -6
  72. package/dist/index.js +376 -3
  73. package/dist/index.post-connect.js +2 -0
  74. package/dist/mcp-detect.js +15 -0
  75. package/dist/mcp-detect.test.js +18 -1
  76. package/dist/pipeline/engine.js +53 -16
  77. package/dist/runtime/claude-code-cli.test.js +40 -0
  78. package/dist/runtime/loop-detector.js +154 -0
  79. package/dist/runtime/loop-detector.test.js +163 -0
  80. package/dist/runtime/model-tiers.js +1 -1
  81. package/dist/runtime/model-tiers.test.js +4 -4
  82. package/dist/runtime/openai-tool-exec.js +2 -1
  83. package/dist/runtime/openai-tool-exec.test.js +2 -1
  84. package/dist/runtime/strategies/claude-strategy.js +28 -1
  85. package/dist/runtime/tool-tiers.js +106 -0
  86. package/dist/runtime/tool-tiers.test.js +120 -0
  87. package/dist/runtime-overrides.js +87 -0
  88. package/dist/runtime-overrides.test.js +239 -0
  89. package/dist/sanitize-external.js +51 -0
  90. package/dist/sanitize-external.test.js +95 -0
  91. package/dist/voice/audio-pipeline.js +205 -0
  92. package/dist/voice/audio-pipeline.test.js +619 -0
  93. package/dist/voice/audio-receiver.js +165 -0
  94. package/dist/voice/audio-receiver.test.js +497 -0
  95. package/dist/voice/connection-manager.js +138 -0
  96. package/dist/voice/connection-manager.test.js +317 -0
  97. package/dist/voice/opus.js +37 -0
  98. package/dist/voice/opus.test.js +81 -0
  99. package/dist/voice/presence-handler.js +123 -0
  100. package/dist/voice/presence-handler.test.js +545 -0
  101. package/dist/voice/stt-deepgram.js +138 -0
  102. package/dist/voice/stt-deepgram.test.js +202 -0
  103. package/dist/voice/stt-factory.js +39 -0
  104. package/dist/voice/stt-factory.test.js +45 -0
  105. package/dist/voice/stt-openai.js +154 -0
  106. package/dist/voice/stt-openai.test.js +281 -0
  107. package/dist/voice/transcript-mirror.js +130 -0
  108. package/dist/voice/transcript-mirror.test.js +257 -0
  109. package/dist/voice/tts-cartesia.js +169 -0
  110. package/dist/voice/tts-cartesia.test.js +228 -0
  111. package/dist/voice/tts-deepgram.js +76 -0
  112. package/dist/voice/tts-deepgram.test.js +197 -0
  113. package/dist/voice/tts-factory.js +48 -0
  114. package/dist/voice/tts-factory.test.js +53 -0
  115. package/dist/voice/tts-openai.js +69 -0
  116. package/dist/voice/tts-openai.test.js +138 -0
  117. package/dist/voice/types.js +7 -0
  118. package/dist/voice/types.test.js +84 -0
  119. package/dist/voice/voice-action-flags.js +41 -0
  120. package/dist/voice/voice-action-flags.test.js +80 -0
  121. package/dist/voice/voice-responder.js +192 -0
  122. package/dist/voice/voice-responder.test.js +468 -0
  123. package/dist/voice/voice-style-prompt.js +13 -0
  124. package/dist/voice/voice-style-prompt.test.js +20 -0
  125. package/dist/workspace-bootstrap.test.js +1 -1
  126. package/package.json +17 -3
  127. package/templates/workspace/AGENTS.md +1 -2
@@ -18,6 +18,7 @@ Core instructions live in `CLAUDE.md` at the repo root.
18
18
  | **Task tracking** | `tasks.md` |
19
19
  | **Architecture / system overview** | `architecture.md` |
20
20
  | **Tool capabilities / browser automation** | `tools.md` |
21
+ | **Voice system (STT/TTS, audio pipeline, actions)** | `voice.md` |
21
22
  | **Forge/plan standing constraints** | `project.md` *(auto-loaded by forge)* |
22
23
  | **Plan & Forge commands** | `plan-and-forge.md` *(in docs/, not .context/)* |
23
24
 
@@ -38,5 +39,6 @@ Core instructions live in `CLAUDE.md` at the repo root.
38
39
  - **architecture.md** — System overview, data flow, directory layout, key concepts
39
40
  - **bot-setup.md** — One-time bot creation and invite guide
40
41
  - **tools.md** — Available tools: browser automation (agent-browser), escalation ladder, CDP connect, security guardrails
42
+ - **voice.md** — Voice subsystem: module map, audio data flow, key patterns (barge-in, allowlist gating), wiring sequence, dependencies, config reference
41
43
  - **project.md** — Standing constraints auto-loaded by forge drafter and auditor
42
44
  - **docs/plan-and-forge.md** — Canonical reference for `!plan` and `!forge` commands (lives in `docs/`, not `.context/` — human/developer reference, not auto-loaded into agent context)
@@ -25,6 +25,7 @@ Discord message
25
25
  | `src/discord/` | Discord subsystems: actions, allowlist, channel context, memory, output |
26
26
  | `src/runtime/` | Runtime adapters (Claude CLI), concurrency, process pool |
27
27
  | `src/tasks/` | In-process task data model + store + migration helpers |
28
+ | `src/voice/` | Voice chat: STT/TTS providers, audio pipeline, connection manager |
28
29
  | `src/cron/` | Cron scheduler, executor, forum sync, run stats |
29
30
  | `src/observability/` | Metrics registry |
30
31
  | `src/sessions.ts` | Session manager (maps session keys to runtime session IDs) |
@@ -51,6 +51,19 @@ Bot: Given your preference for Rust in systems work, I'd lean that way —
51
51
  especially since this is a low-level networking tool.
52
52
  ```
53
53
 
54
+ #### Consolidation
55
+
56
+ When the active item count for a user crosses a threshold (`DISCOCLAW_DURABLE_CONSOLIDATION_THRESHOLD`, default `100`), consolidation can be triggered to prune and merge the list. A single `fast`-tier model call receives all active items and is asked to return a revised list — removing exact duplicates, merging near-duplicates, dropping clearly stale items, and preserving everything that is still plausibly useful. The model must not invent new facts or change the meaning of existing ones.
57
+
58
+ The revised list is applied atomically: items absent from the model's output are deprecated via `deprecateItems()`; new or rewritten items are written via `addItem()`. Items present verbatim in the output are left untouched (no unnecessary writes).
59
+
60
+ **Safety guards:**
61
+ - The revised list must contain at least 50 % of the original count. If the model returns fewer items than that floor, consolidation is aborted and a warning is logged — no writes occur.
62
+ - Consolidation runs at most once per session per user, regardless of how many writes happen. This prevents runaway API calls.
63
+ - All mutations flow through the existing durable write queue, so consolidation is serialized with concurrent `!memory remember` / auto-extraction writes.
64
+
65
+ **Config:** `DISCOCLAW_DURABLE_CONSOLIDATION_THRESHOLD=100` sets the item count at which consolidation becomes eligible. `DISCOCLAW_DURABLE_CONSOLIDATION_MODEL=fast` selects the model tier used for the consolidation call.
66
+
54
67
  ### 3. Memory Commands — user-facing control surface
55
68
 
56
69
  `src/discord/memory-commands.ts`
@@ -103,7 +116,16 @@ Bot: Cool, platform work! What's your first project?
103
116
  [tool] Works with Kubernetes and Terraform
104
117
  ```
105
118
 
106
- **Config:** `DISCOCLAW_SUMMARY_TO_DURABLE_ENABLED=false` to disable.
119
+ **Supersession:** When extraction runs, active durable items for the user are appended
120
+ to the prompt. The model may return a `supersedes` field on any extracted item, containing
121
+ a substring that uniquely identifies the old item's text. The old item is then deprecated
122
+ atomically before the new item is written — no additional API call required. This prevents
123
+ stale preferences from accumulating (e.g. "I prefer Vim" is deprecated when "I switched to
124
+ Neovim" is later extracted).
125
+
126
+ **Config:** `DISCOCLAW_SUMMARY_TO_DURABLE_ENABLED=false` to disable extraction entirely.
127
+ `DISCOCLAW_DURABLE_SUPERSESSION_SHADOW=1` to observe what the model would supersede without
128
+ actually deprecating (shadow mode logs matches to stdout). Live supersession is on by default.
107
129
 
108
130
  ### 5. Short-Term Memory — cross-channel awareness
109
131
 
@@ -248,6 +270,9 @@ Short-term entries also store `channelId` alongside the existing `channelName`.
248
270
  | `DISCOCLAW_DURABLE_MAX_ITEMS` | `200` | Durable memory |
249
271
  | `DISCOCLAW_MEMORY_COMMANDS_ENABLED` | `true` | Memory commands |
250
272
  | `DISCOCLAW_SUMMARY_TO_DURABLE_ENABLED` | `true` | Auto-extraction |
273
+ | `DISCOCLAW_DURABLE_SUPERSESSION_SHADOW` | `false` | Auto-extraction |
274
+ | `DISCOCLAW_DURABLE_CONSOLIDATION_THRESHOLD` | `100` | Durable memory |
275
+ | `DISCOCLAW_DURABLE_CONSOLIDATION_MODEL` | `fast` | Durable memory |
251
276
  | `DISCOCLAW_SHORTTERM_MEMORY_ENABLED` | `true` | Short-term memory |
252
277
  | `DISCOCLAW_SHORTTERM_MAX_ENTRIES` | `20` | Short-term memory |
253
278
  | `DISCOCLAW_SHORTTERM_MAX_AGE_HOURS` | `6` | Short-term memory |
@@ -26,7 +26,7 @@ Standing constraints for planning and auditing. These apply to all forge/plan op
26
26
  Plans that grow too large fail — they blow past token limits, cause audit/revise loops to diverge, and produce specs no human will review. These constraints prevent that.
27
27
 
28
28
  ### Scope limits
29
- - A plan should target **3–5 files** max. If a feature touches more, decompose into multiple sequential plans before drafting.
29
+ - A plan should target **3–5 source modules** max (not counting co-located test files, lockfiles, or other generated artifacts). If a feature touches more source modules than that, decompose into multiple sequential plans before drafting.
30
30
 
31
31
  ### Size limits
32
32
  - Plan content (excluding the Audit Log section) should not exceed **200 lines**. If the draft exceeds this, the scope is too large.
@@ -40,4 +40,4 @@ Plans that grow too large fail — they blow past token limits, cause audit/revi
40
40
  ### Auditor guidance
41
41
  - Do NOT flag "underspecified implementation details" as medium/high. The plan describes intent and scope — the implementing agent fills in the details.
42
42
  - DO flag: missing scope items, incorrect assumptions about existing code, safety/correctness issues, missing error handling for external boundaries.
43
- - If a plan is too large or touches too many files, flag that as **high severity** with recommendation to split.
43
+ - If a plan is too large or touches too many source modules, flag that as **high severity** with recommendation to split. Co-located test files, lockfile changes from dependency additions, and doc/inventory updates do not count toward the scope cap.
@@ -0,0 +1,87 @@
1
+ # Voice System
2
+
3
+ Real-time voice chat: STT transcription, AI response generation, TTS synthesis, and Discord voice playback. For operator setup, see `docs/voice.md`.
4
+
5
+ ## Dependencies
6
+
7
+ Three npm packages power the Discord voice integration (`@discordjs/opus` and `sodium-native` are native addons):
8
+
9
+ - **`@discordjs/voice`** — voice connection management, audio player/receiver, gateway adapter. Used in `connection-manager.ts`, `audio-receiver.ts`, `audio-pipeline.ts`, `voice-responder.ts`.
10
+ - **`@discordjs/opus`** — native Opus codec binding (C++ addon, requires build tools). Wrapped by `opus.ts` to decode Discord's 48 kHz stereo Opus packets to PCM s16le.
11
+ - **`sodium-native`** — encryption for Discord voice (auto-detected by `@discordjs/voice`).
12
+
13
+ ## Module Map
14
+
15
+ | Module | Role |
16
+ |--------|------|
17
+ | `src/voice/types.ts` | Core interfaces: `VoiceConfig`, `AudioFrame`, `SttProvider`, `TtsProvider`, `TranscriptionResult` |
18
+ | `src/voice/connection-manager.ts` | Per-guild `VoiceConnection` lifecycle (join/leave/mute/deafen), reconnect retries (default 5), error-to-destroy safety net |
19
+ | `src/voice/audio-pipeline.ts` | Per-guild orchestrator — auto-starts STT + receiver + responder on connection Ready, auto-stops on Destroyed |
20
+ | `src/voice/audio-receiver.ts` | Subscribes to allowlisted users' Opus streams, decodes via `OpusDecoderFactory`, downsamples 48→16 kHz mono, feeds `SttProvider` |
21
+ | `src/voice/opus.ts` | `OpusDecoderFactory` implementation wrapping `@discordjs/opus` |
22
+ | `src/voice/voice-responder.ts` | AI invoke → TTS synthesis → `AudioPlayer` playback; generation-based cancellation for barge-in |
23
+ | `src/voice/stt-deepgram.ts` | Deepgram Nova-3 streaming STT via WebSocket |
24
+ | `src/voice/tts-cartesia.ts` | Cartesia Sonic-3 TTS via WebSocket, outputs PCM s16le at 24 kHz |
25
+ | `src/voice/tts-deepgram.ts` | Deepgram Aura TTS via REST, outputs PCM s16le at 24 kHz |
26
+ | `src/voice/stt-factory.ts` | STT provider factory (deepgram or whisper stub) |
27
+ | `src/voice/tts-factory.ts` | TTS provider factory (cartesia, deepgram, openai, or kokoro stub) |
28
+ | `src/voice/presence-handler.ts` | Auto-join/leave on `voiceStateUpdate` (allowlisted users only) |
29
+ | `src/voice/transcript-mirror.ts` | Posts user transcriptions and bot responses to a text channel |
30
+ | `src/voice/voice-action-flags.ts` | Restricted action subset for voice invocations (messaging + tasks + memory only) |
31
+ | `src/discord/actions-voice.ts` | Discord action types: `voiceJoin`, `voiceLeave`, `voiceStatus`, `voiceMute`, `voiceDeafen` |
32
+
33
+ ## Audio Data Flow
34
+
35
+ ```
36
+ User speaks in Discord voice channel
37
+ → @discordjs/voice receiver emits Opus packets per user
38
+ → AudioReceiver: allowlist gate → OpusDecoder (48 kHz stereo PCM)
39
+ → downsample to 16 kHz mono
40
+ → SttProvider.feedAudio() (Deepgram WebSocket)
41
+ → TranscriptionResult (final transcript)
42
+ → VoiceResponder.handleTranscription()
43
+ → InvokeAiFn (AI runtime) → response text
44
+ → TtsProvider.synthesize() (Cartesia WebSocket → 24 kHz mono PCM)
45
+ → upsampleToDiscord (48 kHz stereo)
46
+ → AudioPlayer → Discord voice connection
47
+ ```
48
+
49
+ ## Key Patterns
50
+
51
+ - **Allowlist gating** — `AudioReceiver` only subscribes to users in `DISCORD_ALLOW_USER_IDS`. Empty allowlist = ignore everyone (fail-closed).
52
+ - **Dual-flag voice actions** — Voice action execution requires both `VOICE_ENABLED` and `DISCORD_ACTIONS_VOICE`. The `buildVoiceActionFlags()` function intersects a voice-specific allowlist (messaging, tasks, memory) with env config; all other action categories are hard-disabled.
53
+ - **Generation-based cancellation** — `VoiceResponder` increments a generation counter on each new transcription. If a newer transcription arrives mid-pipeline, the older one is silently abandoned.
54
+ - **Barge-in** — Gated on a non-empty STT transcription result, not the raw VAD `speaking.start` event. Echo from the bot's own TTS leaking through the user's mic produces empty transcriptions and is ignored. Only when `VoiceResponder.handleTranscription()` receives a non-empty transcript while the player is active does it stop playback and advance the generation counter. This eliminates false positives from echo without relying on a static grace-period timeout.
55
+ - **Re-entrancy guard** — `AudioPipelineManager.startPipeline` uses a `starting` set because `VoiceConnection.subscribe()` synchronously fires a Ready state change.
56
+ - **Error containment** — `VoiceConnectionManager` catches connection errors and destroys the connection to prevent process crashes (e.g. DAVE handshake failures).
57
+ - **Deepgram TTS 2000-char limit** — Deepgram Aura REST TTS returns HTTP 413 (silent failure) for inputs exceeding ~2000 characters. `tts-deepgram.ts` truncates the input to 2000 chars before sending to prevent silent audio dropouts. If the AI response is unexpectedly long (e.g. from a missing `VOICE_STYLE_INSTRUCTION`), the user will still hear a truncated response rather than silence.
58
+
59
+ ## Wiring (`src/index.ts`)
60
+
61
+ When `voiceEnabled=true`, the post-connect block in `src/index.ts` initializes the voice subsystem in order:
62
+
63
+ 1. **`TranscriptMirror.resolve()`** — resolves the voice log channel used for transcript mirroring (may be `null` if unconfigured).
64
+ 2. **`voiceInvokeAi`** closure — builds the AI invocation function that prepends channel context, PA prompt, durable memory, voice system prompt, and action instructions to user speech. Supports up to 1 follow-up round for action results. `runtimeTimeoutMs` is applied to each invocation as a safety net against runaway queries.
65
+ 3. **`AudioPipelineManager`** — instantiated with voice config, allowlist, decoder factory, `voiceInvokeAi`, transcript mirror, and a transcription logging callback.
66
+ 4. **`VoiceConnectionManager`** — instantiated with `onReady` → `audioPipeline.startPipeline()` and `onDestroyed` → `audioPipeline.stopPipeline()` callbacks.
67
+ 5. **`botParams.voiceCtx`** — set when `DISCORD_ACTIONS_VOICE` is enabled, exposing `voiceManager` to Discord action handlers (`voiceJoin`, `voiceLeave`, etc.).
68
+ 6. **`VoicePresenceHandler`** — created and registered on the Discord client only when `VOICE_AUTO_JOIN` is enabled.
69
+
70
+ ## Config (env vars)
71
+
72
+ | Variable | Default | Purpose |
73
+ |----------|---------|---------|
74
+ | `DISCOCLAW_VOICE_ENABLED` | `0` | Master switch |
75
+ | `DISCOCLAW_DISCORD_ACTIONS_VOICE` | `0` | Enable voice action types |
76
+ | `DISCOCLAW_VOICE_AUTO_JOIN` | `0` | Auto-join when allowlisted user enters |
77
+ | `DISCOCLAW_STT_PROVIDER` | `deepgram` | STT backend |
78
+ | `DISCOCLAW_TTS_PROVIDER` | `cartesia` | TTS backend (`cartesia`, `deepgram`, `openai`, `kokoro`) |
79
+ | `DISCOCLAW_VOICE_HOME_CHANNEL` | — | Voice audio channel name/ID used for prompt context (not transcript mirroring) |
80
+ | `DISCOCLAW_VOICE_LOG_CHANNEL` | — | Text channel name/ID where `TranscriptMirror` posts user transcriptions and bot responses; falls back to bootstrap-provided `voiceLogChannelId` if unset |
81
+ | `DISCOCLAW_VOICE_MODEL` | `capable` | AI model tier for voice responses |
82
+ | `DISCOCLAW_VOICE_SYSTEM_PROMPT` | — | Custom system prompt for voice invocations (max 4000 chars) |
83
+ | `DEEPGRAM_API_KEY` | — | Required for deepgram STT and TTS |
84
+ | `DEEPGRAM_STT_MODEL` | `nova-3-conversationalai` | Deepgram STT model name |
85
+ | `DEEPGRAM_TTS_VOICE` | `aura-2-asteria-en` | Deepgram TTS voice name |
86
+ | `CARTESIA_API_KEY` | — | Required for cartesia TTS |
87
+ | *(built-in)* | — | Telegraphic style instruction hardcoded into every voice AI invocation — front-loads the answer, strips preambles/markdown/filler, keeps responses short for TTS latency. Not an env var; not overridable by `DISCOCLAW_VOICE_SYSTEM_PROMPT`. |
package/.env.example CHANGED
@@ -32,6 +32,11 @@ DISCORD_ALLOW_USER_IDS=
32
32
  # connect and persisted to system-scaffold.json. Only set this to override the auto-created channel.
33
33
  #DISCOCLAW_CRON_FORUM=
34
34
 
35
+ # Default model for cron job execution: fast | capable (or concrete model names).
36
+ # Defaults to capable (resolves to Sonnet on Claude Code) — avoids using Opus on routine cron work.
37
+ # Override at runtime via `!models set cron-exec <model>`.
38
+ #DISCOCLAW_CRON_EXEC_MODEL=capable
39
+
35
40
  # ----------------------------------------------------------
36
41
  # CORE — most users will want to review these
37
42
  # ----------------------------------------------------------
@@ -95,6 +100,31 @@ DISCORD_GUILD_ID=
95
100
  # auto-selected: systemctl --user on Linux, launchctl on macOS.
96
101
  #DC_RESTART_CMD=
97
102
 
103
+ # ----------------------------------------------------------
104
+ # Voice — configure via `pnpm setup` or `discoclaw init`
105
+ # ----------------------------------------------------------
106
+ # Run `pnpm setup` or `discoclaw init` to enable voice interactively,
107
+ # or set these vars manually to enable voice chat (STT/TTS via Deepgram).
108
+ #DISCOCLAW_VOICE_ENABLED=0
109
+ # Text channel used for voice prompt context and actions (e.g. posting action results,
110
+ # reading pinned notes). Required for full voice functionality when voice is enabled.
111
+ #DISCOCLAW_VOICE_HOME_CHANNEL= # e.g. "voice"
112
+ # Text channel for posting voice transcripts. Optional — auto-discovered via bootstrap
113
+ # (the bot creates a "voice-log" text channel and stores its ID in system-scaffold.json).
114
+ # Only set this to override the auto-discovered channel.
115
+ #DISCOCLAW_VOICE_LOG_CHANNEL=
116
+ #DEEPGRAM_API_KEY=
117
+
118
+ # ----------------------------------------------------------
119
+ # Secret management via Discord DM
120
+ # ----------------------------------------------------------
121
+ # Use !secret in a DM to the bot to add or update .env entries without
122
+ # restarting or editing files directly. Values are never echoed back.
123
+ # !secret set KEY=value — add/update an entry
124
+ # !secret unset KEY — remove an entry
125
+ # !secret list — list key names (values hidden)
126
+ # The bot must be restarted (!restart) for changes to take effect.
127
+
98
128
  # ----------------------------------------------------------
99
129
  # For all ~90 options (subsystems, actions, memory, identity,
100
130
  # observability, advanced/debug), see .env.example.full
package/.env.example.full CHANGED
@@ -64,9 +64,9 @@ DISCORD_ALLOW_USER_IDS=
64
64
  # Unset = use the built-in default shown in the comments.
65
65
  # Concrete model names (e.g. sonnet, gpt-4o-mini) are passed through unchanged.
66
66
  #
67
- # Claude Code adapter (default: fast=haiku, capable=opus):
67
+ # Claude Code adapter (default: fast=haiku, capable=sonnet):
68
68
  #DISCOCLAW_TIER_CLAUDE_CODE_FAST=haiku
69
- #DISCOCLAW_TIER_CLAUDE_CODE_CAPABLE=opus
69
+ #DISCOCLAW_TIER_CLAUDE_CODE_CAPABLE=sonnet
70
70
  #
71
71
  # Gemini CLI adapter (default: fast=gemini-2.5-flash, capable=gemini-2.5-pro):
72
72
  #DISCOCLAW_TIER_GEMINI_FAST=gemini-2.5-flash
@@ -79,6 +79,12 @@ DISCORD_ALLOW_USER_IDS=
79
79
  # Codex CLI adapter (default: adapter-default for both tiers):
80
80
  #DISCOCLAW_TIER_CODEX_FAST=
81
81
  #DISCOCLAW_TIER_CODEX_CAPABLE=
82
+ #
83
+ # Tool-tier map — override which tool tier a model resolves to.
84
+ # Comma-separated model=tier pairs. Tiers: basic, standard, full.
85
+ # Format: haiku=basic,sonnet=standard,opus=full
86
+ # Unset = built-in pattern matching (haiku/flash→basic, sonnet→standard, opus/unknown→full).
87
+ #DISCOCLAW_TOOL_TIER_MAP=
82
88
 
83
89
  # Output format for the Claude CLI. stream-json gives smoother streaming.
84
90
  #CLAUDE_OUTPUT_FORMAT=stream-json
@@ -107,6 +113,11 @@ DISCORD_ALLOW_USER_IDS=
107
113
  # Forum channel ID is auto-created on first connect (see AUTO-DETECTED above).
108
114
  # Model tier for cron execution: fast | capable (concrete names accepted as passthrough).
109
115
  #DISCOCLAW_CRON_MODEL=fast
116
+ # Default model tier for cron job execution (fast | capable, concrete names accepted as passthrough).
117
+ # Defaults to capable (resolves to Sonnet on Claude Code) — avoids using Opus on routine cron work.
118
+ # Per-job overrides and AI-classified model still win when set. Override at runtime via
119
+ # `!models set cron-exec <model>`.
120
+ #DISCOCLAW_CRON_EXEC_MODEL=capable
110
121
  # Enable cron Discord actions (CRUD via Discord action blocks).
111
122
  #DISCOCLAW_DISCORD_ACTIONS_CRONS=1
112
123
  # Persistent stats directory (run counts, last run time, status).
@@ -232,6 +243,9 @@ DISCOCLAW_DISCORD_ACTIONS_DEFER=1
232
243
  #DISCOCLAW_DURABLE_MEMORY_ENABLED=1
233
244
  #DISCOCLAW_DURABLE_INJECT_MAX_CHARS=2000
234
245
  #DISCOCLAW_DURABLE_MAX_ITEMS=200
246
+ # Shadow mode for durable memory supersession: log supersession matches without actually deprecating.
247
+ # Set to 1 to observe what the model would supersede before enabling live deprecation.
248
+ #DISCOCLAW_DURABLE_SUPERSESSION_SHADOW=0
235
249
  #DISCOCLAW_MEMORY_COMMANDS_ENABLED=1
236
250
  # Override storage directory for durable memory.
237
251
  #DISCOCLAW_DURABLE_DATA_DIR=
@@ -246,6 +260,12 @@ DISCOCLAW_DISCORD_ACTIONS_DEFER=1
246
260
  # Auto-extract notable facts from user messages into durable memory.
247
261
  # Runs after rolling summary generation. Default on.
248
262
  #DISCOCLAW_SUMMARY_TO_DURABLE_ENABLED=1
263
+ # Consolidate durable memory when active item count reaches this threshold.
264
+ # Merges near-duplicates and removes stale items via a fast-tier model call.
265
+ # Set to 0 to disable automatic consolidation (default: 50).
266
+ #DISCOCLAW_MEMORY_CONSOLIDATION_THRESHOLD=50
267
+ # Model tier or name used for consolidation (default: fast).
268
+ #DISCOCLAW_MEMORY_CONSOLIDATION_MODEL=fast
249
269
  # Character budget for recent conversation history in prompts (0 = disabled).
250
270
  #DISCOCLAW_MESSAGE_HISTORY_BUDGET=3000
251
271
 
@@ -455,16 +475,67 @@ DISCOCLAW_DISCORD_ACTIONS_DEFER=1
455
475
  # Image generation
456
476
  # ----------------------------------------------------------
457
477
  # Master switch — enables the imagegen Discord action category (default: off).
458
- # When enabled, the AI can generate images via action blocks using OpenAI or Gemini Imagen.
478
+ # When enabled, the AI can generate images via action blocks using OpenAI or Gemini.
459
479
  DISCOCLAW_DISCORD_ACTIONS_IMAGEGEN=0
460
- # API key for Gemini Imagen models (imagen-4.0-generate-001 and similar).
480
+ # API key for Gemini image generation (Imagen and native Gemini models).
461
481
  # Leave unset to use OpenAI only.
462
482
  #IMAGEGEN_GEMINI_API_KEY=
463
483
  # Override the default image generation model. If unset, auto-detected:
464
484
  # only IMAGEGEN_GEMINI_API_KEY set → imagen-4.0-generate-001; otherwise → dall-e-3.
465
485
  # OpenAI models: dall-e-3, gpt-image-1
466
- # Gemini models: imagen-4.0-generate-001, imagen-4.0-fast-generate-001, imagen-4.0-ultra-generate-001
486
+ # Gemini Imagen models: imagen-4.0-generate-001, imagen-4.0-fast-generate-001, imagen-4.0-ultra-generate-001
487
+ # Gemini native models (text+image in one call): gemini-3.1-flash-image-preview, gemini-3-pro-image-preview
467
488
  #IMAGEGEN_DEFAULT_MODEL=
468
489
  # Note: OpenAI image generation reuses OPENAI_API_KEY (documented above in the
469
490
  # OpenAI-compatible HTTP adapter section). When DISCOCLAW_DISCORD_ACTIONS_IMAGEGEN=1,
470
491
  # at least one of OPENAI_API_KEY or IMAGEGEN_GEMINI_API_KEY must be set.
492
+
493
+ # ----------------------------------------------------------
494
+ # Voice (STT/TTS) — join voice channels, listen and respond
495
+ # ----------------------------------------------------------
496
+ # Master switch — enables voice channel interaction (default: off).
497
+ # When enabled, the bot can join Discord voice channels, transcribe speech via STT,
498
+ # and respond with synthesized speech via TTS.
499
+ #DISCOCLAW_VOICE_ENABLED=0
500
+ # Enable voice Discord action category (voiceJoin, voiceLeave, voiceStatus, voiceMute, voiceDeafen).
501
+ # Requires DISCOCLAW_VOICE_ENABLED=1 to take effect (default: off).
502
+ #DISCOCLAW_DISCORD_ACTIONS_VOICE=0
503
+ # Auto-join voice channels when a non-bot user joins, and auto-leave when the last
504
+ # non-bot user leaves. Starts/tears down the audio pipeline (STT receiver) automatically.
505
+ # Requires DISCOCLAW_VOICE_ENABLED=1 (default: off).
506
+ #DISCOCLAW_VOICE_AUTO_JOIN=0
507
+ # Speech-to-text provider: deepgram (Deepgram Nova-3 API) or whisper (whisper.cpp local).
508
+ # deepgram requires DEEPGRAM_API_KEY; whisper runs locally with no API key.
509
+ #DISCOCLAW_STT_PROVIDER=deepgram
510
+ # Text-to-speech provider: cartesia | deepgram | kokoro | openai.
511
+ # cartesia requires CARTESIA_API_KEY; deepgram reuses DEEPGRAM_API_KEY;
512
+ # openai requires OPENAI_API_KEY; kokoro runs locally with no API key.
513
+ #DISCOCLAW_TTS_PROVIDER=cartesia
514
+ # Voice audio channel name or ID — the channel the bot joins for voice interaction.
515
+ # Used as the prompt context source (root policy, PA files, channel context, durable memory).
516
+ # The old env var DISCOCLAW_VOICE_TRANSCRIPT_CHANNEL is still accepted as a fallback.
517
+ # Leave unset to skip voice channel context in prompts.
518
+ #DISCOCLAW_VOICE_HOME_CHANNEL= # e.g. "voice" if using the default scaffold
519
+ # Text channel name or ID for posting voice transcripts (user STT and bot TTS responses).
520
+ # Optional — auto-discovered via bootstrap (the bot creates "voice-log" and stores its ID
521
+ # in system-scaffold.json). Only set this to override the auto-discovered channel.
522
+ # Leave unset to disable transcript mirroring.
523
+ #DISCOCLAW_VOICE_LOG_CHANNEL= # e.g. "voice-log" if using the default scaffold
524
+ # Model for voice AI responses: tier (fast | capable) or concrete name (sonnet, opus, haiku).
525
+ # Independent of RUNTIME_MODEL — allows tuning voice latency vs quality separately from chat.
526
+ # Switchable at runtime via `modelSet voice <model>`.
527
+ # Default: follows DISCOCLAW_FAST_MODEL (override here for voice-specific tuning).
528
+ #DISCOCLAW_VOICE_MODEL=sonnet
529
+ # Custom system prompt prepended to voice AI invocations. Max 4000 chars.
530
+ # Use this to set a conversational tone, brevity instructions, or persona for voice responses.
531
+ #DISCOCLAW_VOICE_SYSTEM_PROMPT=
532
+ # API key for Deepgram (Nova-3 STT and Aura TTS). Required when DISCOCLAW_STT_PROVIDER=deepgram or DISCOCLAW_TTS_PROVIDER=deepgram.
533
+ #DEEPGRAM_API_KEY=
534
+ # Deepgram STT model for voice transcription (default: nova-3-conversationalai).
535
+ # See https://developers.deepgram.com/docs/models-languages-overview for available models.
536
+ #DEEPGRAM_STT_MODEL=nova-3-conversationalai
537
+ # Deepgram TTS voice for speech synthesis (default: aura-2-asteria-en).
538
+ # See https://developers.deepgram.com/docs/tts-models for available voices.
539
+ #DEEPGRAM_TTS_VOICE=aura-2-asteria-en
540
+ # API key for Cartesia Sonic-3 TTS. Required when DISCOCLAW_TTS_PROVIDER=cartesia.
541
+ #CARTESIA_API_KEY=
package/README.md CHANGED
@@ -32,6 +32,10 @@ Your assistant carries context across every conversation, channel, and restart.
32
32
 
33
33
  **Why Discord fits:** channels = context boundaries, DMs = private deep context, conversation history is the raw material.
34
34
 
35
+ ### YouTube transcripts
36
+
37
+ When you share a YouTube link in a message, DiscoClaw automatically fetches the video's transcript and injects it into the AI's context. This lets the bot answer questions about video content, summarize talks, or reference specific points — without you needing to copy-paste anything. Up to 3 videos per message are processed, with a 15-second timeout per fetch. Transcripts are sanitized before injection to prevent prompt manipulation.
38
+
35
39
  ## Tasks — the bot tracks your work
36
40
 
37
41
  A lightweight in-process task store that syncs bidirectionally with Discord forum threads.
@@ -54,6 +58,22 @@ Recurring tasks defined as forum threads in plain language — no crontab, no se
54
58
 
55
59
  **Why Discord fits:** forum threads = job definitions, archive/unarchive = pause/resume, no separate scheduler UI needed.
56
60
 
61
+ <!-- source-of-truth: docs/voice.md -->
62
+ ## Voice — the bot talks back
63
+
64
+ DiscoClaw can join Discord voice channels for real-time conversation: listen via speech-to-text, think with the AI runtime, and speak the response via text-to-speech.
65
+
66
+ - **STT** — Deepgram Nova-3 streaming transcription (WebSocket)
67
+ - **TTS** — Cartesia Sonic-3 speech synthesis (WebSocket, 24 kHz PCM) or Deepgram Aura-2, selected via `DISCOCLAW_TTS_PROVIDER`
68
+ - **Barge-in** — interrupt the bot mid-sentence by speaking; playback stops immediately
69
+ - **Auto-join** — optionally have the bot join and leave voice channels automatically as you enter or leave them
70
+ - **Transcript mirror** — voice conversations are mirrored to a text channel for persistence
71
+ - **Voice actions** — the AI can execute a restricted action subset (messaging, tasks, memory) during voice
72
+
73
+ Voice is **off by default**. Enable with `DISCOCLAW_VOICE_ENABLED=1` plus API keys for your STT/TTS providers. Requires Node 22+ (for native WebSocket used by Cartesia TTS) and C++ build tools (for the `@discordjs/opus` native addon).
74
+
75
+ Full setup guide: [docs/voice.md](docs/voice.md)
76
+
57
77
  ## How it works
58
78
 
59
79
  DiscoClaw orchestrates the flow between Discord and AI runtimes (Claude Code by default, with Gemini, OpenAI, Codex, and OpenRouter adapters available via `PRIMARY_RUNTIME`). The OpenAI-compatible and OpenRouter adapters support optional tool use (function calling) when `OPENAI_COMPAT_TOOLS_ENABLED=1` is set. It doesn't contain intelligence itself — it decides *when* to call the AI, *what context* to give it, and *what to do* with the output. When you send a message, the orchestrator:
@@ -64,12 +84,51 @@ DiscoClaw orchestrates the flow between Discord and AI runtimes (Claude Code by
64
84
  4. Streams the response back, chunked to fit Discord's message limits
65
85
  5. Parses and executes any Discord actions the assistant emitted
66
86
 
87
+ ### Message batching
88
+
89
+ When multiple messages arrive while the bot is thinking (i.e., an AI invocation is already active for that session), they're automatically combined into a single prompt rather than queued individually. This means rapid follow-up messages are processed together, giving the bot full context in one shot. Commands (`!`-prefixed messages) bypass batching and are always processed individually.
90
+
67
91
  ### OpenRouter
68
92
 
69
93
  Set `PRIMARY_RUNTIME=openrouter` to route requests through [OpenRouter](https://openrouter.ai), which provides access to models from Anthropic, OpenAI, Google, and others via a single API key — useful if you want to switch models without managing multiple provider accounts.
70
94
 
71
95
  Required: `OPENROUTER_API_KEY`. Optional overrides: `OPENROUTER_BASE_URL` (default: `https://openrouter.ai/api/v1`) and `OPENROUTER_MODEL` (default: `anthropic/claude-sonnet-4`). See `.env.example` for the full reference.
72
96
 
97
+ ## Model Overrides
98
+
99
+ The `!models` command lets you view and swap AI models per role at runtime — no restart needed, and changes persist across restarts.
100
+
101
+ **Roles:** `chat`, `fast`, `forge-drafter`, `forge-auditor`, `summary`, `cron`, `cron-exec`, `voice`
102
+
103
+ | Command | Description |
104
+ |---------|-------------|
105
+ | `!models` | Show current model assignments |
106
+ | `!models set <role> <model>` | Change the model for a role |
107
+ | `!models reset` | Revert all roles to env-var defaults |
108
+ | `!models reset <role>` | Revert a specific role |
109
+
110
+ **Examples:**
111
+ - `!models set chat claude-sonnet-4` — use Sonnet for chat
112
+ - `!models set chat openrouter` — switch chat to the OpenRouter runtime
113
+ - `!models set cron-exec haiku` — run crons on a cheaper model
114
+ - `!models set voice sonnet` — use a specific model for voice
115
+ - `!models reset` — clear all overrides
116
+
117
+ Setting the `chat` role to a runtime name (`openrouter`, `openai`, `gemini`, `codex`, `claude`) switches the active runtime adapter for that role.
118
+
119
+ ## Secret Management
120
+
121
+ The `!secret` command lets you manage `.env` entries from Discord without touching the file directly. It works in DMs only — values are never echoed back.
122
+
123
+ | Command | Description |
124
+ |---------|-------------|
125
+ | `!secret set KEY=value` | Add or update a `.env` entry |
126
+ | `!secret unset KEY` | Remove a `.env` entry |
127
+ | `!secret list` | List key names in `.env` (values hidden) |
128
+ | `!secret help` | Show usage |
129
+
130
+ Changes take effect after a restart (`!restart`). Writes are atomic — a partial write can't corrupt your `.env`.
131
+
73
132
  ## Customization
74
133
 
75
134
  ### Shareable integration recipes
@@ -106,6 +165,7 @@ When using the Claude runtime, you can connect external tool servers via MCP. Pl
106
165
  **Contributors (from source):**
107
166
  - Everything above, plus **pnpm** — enable via Corepack (`corepack enable`) or install separately
108
167
 
168
+ <!-- source-of-truth: docs/discord-bot-setup.md -->
109
169
  ## Quick start
110
170
 
111
171
  ### Discord setup (private server + bot)
@@ -125,6 +185,35 @@ When using the Claude runtime, you can connect external tool servers via MCP. Pl
125
185
 
126
186
  Full step-by-step guide: [docs/discord-bot-setup.md](docs/discord-bot-setup.md)
127
187
 
188
+ ## Documentation
189
+
190
+ ### Getting Started
191
+
192
+ - [Discord bot setup](docs/discord-bot-setup.md) — create a bot, invite it, configure permissions
193
+ - [MCP (Model Context Protocol)](docs/mcp.md) — connect external tool servers
194
+
195
+ ### Features & Usage
196
+
197
+ - [Memory system](docs/memory.md) — five-layer memory architecture, tuning, and troubleshooting
198
+ - [Plan & Forge](docs/plan-and-forge.md) — autonomous planning and code generation
199
+ - [Discord actions](docs/discord-actions.md) — channels, messaging, moderation, tasks, crons
200
+ - [Cron / automations](docs/cron.md) — recurring task setup, advanced options, debugging
201
+ - [Tasks](docs/tasks.md) — task lifecycle, bidirectional sync, tag maps
202
+ - [Voice](docs/voice.md) — real-time voice chat setup (STT/TTS)
203
+ - [Shareable recipes](docs/discoclaw-recipe-spec.md) — integration recipe format spec
204
+
205
+ ### Development
206
+
207
+ - [Philosophy](docs/philosophy.md) — design principles and trade-offs
208
+ - [Releasing](docs/releasing.md) — npm publish workflow and versioning
209
+ - [Inventory](docs/INVENTORY.md) — full component inventory and MVP status
210
+
211
+ ### Operations
212
+
213
+ - [Configuration reference](docs/configuration.md) — all environment variables indexed by category
214
+ - [Webhook exposure](docs/webhook-exposure.md) — tunnel/proxy setup and webhook security
215
+ - [Data migration](docs/data-migration.md) — migrating task data between formats
216
+
128
217
  ### Install and run
129
218
 
130
219
  1. **Install globally:**
@@ -69,6 +69,23 @@ export function buildEnvContent(vals, now = new Date()) {
69
69
  lines.push('# DEFAULTS');
70
70
  lines.push(`DISCOCLAW_DISCORD_ACTIONS=${vals.DISCOCLAW_DISCORD_ACTIONS ?? '1'}`);
71
71
  lines.push('');
72
+ // Voice
73
+ const voiceKeys = [
74
+ 'DISCOCLAW_VOICE_ENABLED',
75
+ 'DEEPGRAM_API_KEY',
76
+ 'DISCOCLAW_DISCORD_ACTIONS_VOICE',
77
+ 'DISCOCLAW_STT_PROVIDER',
78
+ 'DISCOCLAW_TTS_PROVIDER',
79
+ ];
80
+ const hasVoice = voiceKeys.some((k) => vals[k]);
81
+ if (hasVoice) {
82
+ lines.push('# VOICE');
83
+ for (const k of voiceKeys) {
84
+ if (vals[k])
85
+ lines.push(`${k}=${vals[k]}`);
86
+ }
87
+ lines.push('');
88
+ }
72
89
  const autoDetectedKeys = ['DISCOCLAW_TASKS_FORUM', 'DISCOCLAW_CRON_FORUM'];
73
90
  const hasAutoDetected = autoDetectedKeys.some((k) => vals[k]);
74
91
  if (hasAutoDetected) {
@@ -298,6 +315,16 @@ export async function runInitWizard() {
298
315
  values.OPENROUTER_MODEL = 'anthropic/claude-sonnet-4';
299
316
  }
300
317
  values.DISCOCLAW_DISCORD_ACTIONS = '1';
318
+ // ── Voice setup ───────────────────────────────────────────────────────────
319
+ const enableVoice = await ask('\nEnable voice chat? (requires a Deepgram API key — you can skip this and enable later) [y/N] ');
320
+ if (enableVoice.toLowerCase() === 'y') {
321
+ const deepgramKey = await askValidated('Deepgram API key: ', (val) => (val ? null : 'Deepgram API key is required'));
322
+ values.DISCOCLAW_VOICE_ENABLED = '1';
323
+ values.DEEPGRAM_API_KEY = deepgramKey;
324
+ values.DISCOCLAW_DISCORD_ACTIONS_VOICE = '1';
325
+ values.DISCOCLAW_STT_PROVIDER = 'deepgram';
326
+ values.DISCOCLAW_TTS_PROVIDER = 'deepgram';
327
+ }
301
328
  // ── Write .env ────────────────────────────────────────────────────────────
302
329
  const envContent = buildEnvContent(values);
303
330
  const tmpPath = path.join(cwd, '.env.tmp');
@@ -329,6 +356,7 @@ export async function runInitWizard() {
329
356
  console.log('Configuration complete!\n');
330
357
  console.log('Next steps:');
331
358
  console.log(' Note: The bot will auto-create its forum channels on first connect.');
359
+ console.log(' Tip: To add API keys or secrets later, DM the bot: !secret set KEY=value');
332
360
  if (values.PRIMARY_RUNTIME === 'claude') {
333
361
  console.log(` ${daemonHint}`);
334
362
  }