aiden-runtime 4.0.2 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/README.md +11 -7
  2. package/config/hardware.json +2 -2
  3. package/dist/api/server.js +50 -52
  4. package/dist/cli/v4/aidenCLI.js +421 -5
  5. package/dist/cli/v4/aidenPrompt.js +317 -0
  6. package/dist/cli/v4/box.js +105 -39
  7. package/dist/cli/v4/callbacks.js +39 -6
  8. package/dist/cli/v4/chatSession.js +256 -55
  9. package/dist/cli/v4/citationFooter.js +97 -0
  10. package/dist/cli/v4/commands/channel.js +656 -0
  11. package/dist/cli/v4/commands/clear.js +1 -1
  12. package/dist/cli/v4/commands/compress.js +1 -1
  13. package/dist/cli/v4/commands/cron.js +44 -16
  14. package/dist/cli/v4/commands/fanout.js +236 -0
  15. package/dist/cli/v4/commands/help.js +15 -4
  16. package/dist/cli/v4/commands/history.js +84 -0
  17. package/dist/cli/v4/commands/index.js +16 -1
  18. package/dist/cli/v4/commands/mcp.js +358 -0
  19. package/dist/cli/v4/commands/show.js +43 -0
  20. package/dist/cli/v4/commands/skills.js +169 -4
  21. package/dist/cli/v4/commands/status.js +84 -0
  22. package/dist/cli/v4/commands/subagent.js +78 -0
  23. package/dist/cli/v4/commands/verbose.js +1 -1
  24. package/dist/cli/v4/commands/voice.js +218 -0
  25. package/dist/cli/v4/cronCli.js +103 -0
  26. package/dist/cli/v4/display.js +297 -13
  27. package/dist/cli/v4/doctor.js +41 -0
  28. package/dist/cli/v4/envSources.js +105 -0
  29. package/dist/cli/v4/ghostMatch.js +74 -0
  30. package/dist/cli/v4/historyStore.js +163 -0
  31. package/dist/cli/v4/pasteCompression.js +124 -0
  32. package/dist/cli/v4/pasteIntercept.js +203 -0
  33. package/dist/cli/v4/replyRenderer.js +209 -0
  34. package/dist/cli/v4/resizeGuard.js +92 -0
  35. package/dist/cli/v4/shellInterpolation.js +139 -0
  36. package/dist/cli/v4/skinEngine.js +21 -1
  37. package/dist/cli/v4/streamingPrefix.js +121 -0
  38. package/dist/cli/v4/syntaxHighlight.js +345 -0
  39. package/dist/cli/v4/table.js +216 -0
  40. package/dist/cli/v4/themeDetect.js +81 -0
  41. package/dist/cli/v4/uiBuild.js +74 -0
  42. package/dist/cli/v4/voiceCli.js +113 -0
  43. package/dist/cli/v4/voicePromptApi.js +196 -0
  44. package/dist/core/channels/discord.js +16 -10
  45. package/dist/core/channels/email.js +13 -9
  46. package/dist/core/channels/imessage.js +13 -9
  47. package/dist/core/channels/manager.js +25 -7
  48. package/dist/core/channels/pdf-extract.js +180 -0
  49. package/dist/core/channels/photo-vision.js +157 -0
  50. package/dist/core/channels/signal.js +11 -7
  51. package/dist/core/channels/slack.js +13 -10
  52. package/dist/core/channels/telegram-commands.js +154 -0
  53. package/dist/core/channels/telegram-groups.js +198 -0
  54. package/dist/core/channels/telegram-rate-limit.js +124 -0
  55. package/dist/core/channels/telegram.js +1980 -0
  56. package/dist/core/channels/twilio.js +11 -7
  57. package/dist/core/channels/webhook.js +9 -5
  58. package/dist/core/channels/whatsapp.js +15 -11
  59. package/dist/core/channels/whisper-transcribe.js +163 -0
  60. package/dist/core/cronManager.js +33 -294
  61. package/dist/core/gateway.js +29 -8
  62. package/dist/core/playwrightBridge.js +90 -0
  63. package/dist/core/v4/aidenAgent.js +35 -0
  64. package/dist/core/v4/auxiliaryClient.js +2 -2
  65. package/dist/core/v4/cron/atomicWrite.js +18 -4
  66. package/dist/core/v4/cron/cronExecute.js +300 -0
  67. package/dist/core/v4/cron/cronManager.js +502 -0
  68. package/dist/core/v4/cron/cronState.js +314 -0
  69. package/dist/core/v4/cron/cronTick.js +90 -0
  70. package/dist/core/v4/cron/diagnostics.js +104 -0
  71. package/dist/core/v4/cron/graceWindow.js +79 -0
  72. package/dist/core/v4/logger/factory.js +110 -0
  73. package/dist/core/v4/logger/index.js +22 -0
  74. package/dist/core/v4/logger/logger.js +101 -0
  75. package/dist/core/v4/logger/sinks/fileSink.js +110 -0
  76. package/dist/core/v4/logger/sinks/multiSink.js +43 -0
  77. package/dist/core/v4/logger/sinks/nullSink.js +53 -0
  78. package/dist/core/v4/logger/sinks/stdSink.js +81 -0
  79. package/dist/core/v4/mcp/server/diagnostics.js +40 -0
  80. package/dist/core/v4/mcp/server/skillBridge.js +94 -0
  81. package/dist/core/v4/mcp/server/stdioServer.js +119 -0
  82. package/dist/core/v4/mcp/server/toolBridge.js +168 -0
  83. package/dist/core/v4/platformPaths.js +105 -0
  84. package/dist/core/v4/providerFallback.js +25 -0
  85. package/dist/core/v4/skillLoader.js +21 -5
  86. package/dist/core/v4/skillMining/candidateStore.js +164 -0
  87. package/dist/core/v4/skillMining/extractorPrompt.js +111 -0
  88. package/dist/core/v4/skillMining/proposalBuilder.js +139 -0
  89. package/dist/core/v4/skillMining/skillMiner.js +191 -0
  90. package/dist/core/v4/skillMining/traceFingerprint.js +51 -0
  91. package/dist/core/v4/subagent/budget.js +76 -0
  92. package/dist/core/v4/subagent/diagnostics.js +22 -0
  93. package/dist/core/v4/subagent/fanout.js +216 -0
  94. package/dist/core/v4/subagent/merger.js +148 -0
  95. package/dist/core/v4/subagent/providerRotation.js +54 -0
  96. package/dist/core/v4/voice/audioStream.js +373 -0
  97. package/dist/core/v4/voice/cliVoice.js +393 -0
  98. package/dist/core/v4/voice/diagnostics.js +66 -0
  99. package/dist/core/v4/voice/ttsStream.js +193 -0
  100. package/dist/core/version.js +1 -1
  101. package/dist/core/visionAnalyze.js +291 -90
  102. package/dist/core/voice/audio.js +61 -5
  103. package/dist/core/voice/audioBackend.js +134 -0
  104. package/dist/core/voice/stt.js +61 -6
  105. package/dist/core/voice/tts.js +19 -3
  106. package/dist/tools/v4/index.js +32 -1
  107. package/dist/tools/v4/subagent/subagentFanout.js +166 -0
  108. package/package.json +11 -2
@@ -0,0 +1,393 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) 2026 Shiva Deore (Taracod).
4
+ * Licensed under AGPL-3.0. See LICENSE for details.
5
+ *
6
+ * Aiden — local-first agent.
7
+ */
8
+ /**
9
+ * core/v4/voice/cliVoice.ts — Phase v4.1-voice-cli
10
+ *
11
+ * Push-to-talk and continuous-mode state machines for the CLI.
12
+ * Wraps `audioStream.startAudioStream()` with:
13
+ *
14
+ * - RMS-based VAD with the tuned knobs from prior multi-agent
15
+ * systems' hard-learned experience:
16
+ * * SILENCE_RMS_THRESHOLD = 200
17
+ * * SILENCE_DURATION_SECONDS = 3.0
18
+ * * 0.3s sustained speech confirmation (mic click filter)
19
+ * * 0.3s dip tolerance (natural micro-pauses don't reset
20
+ * the speech tracker)
21
+ * * Peak RMS check on stop — rejects "no speech ever"
22
+ * recordings where mean RMS is dragged down by silence
23
+ * * 15s max_wait when no speech detected at all
24
+ *
25
+ * - Hallucination filter (delegated to
26
+ * `core/channels/whisper-transcribe.ts` — already battle-
27
+ * tested in v4.1-3 for Telegram voice messages).
28
+ *
29
+ * - Continuous mode: 3-consecutive-silent-cycle stop.
30
+ * - `_ttsPlaying` flag prevents the live mic from capturing
31
+ * the agent's spoken reply (would feedback-loop in ~3s).
32
+ * - 0.3s post-TTS sleep before VAD re-arm.
33
+ *
34
+ * - Status callback: `idle | listening | recording | transcribing
35
+ * | speaking`. UI subscribes for live indicator updates.
36
+ *
37
+ * - Pure orchestrator — no TTY, no display, no persistence.
38
+ * Tests inject `audioFactory` + `transcribeFn` to verify state
39
+ * transitions without an actual mic.
40
+ */
41
// TypeScript-emitted CommonJS interop helpers. Each one reuses a copy already
// present on `this` when there is one, otherwise defines it locally.
//
// __createBinding(o, m, k, k2): expose property `k` of module `m` on `o` under
// the name `k2` (defaults to `k`). When Object.create exists it installs a
// property descriptor — a live getter unless `m` already provides a usable
// descriptor; the fallback branch is a plain value copy.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
42
+ if (k2 === undefined) k2 = k;
43
+ var desc = Object.getOwnPropertyDescriptor(m, k);
44
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
45
+ desc = { enumerable: true, get: function() { return m[k]; } };
46
+ }
47
+ Object.defineProperty(o, k2, desc);
48
+ }) : (function(o, m, k, k2) {
49
+ if (k2 === undefined) k2 = k;
50
+ o[k2] = m[k];
51
+ }));
52
// __setModuleDefault(o, v): set `o.default` to `v` (enumerable when defined
// through Object.defineProperty).
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
53
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
54
+ }) : function(o, v) {
55
+ o["default"] = v;
56
+ });
57
// __importStar(mod): namespace-import shim. A module flagged `__esModule` is
// returned unchanged; otherwise every own key except "default" is bound onto
// a fresh object and `default` is set to the module itself.
+ var __importStar = (this && this.__importStar) || (function () {
58
+ var ownKeys = function(o) {
59
// Self-replacing function: the first call picks the implementation
// (Object.getOwnPropertyNames or a for-in fallback) and rebinds `ownKeys`.
+ ownKeys = Object.getOwnPropertyNames || function (o) {
60
+ var ar = [];
61
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
62
+ return ar;
63
+ };
64
+ return ownKeys(o);
65
+ };
66
+ return function (mod) {
67
+ if (mod && mod.__esModule) return mod;
68
+ var result = {};
69
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
70
+ __setModuleDefault(result, mod);
71
+ return result;
72
+ };
73
+ })();
74
// __importDefault(mod): default-import shim. A module flagged `__esModule` is
// returned unchanged; anything else is wrapped as `{ default: mod }`.
+ var __importDefault = (this && this.__importDefault) || function (mod) {
75
+ return (mod && mod.__esModule) ? mod : { "default": mod };
76
+ };
77
+ Object.defineProperty(exports, "__esModule", { value: true });
78
+ exports.HALLUCINATION_PATTERNS = exports.CONTINUOUS_NO_SPEECH_LIMIT = exports.POST_TTS_REARM_DELAY_MS = exports.MAX_WAIT_NO_SPEECH_SECONDS = exports.PEAK_RMS_REJECT_THRESHOLD = exports.DIP_TOLERANCE_SECONDS = exports.MIN_SPEECH_DURATION_SECONDS = exports.SILENCE_DURATION_SECONDS = exports.SILENCE_RMS_THRESHOLD = void 0;
79
+ exports.isHallucination = isHallucination;
80
+ exports.createCliVoice = createCliVoice;
81
+ exports.makeContinuousLoop = makeContinuousLoop;
82
+ exports.pcmToWav = pcmToWav;
83
+ const node_fs_1 = require("node:fs");
84
+ const node_path_1 = __importDefault(require("node:path"));
85
+ const node_os_1 = __importDefault(require("node:os"));
86
+ const audioStream_1 = require("./audioStream");
87
+ const whisper_transcribe_1 = require("../../channels/whisper-transcribe");
88
+ const factory_1 = require("../logger/factory");
89
+ // ── VAD constants (battle-tested defaults) ──────────────────────────────
90
+ exports.SILENCE_RMS_THRESHOLD = 200;
91
+ exports.SILENCE_DURATION_SECONDS = 3.0;
92
+ exports.MIN_SPEECH_DURATION_SECONDS = 0.3; // sustained-above-threshold filter
93
+ exports.DIP_TOLERANCE_SECONDS = 0.3; // natural micro-pause
94
+ exports.PEAK_RMS_REJECT_THRESHOLD = 400; // 2x silence threshold
95
+ exports.MAX_WAIT_NO_SPEECH_SECONDS = 15.0; // bail if user never speaks
96
+ exports.POST_TTS_REARM_DELAY_MS = 300;
97
+ exports.CONTINUOUS_NO_SPEECH_LIMIT = 3;
98
+ // ── Hallucination filter ──────────────────────────────────────────────────
99
+ /** Whisper emits these on near-silent audio. Reused from v4.1-3
100
+ * Telegram voice — same patterns apply to CLI mic. */
101
+ exports.HALLUCINATION_PATTERNS = [
102
+ /^thank you[.!]?$/i,
103
+ /^thanks for watching[.!]?$/i,
104
+ /^subscribe[.!]?$/i,
105
+ /^subtitles by .+$/i,
106
+ /amara\.org/i,
107
+ /^you$/i,
108
+ /^bye[.!]?$/i,
109
+ ];
110
/**
 * Decide whether a transcript should be discarded as silence noise.
 *
 * A transcript is rejected when, after trimming, it is shorter than three
 * characters (which includes the empty string) or when it matches any entry
 * of `exports.HALLUCINATION_PATTERNS`.
 *
 * @param {string} text - Raw transcript text.
 * @returns {boolean} `true` when the transcript should be dropped.
 */
function isHallucination(text) {
    const candidate = text.trim();
    // Covers the empty transcript as well as one- and two-character ones.
    if (candidate.length < 3) {
        return true;
    }
    return exports.HALLUCINATION_PATTERNS.some((pattern) => pattern.test(candidate));
}
122
/**
 * Create the push-to-talk voice controller used by the CLI.
 *
 * The controller owns one audio stream at a time, runs an RMS-based voice
 * activity detector over its `frame` events, and on stop hands the captured
 * PCM to the transcription function. Status moves between `idle`,
 * `listening`, `recording`, `transcribing` and `speaking`.
 *
 * Failure handling: every failure path returns the controller to `idle` and
 * settles the pending `startRecording()` promise, so one failed turn cannot
 * block the next one.
 *
 * @param {object} [options]
 * @param {object} [options.logger] - Logger exposing `child()`, `info()` and
 *   `warn()`; defaults to the no-op logger.
 * @param {object} [options.callbacks] - Optional hooks: `onStatus(status)`,
 *   `onRms(rms)`, `onError(message)`, `onTranscript(text, avgLogprob|null)`.
 * @param {() => number} [options.now] - Millisecond clock; defaults to
 *   `Date.now`.
 * @param {Function} [options.audioFactory] - Async factory receiving
 *   `{ logger }` and resolving to a stream handle (or a falsy value when no
 *   microphone is available). Defaults to `startAudioStream`.
 * @param {Function} [options.transcribeFn] - Async function receiving
 *   `{ filePath, logger }`; defaults to `transcribeForChannel`.
 * @returns {object} Controller with `startRecording`, `stopRecording`,
 *   `cancel`, `markTtsPlaying`, `markTtsDone`, `getStatus`, `getPeakRms`.
 */
function createCliVoice(options = {}) {
    const logger = (options.logger ?? (0, factory_1.noopLogger)()).child('cli-voice');
    const callbacks = options.callbacks ?? {};
    const now = options.now ?? Date.now;
    let status = 'idle';
    let stream = null;
    let peakRms = 0;
    // NOTE(review): this flag is only ever written. What actually keeps
    // startRecording() out while the agent talks is the 'speaking' status.
    let ttsPlaying = false;
    let vad = null;
    let recordingResolve = null;
    let stopRequested = false;
    // Shared promise for an in-flight finishRecording() so overlapping stop
    // requests (manual stop racing the VAD auto-stop) stop the stream once.
    let finishInFlight = null;
    // Bumped whenever a pending microphone open must be abandoned.
    let session = 0;
    const transitionStatus = (next) => {
        if (status === next)
            return;
        status = next;
        try {
            callbacks.onStatus?.(next);
        }
        catch (e) {
            logger.warn('onStatus callback threw', { error: e.message });
        }
    };
    const fireRms = (rms) => {
        try {
            callbacks.onRms?.(rms);
        }
        catch { /* ignore */ }
    };
    /** Release whoever is awaiting startRecording(), at most once. */
    const settleRecording = () => {
        recordingResolve?.();
        recordingResolve = null;
    };
    const tickVad = (rms) => {
        if (!vad)
            return { stop: false, cancelNoSpeech: false };
        const t = now();
        const above = rms > exports.SILENCE_RMS_THRESHOLD;
        if (above) {
            if (vad.speechSinceMs === null)
                vad.speechSinceMs = t;
            vad.lastAboveMs = t;
            vad.silenceSinceMs = null;
            // Confirm speech once we've been above threshold for the
            // sustained duration — this filters mic clicks.
            if (!vad.speechConfirmed
                && t - vad.speechSinceMs >= exports.MIN_SPEECH_DURATION_SECONDS * 1000) {
                vad.speechConfirmed = true;
                transitionStatus('recording');
            }
        }
        else {
            // Below threshold. Two cases:
            //   (1) Pre-speech: count toward the no-speech max-wait timer.
            //   (2) Post-speech: count toward silence-stop timer, with a
            //       dip tolerance so micro-pauses don't trip it.
            if (!vad.speechConfirmed) {
                if (t - vad.startMs >= exports.MAX_WAIT_NO_SPEECH_SECONDS * 1000) {
                    return { stop: false, cancelNoSpeech: true };
                }
            }
            else {
                if (vad.lastAboveMs !== null
                    && t - vad.lastAboveMs > exports.DIP_TOLERANCE_SECONDS * 1000) {
                    if (vad.silenceSinceMs === null)
                        vad.silenceSinceMs = t;
                    if (t - vad.silenceSinceMs >= exports.SILENCE_DURATION_SECONDS * 1000) {
                        return { stop: true, cancelNoSpeech: false };
                    }
                }
            }
        }
        return { stop: false, cancelNoSpeech: false };
    };
    /** Stop the stream, gate on peak RMS, transcribe, report the outcome. */
    const runFinish = async () => {
        if (!stream || stream.closed) {
            transitionStatus('idle');
            return;
        }
        transitionStatus('transcribing');
        let pcm;
        try {
            pcm = await stream.stop();
        }
        catch (err) {
            logger.warn('stream stop failed', { error: err.message });
            transitionStatus('idle');
            stream = null;
            return;
        }
        stream = null;
        // Peak-RMS gate — reject "no speech ever" recordings.
        peakRms = (0, audioStream_1.computePeakRms)(pcm);
        if (peakRms < exports.PEAK_RMS_REJECT_THRESHOLD) {
            logger.info('recording rejected: peak RMS below threshold', {
                peakRms,
                threshold: exports.PEAK_RMS_REJECT_THRESHOLD,
            });
            callbacks.onError?.('No speech detected');
            transitionStatus('idle');
            return;
        }
        let wavPath = null;
        try {
            // Persist PCM as a WAV for the transcribe pipeline.
            wavPath = await persistPcmAsWav(pcm);
            const transcribe = options.transcribeFn ?? whisper_transcribe_1.transcribeForChannel;
            const result = await transcribe({
                filePath: wavPath,
                logger: logger,
            });
            if (!result.success || !result.text) {
                callbacks.onError?.(result.error ?? 'Transcription returned no text');
                transitionStatus('idle');
                return;
            }
            if (isHallucination(result.text)) {
                logger.info('transcript dropped: matches hallucination pattern', {
                    text: result.text,
                });
                callbacks.onError?.('Transcript looked like silence noise — ignored');
                transitionStatus('idle');
                return;
            }
            callbacks.onTranscript?.(result.text, result.avgLogprob ?? null);
            transitionStatus('idle');
        }
        catch (err) {
            // A failed write, a throwing transcriber or a throwing callback
            // must not strand the machine in 'transcribing' — that would make
            // every later startRecording() bail with "not idle".
            transitionStatus('idle');
            throw err;
        }
        finally {
            if (wavPath !== null) {
                try {
                    await node_fs_1.promises.unlink(wavPath);
                }
                catch { /* ignore */ }
            }
        }
    };
    const finishRecording = () => {
        if (finishInFlight === null) {
            finishInFlight = runFinish().finally(() => { finishInFlight = null; });
        }
        return finishInFlight;
    };
    return {
        async startRecording() {
            if (status !== 'idle') {
                logger.warn('startRecording: not idle', { status });
                return;
            }
            stopRequested = false;
            vad = {
                speechConfirmed: false,
                speechSinceMs: null,
                lastAboveMs: null,
                silenceSinceMs: null,
                startMs: now(),
            };
            peakRms = 0;
            session += 1;
            const mySession = session;
            transitionStatus('listening');
            const factory = options.audioFactory ?? (async (o) => {
                const { startAudioStream } = await Promise.resolve().then(() => __importStar(require('./audioStream')));
                return startAudioStream(o);
            });
            let opened;
            try {
                opened = await factory({ logger });
            }
            catch (err) {
                // Still reject as before, but do not stay stuck in 'listening'.
                if (session === mySession)
                    transitionStatus('idle');
                throw err;
            }
            if (session !== mySession) {
                // cancel()/stopRecording() ran while the microphone was still
                // opening: release the fresh stream instead of leaking it.
                opened?.cancel();
                return;
            }
            stream = opened;
            if (!stream) {
                callbacks.onError?.('Microphone not available');
                transitionStatus('idle');
                return;
            }
            stream.events.on('frame', ({ rms }) => {
                if (!stream || stream.closed)
                    return;
                if (rms > peakRms)
                    peakRms = rms;
                fireRms(rms);
                const decision = tickVad(rms);
                if (decision.cancelNoSpeech) {
                    logger.info('vad: max wait elapsed without speech');
                    stream?.cancel();
                    stream = null;
                    callbacks.onError?.('No speech detected within window');
                    transitionStatus('idle');
                    settleRecording();
                    return;
                }
                if (decision.stop && !stopRequested) {
                    stopRequested = true;
                    // finishRecording is async; its outcome is handled here so
                    // the frame handler itself never throws.
                    finishRecording()
                        .catch((err) => logger.warn('finishRecording failed', {
                        error: err.message,
                    }))
                        .finally(settleRecording);
                }
            });
            // Block until the VAD, stopRecording() or cancel() settles the turn.
            await new Promise((resolve) => { recordingResolve = resolve; });
        },
        async stopRecording() {
            if (status === 'idle')
                return;
            stopRequested = true;
            // No stream yet means the microphone is still opening — make the
            // pending startRecording() drop it once it arrives.
            if (stream === null)
                session += 1;
            try {
                await finishRecording();
            }
            finally {
                // Always release the startRecording() awaiter, even when
                // finishRecording() rejects.
                settleRecording();
            }
        },
        cancel() {
            session += 1;
            if (stream) {
                stream.cancel();
                stream = null;
            }
            transitionStatus('idle');
            settleRecording();
        },
        markTtsPlaying() {
            ttsPlaying = true;
            transitionStatus('speaking');
        },
        async markTtsDone() {
            transitionStatus('idle');
            // Sleep briefly so the speaker tail doesn't bleed into the
            // next mic re-arm — without this, continuous mode feedback-
            // loops within ~3 seconds when the live mic captures the
            // agent's own spoken reply.
            await new Promise((r) => setTimeout(r, exports.POST_TTS_REARM_DELAY_MS));
            ttsPlaying = false;
        },
        getStatus() { return status; },
        getPeakRms() { return peakRms; },
    };
}
348
/**
 * Build the bookkeeping object for continuous (hands-free) mode.
 *
 * The loop keeps going while it is active and fewer than
 * `exports.CONTINUOUS_NO_SPEECH_LIMIT` consecutive cycles have ended without
 * a transcript. Any cycle that yields a transcript resets the counter.
 *
 * @returns {object} `{ state, recordCycleResult, shouldContinue, stop }`
 *   where `state` is the live `{ silentCycles, active }` record.
 */
function makeContinuousLoop() {
    const state = { silentCycles: 0, active: true };
    const recordCycleResult = (gotTranscript) => {
        if (gotTranscript) {
            state.silentCycles = 0;
        }
        else {
            state.silentCycles += 1;
        }
    };
    const shouldContinue = () => {
        if (!state.active) {
            return false;
        }
        return state.silentCycles < exports.CONTINUOUS_NO_SPEECH_LIMIT;
    };
    const stop = () => {
        state.active = false;
    };
    return { state, recordCycleResult, shouldContinue, stop };
}
363
+ // ── Internals ────────────────────────────────────────────────────────────
364
/**
 * Persist PCM data as a temporary WAV file (16 kHz / mono / 16-bit RIFF
 * header) for the transcription pipeline.
 *
 * The file name carries the timestamp, the process id and a random suffix,
 * so recordings finished in the same millisecond or by two processes sharing
 * a temp directory get different paths. The file is created exclusively and
 * owner-only: an existing path (including a pre-planted symlink) makes the
 * write fail instead of being followed, and other local users cannot read
 * the recording.
 *
 * @param {Buffer} pcm - Captured PCM data.
 * @returns {Promise<string>} Absolute path of the written file; the caller
 *   is responsible for deleting it.
 */
async function persistPcmAsWav(pcm) {
    const unique = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2, 10)}`;
    const tmp = node_path_1.default.join(node_os_1.default.tmpdir(), `aiden-voice-${unique}.wav`);
    const wav = pcmToWav(pcm, 16000, 1, 16);
    // 'wx' = create exclusively (fail if the path exists); 0o600 = owner only.
    await node_fs_1.promises.writeFile(tmp, wav, { flag: 'wx', mode: 0o600 });
    return tmp;
}
373
/**
 * Wrap raw PCM data in a 44-byte RIFF/WAVE header (format tag 1 = PCM).
 *
 * Accepts a `Buffer`, any typed array or `DataView` (for example an
 * `Int16Array` of samples), or anything `Buffer.from` understands. Sizes are
 * always computed in bytes, so a typed array with multi-byte elements gets a
 * correct `data` chunk length. Typed arrays are viewed in place, i.e. in the
 * host byte order.
 *
 * @param {Buffer|ArrayBufferView|ArrayBuffer|number[]} pcm - Sample data.
 * @param {number} sampleRate - Samples per second, e.g. 16000.
 * @param {number} channels - Channel count.
 * @param {number} bitsPerSample - Bits per sample, e.g. 16.
 * @returns {Buffer} Header followed by the sample bytes.
 */
function pcmToWav(pcm, sampleRate, channels, bitsPerSample) {
    let data;
    if (Buffer.isBuffer(pcm)) {
        data = pcm;
    }
    else if (ArrayBuffer.isView(pcm)) {
        // View the same memory as bytes; `pcm.length` would count elements.
        data = Buffer.from(pcm.buffer, pcm.byteOffset, pcm.byteLength);
    }
    else {
        data = Buffer.from(pcm);
    }
    const bytesPerSample = bitsPerSample / 8;
    const byteRate = sampleRate * channels * bytesPerSample;
    const blockAlign = channels * bytesPerSample;
    const dataSize = data.length;
    // RIFF chunk size = everything after the first 8 bytes of the file.
    const fileSize = 36 + dataSize;
    const header = Buffer.alloc(44);
    header.write('RIFF', 0);
    header.writeUInt32LE(fileSize, 4);
    header.write('WAVE', 8);
    header.write('fmt ', 12);
    header.writeUInt32LE(16, 16); // fmt chunk size
    header.writeUInt16LE(1, 20); // PCM
    header.writeUInt16LE(channels, 22);
    header.writeUInt32LE(sampleRate, 24);
    header.writeUInt32LE(byteRate, 28);
    header.writeUInt16LE(blockAlign, 32);
    header.writeUInt16LE(bitsPerSample, 34);
    header.write('data', 36);
    header.writeUInt32LE(dataSize, 40);
    return Buffer.concat([header, data]);
}
@@ -0,0 +1,66 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) 2026 Shiva Deore (Taracod).
4
+ * Licensed under AGPL-3.0. See LICENSE for details.
5
+ *
6
+ * Aiden — local-first agent.
7
+ */
8
+ /**
9
+ * core/v4/voice/diagnostics.ts — Phase v4.1-voice-cli
10
+ *
11
+ * Build fingerprint + provider/backend snapshot surfaced by
12
+ * `aiden voice doctor` and `/voice status`. Bump on every shipped
13
+ * phase. Format: `v4.1-voice-cli[+suffix]`.
14
+ */
15
+ Object.defineProperty(exports, "__esModule", { value: true });
16
+ exports.AIDEN_VOICE_CLI_BUILD = void 0;
17
+ exports.readVoiceConfig = readVoiceConfig;
18
+ exports.collectVoiceDiagnostics = collectVoiceDiagnostics;
19
+ const audioStream_1 = require("./audioStream");
20
+ const tts_1 = require("../../voice/tts");
21
+ const factory_1 = require("../logger/factory");
22
+ /** Build fingerprint — bump per phase. Surfaced in `aiden voice
23
+ * doctor` and the `/voice status` slash command. */
24
+ exports.AIDEN_VOICE_CLI_BUILD = 'v4.1-voice-cli';
25
/** Values used for every voice setting the environment does not override. */
const DEFAULT_VOICE_CONFIG = {
    ttsVoice: 'en-US-AriaNeural',
    mode: 'push-to-talk',
    beepsEnabled: false,
};
/**
 * Derive the voice configuration from an environment bag.
 *
 * - `AIDEN_VOICE_TTS_VOICE`: any non-empty string replaces the default voice.
 * - `AIDEN_VOICE_MODE`: only the exact value `continuous` changes the mode.
 * - `AIDEN_VOICE_BEEPS`: only `1` or `true` enables beeps.
 *
 * @param {object} [env=process.env] - Environment variables to read.
 * @returns {{ttsVoice: string, mode: string, beepsEnabled: boolean}} A fresh
 *   config object; the defaults are never mutated.
 */
function readVoiceConfig(env = process.env) {
    const {
        AIDEN_VOICE_TTS_VOICE: voiceName,
        AIDEN_VOICE_MODE: modeName,
        AIDEN_VOICE_BEEPS: beepsFlag,
    } = env;
    const hasVoiceOverride = typeof voiceName === 'string' && voiceName.length > 0;
    const wantsBeeps = beepsFlag === '1' || beepsFlag === 'true';
    return {
        ttsVoice: hasVoiceOverride ? voiceName : DEFAULT_VOICE_CONFIG.ttsVoice,
        mode: modeName === 'continuous' ? 'continuous' : DEFAULT_VOICE_CONFIG.mode,
        beepsEnabled: wantsBeeps ? true : DEFAULT_VOICE_CONFIG.beepsEnabled,
    };
}
45
/**
 * Assemble the voice diagnostics snapshot: build fingerprint, TTY state,
 * audio backend details, TTS providers and the env-derived voice config.
 *
 * @param {object} [logger] - Logger handed to the audio diagnostics probe;
 *   defaults to the no-op logger.
 * @returns {Promise<object>} `{ build, isTty, enabled, audio, ttsProviders,
 *   config }`, where `audio` is `{ backend, active, soxOnPath }`.
 */
async function collectVoiceDiagnostics(logger = (0, factory_1.noopLogger)()) {
    const stdinIsTty = Boolean(process.stdin.isTTY);
    const isTty = stdinIsTty && Boolean(process.stdout.isTTY);
    const { resolved, active, soxOnPath } = await (0, audioStream_1.getAudioDiagnostics)(logger);
    const ttsProviders = (0, tts_1.getTtsProviders)();
    const config = readVoiceConfig();
    return {
        build: exports.AIDEN_VOICE_CLI_BUILD,
        isTty,
        // Voice mode is refused without a TTY: the MCP stdio server must never
        // enter raw mode, which would corrupt its JSON-RPC frames.
        enabled: isTty,
        audio: {
            backend: resolved ?? 'unavailable',
            active,
            soxOnPath,
        },
        ttsProviders,
        config,
    };
}
@@ -0,0 +1,193 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) 2026 Shiva Deore (Taracod).
4
+ * Licensed under AGPL-3.0. See LICENSE for details.
5
+ *
6
+ * Aiden — local-first agent.
7
+ */
8
+ /**
9
+ * core/v4/voice/ttsStream.ts — Phase v4.1-voice-cli
10
+ *
11
+ * Sentence-buffer streaming wrapper around `core/voice/tts.ts`.
12
+ * The standard `synthesize()` flow buffers the WHOLE assistant
13
+ * reply, synthesises one MP3, then plays it — for replies > 3
14
+ * seconds the user perceives a long silent pause before any audio.
15
+ *
16
+ * This module accumulates streamed text deltas, splits at sentence
17
+ * boundaries, and synth+plays each sentence chunk as it arrives.
18
+ * Net effect: ~60% reduction in time-to-first-word for long
19
+ * replies. Mirrors a battle-tested pattern from prior multi-agent
20
+ * systems.
21
+ *
22
+ * Cancellation: the consumer holds an `AbortSignal`; the streamer
23
+ * checks between every chunk. Aborting STOPS new synth calls and
24
+ * cancels any in-flight playback (best effort — system audio
25
+ * subsystems differ in interrupt support).
26
+ *
27
+ * `<think>...</think>` strip mid-stream — extends `cleanForTTS`
28
+ * for streaming mode. Some models emit reasoning blocks before
29
+ * their final answer; speaking the reasoning is wasteful and
30
+ * confusing. We strip mid-stream rather than post-buffering so
31
+ * sentence emission isn't blocked waiting for the closing tag.
32
+ */
33
+ Object.defineProperty(exports, "__esModule", { value: true });
34
+ exports.SENTENCE_BOUNDARY_RE = void 0;
35
+ exports.stripThinkChunk = stripThinkChunk;
36
+ exports.splitSentences = splitSentences;
37
+ exports.startTtsStream = startTtsStream;
38
+ const tts_1 = require("../../voice/tts");
39
+ const factory_1 = require("../logger/factory");
40
+ // ── Sentence boundary regex ───────────────────────────────────────────────
41
+ /**
42
+ * Matches a sentence terminator followed by whitespace.
43
+ * Inclusive on the terminator (capture group includes the punctuation).
44
+ *
45
+ * Common terminators: `.`, `!`, `?`, `:`, `;`, plus their full-width
46
+ * CJK equivalents `。`, `!`, `?`. We intentionally skip mid-sentence
47
+ * commas — speaking each clause separately sounds unnatural.
48
+ *
49
+ * The regex is GLOBAL with a lookahead for whitespace OR end so we
50
+ * don't false-trigger on decimal points (`3.14`) — those are
51
+ * followed by digits, not whitespace.
52
+ */
53
+ exports.SENTENCE_BOUNDARY_RE = /([.!?:;。!?])(?=\s|$)/g;
54
/**
 * Length of the longest proper prefix of `tag` that `text` ends with, looking
 * only at the part of `text` from index `from` onwards. Returns 0 when the
 * text does not end in the beginning of the tag.
 */
function trailingTagPrefixLength(text, from, tag) {
    const longest = Math.min(tag.length - 1, text.length - from);
    for (let len = longest; len > 0; len--) {
        if (text.endsWith(tag.slice(0, len)))
            return len;
    }
    return 0;
}
/**
 * Strip `<think>...</think>` blocks from a streamed text delta.
 *
 * The caller threads one `state` object through successive calls:
 *   - `state.inside` — `true` while the stream is inside a think block.
 *   - `state.carry`  — optional; a tag fragment held back from the previous
 *     delta. It is only written when a delta ends in the beginning of the
 *     tag currently being looked for (e.g. `"<thi"`), so a tag split across
 *     two deltas is still recognised. The fragment is re-attached to the
 *     next delta: it is either consumed as part of the tag or emitted as
 *     ordinary text. A fragment still held when the stream ends remains in
 *     `state.carry`.
 *
 * @param {string} chunk - The next text delta.
 * @param {{inside: boolean, carry?: string}} state - Mutable parser state.
 * @returns {string} The delta with think-block content removed.
 */
function stripThinkChunk(chunk, state) {
    const OPEN = '<think>';
    const CLOSE = '</think>';
    const pending = state.carry ?? '';
    if (pending)
        state.carry = '';
    const text = pending + chunk;
    let out = '';
    let i = 0;
    while (i < text.length) {
        const tag = state.inside ? CLOSE : OPEN;
        const at = text.indexOf(tag, i);
        if (at !== -1) {
            // Text before an opening tag is kept; text before a closing tag
            // is reasoning content and is dropped.
            if (!state.inside)
                out += text.slice(i, at);
            i = at + tag.length;
            state.inside = !state.inside;
            continue;
        }
        // No complete tag left in this delta. Hold back a trailing fragment
        // that could be the start of the tag so the next delta can finish it.
        const held = trailingTagPrefixLength(text, i, tag);
        const cut = text.length - held;
        if (!state.inside)
            out += text.slice(i, cut);
        if (held > 0)
            state.carry = text.slice(cut);
        break;
    }
    return out;
}
87
+ // ── Sentence splitter ─────────────────────────────────────────────────────
88
/**
 * Cut a text buffer into finished sentences plus the unfinished tail.
 *
 * A sentence ends at each match of `exports.SENTENCE_BOUNDARY_RE`; the
 * terminator stays attached to its sentence. Sentences are trimmed and empty
 * ones are skipped. `rest` is the untrimmed text after the last terminator
 * (the whole buffer when nothing matched) and is meant to be carried into
 * the next call.
 *
 * @param {string} buf - Accumulated text.
 * @returns {{sentences: string[], rest: string}}
 */
function splitSentences(buf) {
    // Fresh regex per call so no `lastIndex` state leaks between calls.
    const boundary = new RegExp(exports.SENTENCE_BOUNDARY_RE.source, 'g');
    const sentences = [];
    let cursor = 0;
    for (const hit of buf.matchAll(boundary)) {
        const stop = hit.index + hit[0].length;
        const piece = buf.slice(cursor, stop).trim();
        if (piece.length > 0) {
            sentences.push(piece);
        }
        cursor = stop;
    }
    return { sentences, rest: buf.slice(cursor) };
}
110
+ // ── Stream handle factory ─────────────────────────────────────────────────
111
/**
 * Start a streaming TTS session.
 *
 * Call `push(text)` as deltas arrive, `end()` when the assistant turn is
 * complete, and `cancel()` to abort. Completed sentences are synthesised one
 * at a time, in order, through a promise chain.
 *
 * A sentence whose terminator is the very last character of the buffer is
 * held back while that character is `.` or `:`. At a delta boundary those
 * two are ambiguous: the next delta may continue a decimal (`3.` + `14`), an
 * abbreviation, a time or a URL. The held sentence is released as soon as
 * more text shows where it really ends, or by `end()`.
 *
 * @param {object} [opts]
 * @param {object} [opts.logger] - Logger with `child()`, `info()`, `warn()`;
 *   defaults to the no-op logger.
 * @param {AbortSignal} [opts.signal] - Checked before each synthesis call.
 * @param {Function} [opts.synthFn] - Async `({ text, voice, timeoutMs })`
 *   synthesiser; defaults to `synthesize`.
 * @param {string} [opts.voice] - Voice passed through to the synthesiser.
 * @param {number} [opts.timeoutMs=20000] - Per-chunk synthesis timeout.
 * @returns {object} Handle with `closed` (getter), `push`, `end`, `cancel`.
 */
function startTtsStream(opts = {}) {
    const logger = (opts.logger ?? (0, factory_1.noopLogger)()).child('tts-stream');
    const signal = opts.signal;
    const synthFn = opts.synthFn ?? tts_1.synthesize;
    let buffer = '';
    const thinkState = { inside: false };
    let closed = false;
    let cancelled = false;
    // Sequential dispatch queue — only one synth+play in flight.
    let dispatchChain = Promise.resolve();
    const pushSentence = (raw) => {
        if (cancelled)
            return;
        const cleaned = (0, tts_1.cleanForTTS)(raw);
        if (!cleaned)
            return;
        dispatchChain = dispatchChain.then(async () => {
            if (cancelled || signal?.aborted)
                return;
            try {
                const r = await synthFn({
                    text: cleaned,
                    voice: opts.voice,
                    timeoutMs: opts.timeoutMs ?? 20000,
                });
                logger.info('tts chunk synth', {
                    provider: r.provider,
                    ms: r.durationMs,
                    chars: cleaned.length,
                });
            }
            catch (err) {
                logger.warn('tts chunk synth failed', {
                    error: err instanceof Error ? err.message : String(err),
                });
            }
        });
    };
    return {
        get closed() { return closed; },
        push(text) {
            if (closed || cancelled)
                return;
            const cleanedDelta = stripThinkChunk(text, thinkState);
            if (!cleanedDelta)
                return;
            buffer += cleanedDelta;
            const { sentences, rest } = splitSentences(buffer);
            // An empty `rest` means the last terminator is the final character
            // of the buffer, i.e. it matched on "end of input" — which here is
            // only the end of this delta.
            const endsOnAmbiguousTerminator = rest.length === 0
                && sentences.length > 0
                && /[.:]$/.test(buffer);
            buffer = endsOnAmbiguousTerminator ? sentences.pop() : rest;
            for (const s of sentences)
                pushSentence(s);
        },
        async end() {
            if (closed)
                return;
            closed = true;
            // Flush leftover (unterminated tail or a held-back sentence).
            if (buffer.trim().length > 0)
                pushSentence(buffer);
            buffer = '';
            // Wait for the chain to drain.
            try {
                await dispatchChain;
            }
            catch { /* surfaced via logger already */ }
        },
        cancel() {
            cancelled = true;
            closed = true;
            buffer = '';
            logger.info('tts stream cancelled');
            // The in-flight synth call is best-effort to interrupt — we
            // don't await its rejection, the chain will settle on its own.
        },
    };
}
@@ -2,4 +2,4 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.VERSION = void 0;
4
4
  // AUTO-GENERATED by scripts/inject-version.js — do not edit by hand
5
- exports.VERSION = '4.0.2';
5
+ exports.VERSION = '4.1.0';