@markusylisiurunen/tau 0.2.34 → 0.2.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  import { randomUUID } from "node:crypto";
2
2
  import { realpathSync, statSync } from "node:fs";
3
- import { mkdtemp, writeFile } from "node:fs/promises";
3
+ import { mkdtemp, readFile, unlink, writeFile } from "node:fs/promises";
4
4
  import { tmpdir } from "node:os";
5
5
  import { join, relative, resolve, sep } from "node:path";
6
6
  import { formatCodexAuthError } from "../core/auth/auth_messages.js";
@@ -8,7 +8,7 @@ import { getAuthPath } from "../core/auth/auth_paths.js";
8
8
  import { AuthStorage } from "../core/auth/auth_storage.js";
9
9
  import { createCredentialResolver, } from "../core/auth/credential_resolver.js";
10
10
  import { createCommandRegistry, getRiskLevelDescription, } from "../core/commands/index.js";
11
- import { createDefaultConfigDeps, loadRuntimeConfig, } from "../core/config/index.js";
11
+ import { createDefaultConfigDeps, getMistralApiKey, loadRuntimeConfig, } from "../core/config/index.js";
12
12
  import { createDefaultCoreDeps } from "../core/runtime/deps.js";
13
13
  import { createCheckpoint } from "../core/session/checkpoint.js";
14
14
  import { CoreSession } from "../core/session/core_session.js";
@@ -59,6 +59,10 @@ const PRUNED_EDIT_ARGUMENT_MARKER = "[content pruned]";
59
59
  const PRUNE_EDIT_UNCHANGED_CONTEXT_LINES = 4;
60
60
  const PRUNE_PREVIEW_MAX_TOKENS = 512;
61
61
  const PRUNE_MAX_OVERAGE_RATIO = 0.1;
62
+ const SPEAK_TEMP_FILE_TEMPLATE = "/tmp/tau-speak.XXXXXX";
63
+ const SPEAK_MISTRAL_TRANSCRIBE_MODEL = "voxtral-mini-latest";
64
+ const SPEAK_RECORDING_MIN_BYTES = 1024;
65
+ const SPEAK_RECORDING_MAX_DURATION_MS = 5 * 60 * 1000;
62
66
  export class ChatController {
63
67
  view;
64
68
  personas;
@@ -114,6 +118,9 @@ export class ChatController {
114
118
  lastTurnDurationMs = 0;
115
119
  turnTimer;
116
120
  lastEmptySubmitAt;
121
+ speakRecording;
122
+ isTranscribingSpeak = false;
123
+ speakTransition;
117
124
  constructor(options) {
118
125
  this.view = options.view;
119
126
  this.deps = options.deps ?? createDefaultCoreDeps();
@@ -211,6 +218,7 @@ export class ChatController {
211
218
  pruneLargest: (extra) => this.pruneToolResults("largest", extra),
212
219
  pruneSmart: (extra) => this.pruneToolResultsSmart(extra),
213
220
  reload: () => this.reloadContent(),
221
+ speak: () => this.toggleSpeakCapture(),
214
222
  risk: (level) => this.setRiskLevel(level),
215
223
  persona: (id) => this.switchPersona(id),
216
224
  prompt: (id) => this.insertPrompt(id),
@@ -263,6 +271,7 @@ export class ChatController {
263
271
  onCtrlR: () => this.cycleRiskLevel(),
264
272
  onCtrlP: () => this.cyclePersonality(),
265
273
  onCtrlS: () => void this.stashEditorToClipboard(),
274
+ onCtrlY: () => void this.toggleSpeakCapture(),
266
275
  onEscape: () => this.onInterrupt(),
267
276
  onCtrlF: () => {
268
277
  this.expandFileMentions().catch((err) => {
@@ -284,6 +293,10 @@ export class ChatController {
284
293
  }
285
294
  async dispose() {
286
295
  this.subagentUnsubscribe?.();
296
+ if (this.speakTransition) {
297
+ await this.speakTransition;
298
+ }
299
+ await this.cancelSpeakCapture();
287
300
  if (!this.toolBackendDispose)
288
301
  return;
289
302
  await this.toolBackendDispose();
@@ -293,6 +306,10 @@ export class ChatController {
293
306
  await this.handleSubmit(text);
294
307
  }
295
308
  onInterrupt() {
309
+ if (this.speakRecording) {
310
+ void this.runSpeakTransition(() => this.stopSpeakCapture());
311
+ return;
312
+ }
296
313
  this.interruptAssistantTurn();
297
314
  }
298
315
  onEvent(event) {
@@ -444,6 +461,8 @@ export class ChatController {
444
461
  });
445
462
  }
446
463
  getInputMode() {
464
+ if (this.speakRecording)
465
+ return "recording";
447
466
  if (this.isBashIncognito)
448
467
  return "bash_incognito";
449
468
  if (this.isBashMode)
@@ -767,6 +786,8 @@ export class ChatController {
767
786
  }
768
787
  // Input Handling --------------------------------------------------------------------------------
769
788
  beforeSubmit(text) {
789
+ if (this.speakRecording)
790
+ return false;
770
791
  if (!this.isStreaming)
771
792
  return true;
772
793
  const trimmed = text.trimStart();
@@ -853,6 +874,8 @@ export class ChatController {
853
874
  return "prune smart-selected tool results and compact edit calls, optional fraction and guidance";
854
875
  case "reload":
855
876
  return "reload prompts, skills, themes, bash commands, and AGENTS.md";
877
+ case "speak":
878
+ return "toggle microphone recording and transcribe to editor";
856
879
  case "risk":
857
880
  return "set risk level: /risk:read-only or /risk:read-write";
858
881
  case "bash":
@@ -1089,6 +1112,258 @@ export class ChatController {
1089
1112
  this.view.addMessage({ type: "user", text: trimmed }, historyEntryId);
1090
1113
  await this.runAssistantTurn();
1091
1114
  }
1115
+ async toggleSpeakCapture() {
1116
+ if (this.speakTransition) {
1117
+ this.view.addSystemMessage("speech recording state change already in progress", "warn");
1118
+ return;
1119
+ }
1120
+ if (this.speakRecording) {
1121
+ await this.runSpeakTransition(() => this.stopSpeakCapture());
1122
+ return;
1123
+ }
1124
+ if (this.isTranscribingSpeak) {
1125
+ this.view.addSystemMessage("speech transcription already in progress", "warn");
1126
+ return;
1127
+ }
1128
+ if (this.isStreaming) {
1129
+ this.view.addSystemMessage("wait for the assistant to finish before recording", "warn");
1130
+ return;
1131
+ }
1132
+ await this.runSpeakTransition(() => this.startSpeakCapture());
1133
+ }
1134
+ async runSpeakTransition(task) {
1135
+ if (this.speakTransition) {
1136
+ return;
1137
+ }
1138
+ const transition = task();
1139
+ this.speakTransition = transition;
1140
+ try {
1141
+ await transition;
1142
+ }
1143
+ finally {
1144
+ if (this.speakTransition === transition) {
1145
+ this.speakTransition = undefined;
1146
+ }
1147
+ }
1148
+ }
1149
+ async startSpeakCapture() {
1150
+ const apiKey = getMistralApiKey(this.config, this.deps.env.env());
1151
+ if (!apiKey) {
1152
+ this.view.addSystemMessage("set MISTRAL_API_KEY or apiKeys.mistral to use /speak", "error");
1153
+ return;
1154
+ }
1155
+ let audioPath;
1156
+ try {
1157
+ audioPath = await this.createSpeakTempFilePath();
1158
+ const abortController = new AbortController();
1159
+ const completion = this.deps.spawn("ffmpeg", [
1160
+ "-hide_banner",
1161
+ "-loglevel",
1162
+ "error",
1163
+ "-nostdin",
1164
+ "-f",
1165
+ "avfoundation",
1166
+ "-i",
1167
+ ":0",
1168
+ "-ac",
1169
+ "1",
1170
+ "-ar",
1171
+ "16000",
1172
+ "-c:a",
1173
+ "pcm_s16le",
1174
+ "-f",
1175
+ "wav",
1176
+ "-y",
1177
+ audioPath,
1178
+ ], {
1179
+ detached: true,
1180
+ killProcessGroup: true,
1181
+ signal: abortController.signal,
1182
+ stdio: ["ignore", "ignore", "ignore"],
1183
+ });
1184
+ const recording = {
1185
+ audioPath,
1186
+ stopRequested: false,
1187
+ abortController,
1188
+ completion,
1189
+ };
1190
+ recording.maxDurationTimeout = setTimeout(() => {
1191
+ if (this.speakRecording !== recording || this.speakTransition)
1192
+ return;
1193
+ void this.runSpeakTransition(() => this.stopSpeakCapture());
1194
+ }, SPEAK_RECORDING_MAX_DURATION_MS);
1195
+ this.speakRecording = recording;
1196
+ this.view.setEditorInputEnabled(false);
1197
+ this.refreshStatus();
1198
+ void this.watchSpeakRecording(recording);
1199
+ }
1200
+ catch (err) {
1201
+ if (audioPath) {
1202
+ await this.cleanupSpeakTempFile(audioPath);
1203
+ }
1204
+ this.view.addSystemMessage(`failed to start recording: ${err.message}`, "error");
1205
+ }
1206
+ }
1207
+ async stopSpeakCapture() {
1208
+ const recording = this.speakRecording;
1209
+ if (!recording)
1210
+ return;
1211
+ recording.stopRequested = true;
1212
+ this.clearSpeakRecordingMaxDurationTimeout(recording);
1213
+ this.speakRecording = undefined;
1214
+ this.view.setEditorInputEnabled(true);
1215
+ this.refreshStatus();
1216
+ recording.abortController.abort();
1217
+ try {
1218
+ await recording.completion;
1219
+ }
1220
+ catch (err) {
1221
+ this.view.addSystemMessage(`recording failed: ${err.message}`, "error");
1222
+ await this.cleanupSpeakTempFile(recording.audioPath);
1223
+ return;
1224
+ }
1225
+ this.isTranscribingSpeak = true;
1226
+ try {
1227
+ const audio = await readFile(recording.audioPath);
1228
+ if (audio.byteLength < SPEAK_RECORDING_MIN_BYTES) {
1229
+ this.view.addSystemMessage("recording too short, try again", "warn");
1230
+ return;
1231
+ }
1232
+ const transcript = await this.transcribeSpeakAudio(audio);
1233
+ const text = transcript.trim();
1234
+ if (!text) {
1235
+ return;
1236
+ }
1237
+ this.view.insertEditorTextAtCursor(text);
1238
+ }
1239
+ catch (err) {
1240
+ this.view.addSystemMessage(`speech transcription failed: ${err.message}`, "error");
1241
+ }
1242
+ finally {
1243
+ this.isTranscribingSpeak = false;
1244
+ await this.cleanupSpeakTempFile(recording.audioPath);
1245
+ }
1246
+ }
1247
+ async cancelSpeakCapture() {
1248
+ const recording = this.speakRecording;
1249
+ if (!recording)
1250
+ return;
1251
+ recording.stopRequested = true;
1252
+ this.clearSpeakRecordingMaxDurationTimeout(recording);
1253
+ this.speakRecording = undefined;
1254
+ this.view.setEditorInputEnabled(true);
1255
+ this.refreshStatus();
1256
+ recording.abortController.abort();
1257
+ try {
1258
+ await recording.completion;
1259
+ }
1260
+ catch {
1261
+ // ignore disposal errors
1262
+ }
1263
+ await this.cleanupSpeakTempFile(recording.audioPath);
1264
+ }
1265
+ async watchSpeakRecording(recording) {
1266
+ try {
1267
+ const result = await recording.completion;
1268
+ this.clearSpeakRecordingMaxDurationTimeout(recording);
1269
+ if (this.speakRecording !== recording || recording.stopRequested)
1270
+ return;
1271
+ this.speakRecording = undefined;
1272
+ this.view.setEditorInputEnabled(true);
1273
+ this.refreshStatus();
1274
+ const detail = result.exitCode !== null
1275
+ ? `ffmpeg exited with code ${result.exitCode}`
1276
+ : result.closeSignal
1277
+ ? `ffmpeg terminated by signal ${result.closeSignal}`
1278
+ : "ffmpeg exited";
1279
+ this.view.addSystemMessage(`recording stopped unexpectedly (${detail})`, "error");
1280
+ await this.cleanupSpeakTempFile(recording.audioPath);
1281
+ }
1282
+ catch (err) {
1283
+ this.clearSpeakRecordingMaxDurationTimeout(recording);
1284
+ if (this.speakRecording !== recording || recording.stopRequested)
1285
+ return;
1286
+ this.speakRecording = undefined;
1287
+ this.view.setEditorInputEnabled(true);
1288
+ this.refreshStatus();
1289
+ const error = err;
1290
+ if (error.code === "ENOENT") {
1291
+ this.view.addSystemMessage("ffmpeg not found. install it with: brew install ffmpeg", "error");
1292
+ }
1293
+ else {
1294
+ this.view.addSystemMessage(`recording failed: ${error.message}`, "error");
1295
+ }
1296
+ await this.cleanupSpeakTempFile(recording.audioPath);
1297
+ }
1298
+ }
1299
+ clearSpeakRecordingMaxDurationTimeout(recording) {
1300
+ if (!recording.maxDurationTimeout)
1301
+ return;
1302
+ clearTimeout(recording.maxDurationTimeout);
1303
+ recording.maxDurationTimeout = undefined;
1304
+ }
1305
+ async createSpeakTempFilePath() {
1306
+ const result = await this.deps.spawn("mktemp", [SPEAK_TEMP_FILE_TEMPLATE]);
1307
+ if (result.exitCode !== 0) {
1308
+ const message = result.stderr.trim() || result.stdout.trim() || "mktemp failed";
1309
+ throw new Error(message);
1310
+ }
1311
+ const path = result.stdout.trim().split(/\r?\n/, 1)[0]?.trim();
1312
+ if (!path) {
1313
+ throw new Error("mktemp returned an empty path");
1314
+ }
1315
+ return path;
1316
+ }
1317
+ async transcribeSpeakAudio(audio) {
1318
+ const apiKey = getMistralApiKey(this.config, this.deps.env.env());
1319
+ if (!apiKey) {
1320
+ throw new Error("missing MISTRAL_API_KEY or apiKeys.mistral");
1321
+ }
1322
+ const formData = new FormData();
1323
+ formData.append("model", SPEAK_MISTRAL_TRANSCRIBE_MODEL);
1324
+ formData.append("file", new Blob([audio], { type: "audio/wav" }), "speech.wav");
1325
+ formData.append("language", "en");
1326
+ const response = await fetch("https://api.mistral.ai/v1/audio/transcriptions", {
1327
+ method: "POST",
1328
+ headers: {
1329
+ Authorization: `Bearer ${apiKey}`,
1330
+ },
1331
+ body: formData,
1332
+ });
1333
+ let payload;
1334
+ const responseText = await response.text();
1335
+ if (responseText) {
1336
+ try {
1337
+ payload = JSON.parse(responseText);
1338
+ }
1339
+ catch {
1340
+ payload = undefined;
1341
+ }
1342
+ }
1343
+ if (!response.ok) {
1344
+ const fromObject = payload && typeof payload === "object" && "message" in payload
1345
+ ? payload.message
1346
+ : undefined;
1347
+ const fromString = typeof fromObject === "string" ? fromObject : undefined;
1348
+ const fallback = responseText.trim() || `HTTP ${response.status}`;
1349
+ throw new Error(fromString || fallback);
1350
+ }
1351
+ const text = payload && typeof payload === "object" && "text" in payload
1352
+ ? payload.text
1353
+ : undefined;
1354
+ if (typeof text !== "string") {
1355
+ return "";
1356
+ }
1357
+ return text;
1358
+ }
1359
+ async cleanupSpeakTempFile(path) {
1360
+ try {
1361
+ await unlink(path);
1362
+ }
1363
+ catch {
1364
+ // best-effort cleanup
1365
+ }
1366
+ }
1092
1367
  getMemoryModeFilePath() {
1093
1368
  const cwd = this.deps.env.cwd();
1094
1369
  const home = this.deps.env.home();