dikt 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/cli.mjs +661 -151
  2. package/package.json +4 -2
package/cli.mjs CHANGED
@@ -17,6 +17,9 @@ let DIM = `${ESC}2m`;
17
17
  let RED = `${ESC}31m`;
18
18
  let GREEN = `${ESC}32m`;
19
19
  let YELLOW = `${ESC}33m`;
20
+ let BLUE = `${ESC}34m`;
21
+ let MAGENTA = `${ESC}35m`;
22
+ let CYAN = `${ESC}36m`;
20
23
  let GREY = `${ESC}90m`;
21
24
  let WHITE = `${ESC}37m`;
22
25
  let RED_BG = `${ESC}41m`;
@@ -29,14 +32,14 @@ const ALT_SCREEN_ON = `${ESC}?1049h`;
29
32
  const ALT_SCREEN_OFF = `${ESC}?1049l`;
30
33
 
31
34
  if (process.env.NO_COLOR != null || process.env.TERM === 'dumb' || process.argv.includes('--no-color')) {
32
- RESET = BOLD = DIM = RED = GREEN = YELLOW = GREY = WHITE = RED_BG = '';
35
+ RESET = BOLD = DIM = RED = GREEN = YELLOW = BLUE = MAGENTA = CYAN = GREY = WHITE = RED_BG = '';
33
36
  }
34
37
 
35
38
  const moveTo = (row, col = 1) => `${ESC}${row};${col}H`;
36
39
 
37
40
  // ── Constants ─────────────────────────────────────────────────────────────────
38
41
 
39
- const VERSION = '1.0.2';
42
+ const VERSION = '1.1.0';
40
43
  const CONFIG_BASE = process.env.XDG_CONFIG_HOME || path.join(os.homedir(), '.config');
41
44
  const CONFIG_DIR = path.join(CONFIG_BASE, 'dikt');
42
45
  const CONFIG_FILE = path.join(CONFIG_DIR, 'config.json');
@@ -88,51 +91,179 @@ function validateConfig(cfg) {
88
91
  return { valid: errors.length === 0, errors };
89
92
  }
90
93
 
91
- // ── Secret input ──────────────────────────────────────────────────────────────
94
+ // ── Setup wizard (form-based) ─────────────────────────────────────────────────
95
+
96
+ const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word', 'segment,word': 'both' };
97
+ const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word', 'both': 'segment,word' };
98
+
99
+ async function setupWizard() {
100
+ const existing = loadConfig() || {};
101
+
102
+ const fields = [
103
+ { key: 'apiKey', label: 'API key', type: 'secret', value: '', display: existing.apiKey ? '••••' + existing.apiKey.slice(-4) : '', fallback: existing.apiKey || '' },
104
+ { key: 'model', label: 'Model', type: 'text', value: '', display: existing.model || 'voxtral-mini-latest', fallback: existing.model || 'voxtral-mini-latest' },
105
+ { key: 'language', label: 'Language', type: 'text', value: '', display: existing.language || 'auto', fallback: existing.language || '' },
106
+ { key: 'temperature', label: 'Temperature', type: 'text', value: '', display: existing.temperature != null ? String(existing.temperature) : 'default', fallback: existing.temperature != null ? String(existing.temperature) : '' },
107
+ { key: 'contextBias', label: 'Context bias', type: 'text', value: '', display: existing.contextBias || '', fallback: existing.contextBias || '' },
108
+ { key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word', 'both'], idx: ['off', 'segment', 'word', 'both'].indexOf(TIMESTAMPS_DISPLAY[existing.timestamps || ''] || 'off') },
109
+ { key: 'diarize', label: 'Diarize', type: 'select', options: ['off', 'on'], idx: existing.diarize ? 1 : 0 },
110
+ ];
111
+
112
+ const LABEL_W = 15; // right-align labels to this width
113
+ let active = 0;
114
+ let editing = false; // true when typing into a text/secret field
115
+ let inputBuf = '';
116
+
117
+ function renderForm() {
118
+ let out = `\x1b[H\x1b[2J`; // move home + clear screen
119
+ out += `\n${BOLD} dikt — setup${RESET}\n`;
120
+
121
+ // Contextual hint
122
+ const f = fields[active];
123
+ if (f.type === 'select') {
124
+ out += ` ${DIM}Tab/arrows to change, Enter to confirm${RESET}\n`;
125
+ } else if (editing) {
126
+ out += ` ${DIM}Type to ${f.type === 'secret' ? 'enter' : 'change'}, Enter to confirm${RESET}\n`;
127
+ } else {
128
+ out += ` ${DIM}Enter to keep default, or start typing to change${RESET}\n`;
129
+ }
130
+ out += '\n';
131
+
132
+ for (let i = 0; i < fields.length; i++) {
133
+ const fi = fields[i];
134
+ const label = fi.label.padStart(LABEL_W);
135
+ const isActive = i === active;
136
+ const marker = isActive ? `${GREEN}>${RESET}` : ' ';
137
+
138
+ if (fi.type === 'select') {
139
+ const parts = fi.options.map((opt, j) => {
140
+ if (isActive) {
141
+ return j === fi.idx ? `${BOLD}${GREEN}${opt}${RESET}` : `${DIM}${opt}${RESET}`;
142
+ }
143
+ return j === fi.idx ? opt : `${DIM}${opt}${RESET}`;
144
+ });
145
+ out += `${marker} ${isActive ? BOLD : DIM}${label}${RESET} ${parts.join(' ')}\n`;
146
+ } else {
147
+ let valueStr;
148
+ if (isActive && editing) {
149
+ valueStr = fi.type === 'secret'
150
+ ? `${GREEN}${'•'.repeat(inputBuf.length)}${RESET}█`
151
+ : `${GREEN}${inputBuf}${RESET}█`;
152
+ } else if (isActive && !editing) {
153
+ valueStr = `${DIM}${fi.display}${RESET}`;
154
+ } else {
155
+ // Show confirmed value or default
156
+ const show = fi.value || fi.display;
157
+ valueStr = fi.value
158
+ ? (fi.type === 'secret' ? '••••' + fi.value.slice(-4) : fi.value)
159
+ : `${DIM}${show}${RESET}`;
160
+ }
161
+ out += `${marker} ${isActive ? BOLD : DIM}${label}${RESET} ${valueStr}\n`;
162
+ }
163
+ }
164
+
165
+ process.stderr.write(out);
166
+ }
92
167
 
93
- function readSecret(prompt) {
94
168
  return new Promise((resolve) => {
95
- process.stderr.write(prompt);
96
169
  const { stdin } = process;
97
170
  stdin.setRawMode(true);
98
171
  stdin.resume();
99
172
  stdin.setEncoding('utf8');
100
173
 
101
- let secret = '';
174
+ renderForm();
102
175
 
103
- const cleanup = () => {
104
- stdin.removeListener('data', onData);
105
- stdin.setRawMode(false);
106
- stdin.pause();
107
- };
176
+ function advance() {
177
+ const f = fields[active];
178
+ // Commit text/secret field value
179
+ if (f.type !== 'select') {
180
+ if (inputBuf.trim()) {
181
+ f.value = inputBuf.trim();
182
+ } else {
183
+ f.value = f.fallback;
184
+ }
185
+ // Validate API key
186
+ if (f.key === 'apiKey' && !f.value) {
187
+ editing = false;
188
+ inputBuf = '';
189
+ renderForm();
190
+ process.stderr.write(`\n ${RED}API key is required.${RESET}\n`);
191
+ return; // stay on this field
192
+ }
193
+ editing = false;
194
+ inputBuf = '';
195
+ }
196
+
197
+ active++;
198
+ if (active >= fields.length) {
199
+ // Save and exit
200
+ stdin.removeListener('data', onData);
201
+ stdin.setRawMode(false);
202
+ stdin.pause();
203
+
204
+ const ts = fields.find(f => f.key === 'timestamps');
205
+ const di = fields.find(f => f.key === 'diarize');
206
+ const tsValue = TIMESTAMPS_VALUE[ts.options[ts.idx]];
207
+ const diValue = di.options[di.idx] === 'on';
208
+
209
+ const lang = fields.find(f => f.key === 'language').value;
210
+ const tempVal = fields.find(f => f.key === 'temperature').value;
211
+
212
+ const cfg = {
213
+ apiKey: fields.find(f => f.key === 'apiKey').value,
214
+ model: fields.find(f => f.key === 'model').value,
215
+ language: lang === 'auto' ? '' : lang,
216
+ temperature: tempVal && tempVal !== 'default' ? parseFloat(tempVal) : null,
217
+ contextBias: fields.find(f => f.key === 'contextBias').value,
218
+ autoCopy: existing.autoCopy || false,
219
+ timestamps: tsValue,
220
+ diarize: diValue,
221
+ };
222
+ saveConfig(cfg);
223
+ process.stderr.write(`\n ${GREEN}✓${RESET} Saved to ${DIM}${CONFIG_FILE}${RESET}\n\n`);
224
+ resolve(cfg);
225
+ return;
226
+ }
227
+ renderForm();
228
+ }
108
229
 
109
230
  const onData = (ch) => {
110
- switch (ch) {
111
- case '\n':
112
- case '\r':
113
- case '\u0004': // Ctrl+D
114
- cleanup();
115
- process.stderr.write('\n');
116
- resolve(secret);
117
- break;
118
- case '\u0003': // Ctrl+C
119
- cleanup();
120
- process.stderr.write('\n');
121
- process.exit(EXIT_CONFIG);
122
- break;
123
- case '\u007F': // Backspace (macOS)
124
- case '\b': // Backspace
125
- if (secret.length > 0) {
126
- secret = secret.slice(0, -1);
127
- process.stderr.write('\b \b');
128
- }
129
- break;
130
- default:
131
- if (ch.charCodeAt(0) >= 32) {
132
- secret += ch;
133
- process.stderr.write('*'.repeat(ch.length));
231
+ const f = fields[active];
232
+
233
+ // Ctrl+C — exit
234
+ if (ch === '\u0003') {
235
+ stdin.removeListener('data', onData);
236
+ stdin.setRawMode(false);
237
+ stdin.pause();
238
+ process.stderr.write('\n');
239
+ process.exit(EXIT_CONFIG);
240
+ }
241
+
242
+ if (f.type === 'select') {
243
+ if (ch === '\t' || ch === '\x1b[C' || ch === '\x1b[B') { // Tab, Right, Down
244
+ f.idx = (f.idx + 1) % f.options.length;
245
+ renderForm();
246
+ } else if (ch === '\x1b[D' || ch === '\x1b[A') { // Left, Up
247
+ f.idx = (f.idx - 1 + f.options.length) % f.options.length;
248
+ renderForm();
249
+ } else if (ch === '\n' || ch === '\r') {
250
+ advance();
251
+ }
252
+ } else {
253
+ // text / secret field
254
+ if (ch === '\n' || ch === '\r') {
255
+ advance();
256
+ } else if (ch === '\u007F' || ch === '\b') { // Backspace
257
+ if (inputBuf.length > 0) {
258
+ inputBuf = inputBuf.slice(0, -1);
259
+ if (!inputBuf) editing = false;
260
+ renderForm();
134
261
  }
135
- break;
262
+ } else if (ch.charCodeAt(0) >= 32 && !ch.startsWith('\x1b')) {
263
+ if (!editing) editing = true;
264
+ inputBuf += ch;
265
+ renderForm();
266
+ }
136
267
  }
137
268
  };
138
269
 
@@ -140,41 +271,6 @@ function readSecret(prompt) {
140
271
  });
141
272
  }
142
273
 
143
- // ── Setup wizard ──────────────────────────────────────────────────────────────
144
-
145
- async function setupWizard() {
146
- const existing = loadConfig() || {};
147
-
148
- process.stderr.write(`\n${BOLD} dikt — setup${RESET}\n`);
149
- process.stderr.write(` ${DIM}Press Enter to keep the default shown in brackets.${RESET}\n\n`);
150
-
151
- const apiKey = (await readSecret(` Mistral API key [${existing.apiKey ? '••••' + existing.apiKey.slice(-4) : ''}]: `)).trim()
152
- || existing.apiKey || '';
153
- if (!apiKey) {
154
- process.stderr.write(`\n ${RED}API key is required.${RESET}\n\n`);
155
- process.exit(EXIT_CONFIG);
156
- }
157
-
158
- const rl = readline.createInterface({ input: process.stdin, output: process.stderr });
159
- const ask = (q) => new Promise((res) => rl.question(q, res));
160
-
161
- const model = (await ask(` Model [${existing.model || 'voxtral-mini-latest'}]: `)).trim()
162
- || existing.model || 'voxtral-mini-latest';
163
- const language = (await ask(` Language [${existing.language || 'auto'}]: `)).trim()
164
- || existing.language || '';
165
- const tempStr = (await ask(` Temperature [${existing.temperature ?? 'default'}]: `)).trim();
166
- const temperature = tempStr ? parseFloat(tempStr) : (existing.temperature ?? null);
167
- const contextBias = (await ask(` Context bias [${existing.contextBias || ''}]: `)).trim()
168
- || existing.contextBias || '';
169
-
170
- rl.close();
171
-
172
- const cfg = { apiKey, model, language: language === 'auto' ? '' : language, temperature, contextBias, autoCopy: existing.autoCopy || false };
173
- saveConfig(cfg);
174
- process.stderr.write(`\n ${GREEN}✓${RESET} Saved to ${DIM}${CONFIG_FILE}${RESET}\n\n`);
175
- return cfg;
176
- }
177
-
178
274
  // ── Prerequisites ─────────────────────────────────────────────────────────────
179
275
 
180
276
  function checkSox() {
@@ -236,8 +332,13 @@ function getTermWidth() {
236
332
  function render() {
237
333
  const w = getTermWidth();
238
334
  const header = ` dikt`;
239
- const right = `[?] [q]uit `;
240
- const pad = Math.max(0, w - header.length - right.length);
335
+ const tags = [];
336
+ if (config.diarize) tags.push('diarize');
337
+ if (config.timestamps) tags.push('timestamps');
338
+ const tagStr = tags.length ? ` ${DIM}${tags.join(' · ')}${RESET}` : '';
339
+ const tagPlain = tags.length ? ` ${tags.join(' · ')}` : '';
340
+ const right = `[s]etup [?] [q]uit `;
341
+ const pad = Math.max(0, w - header.length - tagPlain.length - right.length);
241
342
 
242
343
  let out = moveTo(1);
243
344
 
@@ -251,7 +352,7 @@ function render() {
251
352
  out += CLEAR_LINE + '\n';
252
353
  out += renderHelp();
253
354
  } else {
254
- out += CLEAR_LINE + BOLD + header + ' '.repeat(pad) + DIM + right + RESET + '\n';
355
+ out += CLEAR_LINE + BOLD + header + RESET + tagStr + ' '.repeat(pad) + DIM + right + RESET + '\n';
255
356
  out += CLEAR_LINE + ` ${'─'.repeat(Math.max(0, w - 2))}` + '\n';
256
357
  out += CLEAR_LINE + '\n';
257
358
  out += CLEAR_LINE + renderKeybar() + '\n';
@@ -270,7 +371,9 @@ function render() {
270
371
  const rows = process.stdout.rows || 24;
271
372
  const availableRows = rows - 9; // header(2) + blank + keybar + blank + status + blank + meta + cleardown
272
373
  if (availableRows > 0 && lines.length > availableRows) {
273
- lines = lines.slice(lines.length - availableRows);
374
+ const hidden = lines.length - availableRows + 1; // +1 to make room for the hint
375
+ lines = lines.slice(lines.length - availableRows + 1);
376
+ lines.unshift(` ${DIM}↑ ${hidden} more line${hidden === 1 ? '' : 's'} above${RESET}`);
274
377
  }
275
378
  for (const line of lines) {
276
379
  out += CLEAR_LINE + line + '\n';
@@ -326,6 +429,46 @@ function wrapTranscript(termWidth) {
326
429
  if (!text) return [];
327
430
  const indent = ' ';
328
431
  const maxLen = termWidth - indent.length - 1; // leave 1 col margin
432
+
433
+ // Diarized transcript: each line is already formatted with speaker labels + ANSI colors.
434
+ // Handle each speaker line independently — no quotes, just indent and wrap.
435
+ if (config.diarize && text.includes('\n')) {
436
+ const result = [];
437
+ for (const speakerLine of text.split('\n')) {
438
+ if (!speakerLine) continue;
439
+ // ANSI codes mess up length calculation — strip them for measuring
440
+ const plain = speakerLine.replace(/\x1b\[[0-9;]*m/g, '');
441
+ if (plain.length <= maxLen || maxLen < 10) {
442
+ result.push(`${indent}${speakerLine}`);
443
+ } else {
444
+ // Wrap long speaker lines: first line keeps the label, continuation lines get extra indent
445
+ const labelMatch = plain.match(/^([A-Z]\s{2})/);
446
+ const contIndent = labelMatch ? ' '.repeat(labelMatch[1].length) : '';
447
+ const words = speakerLine.split(/(\s+)/);
448
+ let cur = '';
449
+ let curPlain = '';
450
+ let first = true;
451
+ for (const word of words) {
452
+ const wordPlain = word.replace(/\x1b\[[0-9;]*m/g, '');
453
+ if (curPlain.length + wordPlain.length > maxLen && curPlain.length > 0) {
454
+ result.push(`${indent}${cur}`);
455
+ cur = first ? contIndent : '';
456
+ curPlain = first ? contIndent : '';
457
+ first = false;
458
+ const trimmed = word.replace(/^\s+/, '');
459
+ cur += trimmed;
460
+ curPlain += trimmed.replace(/\x1b\[[0-9;]*m/g, '');
461
+ } else {
462
+ cur += word;
463
+ curPlain += wordPlain;
464
+ }
465
+ }
466
+ if (cur) result.push(`${indent}${first ? '' : contIndent}${cur}`);
467
+ }
468
+ }
469
+ return result;
470
+ }
471
+
329
472
  if (maxLen < 10) return [`${indent}${text}`];
330
473
 
331
474
  const words = text.split(/(\s+)/);
@@ -564,60 +707,40 @@ async function transcribe(wavPath) {
564
707
  try {
565
708
  const blob = await fs.openAsBlob(wavPath, { type: 'audio/wav' });
566
709
  const file = new File([blob], 'recording.wav', { type: 'audio/wav' });
567
- const fd = new FormData();
568
- fd.append('file', file);
569
- fd.append('model', config.model);
570
- if (config.language) fd.append('language', config.language);
571
- if (config.temperature != null) fd.append('temperature', String(config.temperature));
572
- if (config.contextBias) fd.append('context_bias', config.contextBias);
573
710
 
574
711
  const t0 = Date.now();
575
- const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
576
- method: 'POST',
577
- headers: { Authorization: `Bearer ${config.apiKey}` },
578
- body: fd,
712
+ const result = await callTranscribeAPI(file, {
579
713
  signal: AbortSignal.timeout(30_000),
714
+ timestamps: config.timestamps || '',
715
+ diarize: config.diarize || false,
580
716
  });
581
717
  state.latency = Date.now() - t0;
582
718
 
583
- if (!resp.ok) {
584
- const raw = await resp.text().catch(() => '');
585
- let msg;
586
- try {
587
- const e = JSON.parse(raw);
588
- msg = e.message;
589
- if (!msg && Array.isArray(e.detail)) {
590
- msg = e.detail.map(d => [d.loc?.join('.'), d.msg].filter(Boolean).join(': ')).join('; ');
591
- } else if (!msg && e.detail) {
592
- msg = typeof e.detail === 'string' ? e.detail : JSON.stringify(e.detail);
593
- }
594
- if (!msg) msg = raw;
595
- } catch {
596
- msg = raw || `HTTP ${resp.status}`;
597
- }
598
- if (resp.status === 401) msg += ' — press [s] to reconfigure';
599
- throw new Error(msg);
600
- }
601
-
602
- const data = await resp.json();
603
- const text = (data.text || '').trim();
719
+ const text = result.text;
604
720
 
605
721
  if (!text) {
606
722
  state.mode = 'error';
607
723
  state.error = 'No speech detected';
608
724
  } else {
609
- state.transcript = text;
725
+ // Format with speaker labels if diarization is active
726
+ if (config.diarize && result.segments) {
727
+ state.transcript = formatDiarizedText(result.segments, { color: true });
728
+ } else {
729
+ state.transcript = text;
730
+ }
610
731
  state.wordCount = text.split(/\s+/).filter(Boolean).length;
611
732
  state.mode = 'ready';
612
733
 
613
734
  // Push to history
614
- state.history.unshift({ transcript: text, wordCount: state.wordCount, duration: state.duration, latency: state.latency });
735
+ state.history.unshift({ transcript: state.transcript, wordCount: state.wordCount, duration: state.duration, latency: state.latency });
615
736
  if (state.history.length > MAX_HISTORY) state.history.pop();
616
737
  state.historyIndex = -1;
617
738
  }
618
739
  } catch (err) {
619
740
  state.mode = 'error';
620
- state.error = err.name === 'TimeoutError' ? 'Transcription timed out' : err.message;
741
+ let msg = err.name === 'TimeoutError' ? 'Transcription timed out' : err.message;
742
+ if (err.status === 401) msg += ' — press [s] to reconfigure';
743
+ state.error = msg;
621
744
  } finally {
622
745
  clearInterval(state.spinnerInterval);
623
746
  cleanupRecFile();
@@ -746,29 +869,265 @@ async function runSetup() {
746
869
  renderAll();
747
870
  }
748
871
 
872
+ // ── Audio helpers ─────────────────────────────────────────────────────────────
873
+
874
+ const SILENCE_THRESHOLD = Math.round(32768 * 0.01); // 1% of max 16-bit amplitude
875
+
876
/**
 * Build the 44-byte RIFF/WAVE header for an uncompressed PCM payload.
 *
 * With no options the header describes 16 kHz, mono, 16-bit audio, which is
 * the format this file asks `rec` to produce.
 *
 * @param {number} dataSize - Size in bytes of the PCM data that follows the header.
 * @param {object} [format] - Optional PCM format overrides.
 * @param {number} [format.sampleRate=16000] - Samples per second.
 * @param {number} [format.channels=1] - Number of interleaved channels.
 * @param {number} [format.bitsPerSample=16] - Bits per sample.
 * @returns {Buffer} A 44-byte buffer holding the header.
 */
function createWavHeader(dataSize, { sampleRate = 16000, channels = 1, bitsPerSample = 16 } = {}) {
  // Both derived fields follow from the format; for the defaults they are 2 and 32000.
  const blockAlign = channels * (bitsPerSample / 8);
  const byteRate = sampleRate * blockAlign;

  const buf = Buffer.alloc(44);
  buf.write('RIFF', 0);
  buf.writeUInt32LE(36 + dataSize, 4); // remaining chunk size: header minus 8, plus data
  buf.write('WAVE', 8);
  buf.write('fmt ', 12);
  buf.writeUInt32LE(16, 16); // size of the fmt sub-chunk
  buf.writeUInt16LE(1, 20); // PCM
  buf.writeUInt16LE(channels, 22);
  buf.writeUInt32LE(sampleRate, 24);
  buf.writeUInt32LE(byteRate, 28);
  buf.writeUInt16LE(blockAlign, 32);
  buf.writeUInt16LE(bitsPerSample, 34);
  buf.write('data', 36);
  buf.writeUInt32LE(dataSize, 40);
  return buf;
}
893
+
894
/**
 * Find the largest absolute sample value in a buffer read as signed 16-bit
 * little-endian samples.
 *
 * @param {Buffer} chunk - Bytes to scan; a trailing odd byte is ignored.
 * @returns {number} The peak absolute value, or 0 when no full sample is present.
 */
function peakAmplitude(chunk) {
  const lastSampleStart = chunk.length - 2;
  let loudest = 0;
  let pos = 0;
  while (pos <= lastSampleStart) {
    const magnitude = Math.abs(chunk.readInt16LE(pos));
    if (magnitude > loudest) {
      loudest = magnitude;
    }
    pos += 2;
  }
  return loudest;
}
902
+
903
/**
 * Shorten long silent stretches in raw 16 kHz, 16-bit PCM audio.
 *
 * The data is scanned in 50 ms windows (the last window may be shorter). A
 * window is silent when its peak is below SILENCE_THRESHOLD. Up to one second
 * of consecutive silent windows is kept; on the next silent window 100 ms of
 * zeroed samples is emitted once, and further silent windows in that run are
 * dropped until a non-silent window resets the count.
 *
 * @param {Buffer} rawData - Raw PCM bytes.
 * @returns {Buffer} The PCM bytes with excess silence removed.
 */
function trimSilence(rawData) {
  const sampleRate = 16000;
  const bytesPerSample = 2;
  const windowBytes = Math.round(sampleRate * 0.05) * bytesPerSample; // 50ms windows
  const maxSilentWindows = Math.round(1.0 / 0.05); // 1 second = 20 windows
  const paddingBytes = Math.round(0.1 / 0.05) * windowBytes; // 100ms padding = 2 windows

  const kept = [];
  let quietRun = 0;

  for (let start = 0; start < rawData.length; start += windowBytes) {
    // The final slice is clamped so a trailing partial window is still examined.
    const end = Math.min(start + windowBytes, rawData.length);
    const slice = rawData.subarray(start, end);

    if (peakAmplitude(slice) >= SILENCE_THRESHOLD) {
      quietRun = 0;
      kept.push(slice);
      continue;
    }

    quietRun += 1;
    if (quietRun <= maxSilentWindows) {
      kept.push(slice);
    } else if (quietRun === maxSilentWindows + 1) {
      // First window past the limit: stand in a short zero-filled pad for the rest of the run.
      kept.push(Buffer.alloc(paddingBytes));
    }
  }

  return Buffer.concat(kept);
}
944
+
945
/**
 * POST an audio file to the Mistral transcription endpoint and return the result.
 *
 * Model, language, temperature, context bias and the API key are read from the
 * module-level `config`.
 *
 * @param {Blob|File} file - Audio to upload as the `file` form field.
 * @param {object} [options]
 * @param {AbortSignal} [options.signal] - Cancels the request; defaults to a 30 s timeout.
 * @param {string} [options.timestamps] - Comma-separated granularities, e.g. 'segment,word'.
 * @param {boolean} [options.diarize] - Request speaker diarization.
 * @returns {Promise<{text: string, latency: number, segments: *, words: *}>}
 *   Trimmed text, request latency in ms, and the response's `segments` / `words` as received.
 * @throws {Error} On a non-OK response; the error carries the HTTP code in `err.status`.
 */
async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
  const fd = new FormData();
  fd.append('file', file);
  fd.append('model', config.model);
  if (config.language) fd.append('language', config.language);
  if (config.temperature != null) fd.append('temperature', String(config.temperature));
  if (config.contextBias) fd.append('context_bias', config.contextBias);

  // Normalize the requested granularities: drop blanks and duplicates.
  const granularities = timestamps
    ? [...new Set(timestamps.split(',').map((g) => g.trim()).filter(Boolean))]
    : [];
  for (const g of granularities) fd.append('timestamp_granularities[]', g);

  if (diarize) {
    fd.append('diarize', 'true');
    // API requires segment timestamps when diarize is enabled, so add them
    // whenever the caller did not already ask for them (e.g. timestamps === 'word').
    if (!granularities.includes('segment')) fd.append('timestamp_granularities[]', 'segment');
  }

  const t0 = Date.now();
  const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
    method: 'POST',
    headers: { Authorization: `Bearer ${config.apiKey}` },
    body: fd,
    signal: signal || AbortSignal.timeout(30_000),
  });
  const latency = Date.now() - t0;

  if (!resp.ok) {
    const raw = await resp.text().catch(() => '');
    let msg;
    try {
      // Error bodies are tried as JSON with either a `message` or a `detail` field.
      const e = JSON.parse(raw);
      msg = e.message;
      if (typeof msg === 'object' && msg !== null) msg = JSON.stringify(msg);
      if (!msg && Array.isArray(e.detail)) {
        msg = e.detail.map((d) => [d.loc?.join('.'), d.msg].filter(Boolean).join(': ')).join('; ');
      } else if (!msg && e.detail) {
        msg = typeof e.detail === 'string' ? e.detail : JSON.stringify(e.detail);
      }
      if (!msg) msg = raw;
    } catch {
      // Not JSON (or not an object): fall back to the raw body or the status code.
      msg = raw || `HTTP ${resp.status}`;
    }
    const err = new Error(msg);
    err.status = resp.status;
    throw err;
  }

  const data = await resp.json();
  const text = (data.text || '').trim();
  return { text, latency, segments: data.segments, words: data.words };
}
995
+
996
+ async function transcribeBuffer(rawChunks, { signal, timestamps, diarize } = {}) {
997
+ const rawData = Buffer.concat(rawChunks);
998
+ const trimmed = trimSilence(rawData);
999
+ const wavData = Buffer.concat([createWavHeader(trimmed.length), trimmed]);
1000
+ const blob = new Blob([wavData], { type: 'audio/wav' });
1001
+ const file = new File([blob], 'recording.wav', { type: 'audio/wav' });
1002
+ return callTranscribeAPI(file, { signal, timestamps, diarize });
1003
+ }
1004
+
1005
+ // ── Output formatting helpers ─────────────────────────────────────────────────
1006
+
1007
+ const SPEAKER_COLORS = [GREEN, YELLOW, CYAN, MAGENTA, BLUE, RED];
1008
+
1009
/**
 * Render diarized segments as one line per speaker turn, each prefixed with a
 * short speaker letter (A, B, C, ... in order of first appearance).
 *
 * Consecutive segments from the same speaker are merged into one turn, and
 * segments whose text is empty after trimming are skipped. The letter is
 * followed by two spaces: wrapTranscript recognises the label with
 * /^([A-Z]\s{2})/ and uses its width to indent wrapped continuation lines.
 *
 * @param {Array<{speaker_id: *, text: string}>} segments - Segments from the API response.
 * @param {object} [options]
 * @param {boolean} [options.color=false] - Colour the letter using SPEAKER_COLORS and BOLD.
 * @returns {string} Turns joined by '\n'; '' when there are no segments.
 */
function formatDiarizedText(segments, { color = false } = {}) {
  if (!segments || !segments.length) return '';

  // Map speaker IDs to a stable index in order of first appearance.
  const speakerMap = new Map();
  for (const s of segments) {
    if (s.speaker_id != null && !speakerMap.has(s.speaker_id)) {
      speakerMap.set(s.speaker_id, speakerMap.size);
    }
  }

  // Merge consecutive segments from the same speaker.
  const merged = [];
  for (const s of segments) {
    const text = (s.text || '').trim();
    if (!text) continue;
    const last = merged[merged.length - 1];
    if (last && last.speaker_id === s.speaker_id) {
      last.text += ' ' + text;
    } else {
      merged.push({ speaker_id: s.speaker_id, text });
    }
  }

  return merged.map((s) => {
    // Segments without a speaker id fall back to index 0 (letter A).
    const idx = speakerMap.get(s.speaker_id) ?? 0;
    const letter = String.fromCharCode(65 + idx); // A, B, C, ...
    if (color) {
      const c = SPEAKER_COLORS[idx % SPEAKER_COLORS.length];
      return `${c}${BOLD}${letter}${RESET}  ${s.text}`;
    }
    return `${letter}  ${s.text}`;
  }).join('\n');
}
1043
+
1044
/**
 * Assemble the object printed in JSON mode.
 *
 * Copies `base`, stamps it with the current time as an ISO string, then
 * attaches `segments` when timestamps or diarization were requested and
 * `words` when timestamps were requested. Each is attached only if present,
 * and an attached `words` replaces any `words` value copied from `base`.
 *
 * @param {object} base - Fields to copy into the output.
 * @param {object} [extras]
 * @param {*} [extras.segments] - Segment data from the API response.
 * @param {*} [extras.words] - Word data from the API response.
 * @param {string} [extras.timestamps] - Requested timestamp granularities.
 * @param {boolean} [extras.diarize] - Whether diarization was requested.
 * @returns {object} A new object; `base` is not modified.
 */
function buildJsonOutput(base, { segments, words, timestamps, diarize } = {}) {
  const payload = { ...base };
  payload.timestamp = new Date().toISOString();

  const wantsSegments = Boolean(timestamps || diarize);
  if (wantsSegments && segments) {
    payload.segments = segments;
  }
  if (timestamps && words) {
    payload.words = words;
  }
  return payload;
}
1050
+
1051
+ // ── File mode ────────────────────────────────────────────────────────────────
1052
+
1053
+ async function runFile(flags) {
1054
+ try {
1055
+ if (!flags.file || !fs.existsSync(flags.file)) {
1056
+ process.stderr.write(`Error: file not found: ${flags.file}\n`);
1057
+ return EXIT_TRANSCRIPTION;
1058
+ }
1059
+
1060
+ const blob = await fs.openAsBlob(flags.file);
1061
+ const ext = path.extname(flags.file).slice(1) || 'wav';
1062
+ const mimeTypes = { wav: 'audio/wav', mp3: 'audio/mpeg', m4a: 'audio/mp4', ogg: 'audio/ogg', flac: 'audio/flac', webm: 'audio/webm' };
1063
+ const mime = mimeTypes[ext] || 'audio/wav';
1064
+ const file = new File([blob], path.basename(flags.file), { type: mime });
1065
+
1066
+ const result = await callTranscribeAPI(file, { timestamps: flags.timestamps, diarize: flags.diarize });
1067
+
1068
+ if (!result.text) {
1069
+ process.stderr.write('No speech detected\n');
1070
+ return EXIT_TRANSCRIPTION;
1071
+ }
1072
+
1073
+ const wordCount = result.text.split(/\s+/).filter(Boolean).length;
1074
+
1075
+ if (flags.json) {
1076
+ const out = buildJsonOutput(
1077
+ { text: result.text, latency: result.latency, words: wordCount },
1078
+ { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
1079
+ );
1080
+ process.stdout.write(JSON.stringify(out) + '\n');
1081
+ } else if (flags.diarize && result.segments) {
1082
+ process.stdout.write(formatDiarizedText(result.segments) + '\n');
1083
+ } else {
1084
+ process.stdout.write(result.text + '\n');
1085
+ }
1086
+
1087
+ return EXIT_OK;
1088
+ } catch (err) {
1089
+ process.stderr.write(`Error: ${err.message}\n`);
1090
+ return EXIT_TRANSCRIPTION;
1091
+ }
1092
+ }
1093
+
749
1094
  // ── Single-shot mode ──────────────────────────────────────────────────────────
750
1095
 
751
1096
  async function runOnce(flags) {
752
- const recFile = path.join(os.tmpdir(), `dikt-${Date.now()}.wav`);
753
-
754
1097
  try {
755
- // Record with silence detection via sox silence effect
1098
+ // Record raw PCM to stdout — silence detection handled in Node.js
756
1099
  const recProc = spawn('rec', [
757
- '-q', '-r', '16000', '-c', '1', '-b', '16',
758
- recFile,
759
- 'silence', '1', '0.1', '1%', '1', '2.0', '1%',
1100
+ '-q', '-r', '16000', '-c', '1', '-b', '16', '-t', 'raw', '-',
760
1101
  ], {
761
- stdio: ['ignore', 'ignore', 'pipe'],
1102
+ stdio: ['ignore', 'pipe', 'pipe'],
762
1103
  });
763
1104
 
764
1105
  recProc.stderr.on('data', () => {});
765
1106
 
766
- // Ctrl+C stops recording gracefully
767
1107
  const sigHandler = () => recProc.kill('SIGTERM');
768
1108
  process.on('SIGINT', sigHandler);
769
1109
 
1110
+ const chunks = [];
1111
+ let heardSound = false;
1112
+ let lastSoundTime = Date.now();
770
1113
  const recStart = Date.now();
1114
+
1115
+ recProc.stdout.on('data', (chunk) => {
1116
+ chunks.push(chunk);
1117
+ if (peakAmplitude(chunk) > SILENCE_THRESHOLD) {
1118
+ heardSound = true;
1119
+ lastSoundTime = Date.now();
1120
+ }
1121
+ });
1122
+
1123
+ const silenceTimer = setInterval(() => {
1124
+ if (flags.silence > 0 && heardSound && Date.now() - lastSoundTime > flags.silence * 1000) {
1125
+ recProc.kill('SIGTERM');
1126
+ }
1127
+ }, 100);
1128
+
771
1129
  await new Promise((resolve) => recProc.on('close', resolve));
1130
+ clearInterval(silenceTimer);
772
1131
  process.removeListener('SIGINT', sigHandler);
773
1132
  const duration = (Date.now() - recStart) / 1000;
774
1133
 
@@ -782,45 +1141,26 @@ async function runOnce(flags) {
782
1141
  const abortHandler = () => ac.abort();
783
1142
  process.on('SIGINT', abortHandler);
784
1143
 
785
- const blob = await fs.openAsBlob(recFile, { type: 'audio/wav' });
786
- const file = new File([blob], 'recording.wav', { type: 'audio/wav' });
787
- const fd = new FormData();
788
- fd.append('file', file);
789
- fd.append('model', config.model);
790
- if (config.language) fd.append('language', config.language);
791
- if (config.temperature != null) fd.append('temperature', String(config.temperature));
792
- if (config.contextBias) fd.append('context_bias', config.contextBias);
793
-
794
- const t0 = Date.now();
795
- const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
796
- method: 'POST',
797
- headers: { Authorization: `Bearer ${config.apiKey}` },
798
- body: fd,
799
- signal: ac.signal,
800
- });
801
- const latency = Date.now() - t0;
1144
+ const result = await transcribeBuffer(chunks, { signal: ac.signal, timestamps: flags.timestamps, diarize: flags.diarize });
802
1145
  process.removeListener('SIGINT', abortHandler);
803
1146
 
804
- if (!resp.ok) {
805
- const raw = await resp.text().catch(() => '');
806
- process.stderr.write(`Error: ${raw || `HTTP ${resp.status}`}\n`);
807
- return EXIT_TRANSCRIPTION;
808
- }
809
-
810
- const data = await resp.json();
811
- const text = (data.text || '').trim();
812
-
813
- if (!text) {
1147
+ if (!result.text) {
814
1148
  process.stderr.write('No speech detected\n');
815
1149
  return EXIT_TRANSCRIPTION;
816
1150
  }
817
1151
 
818
- const wordCount = text.split(/\s+/).filter(Boolean).length;
1152
+ const wordCount = result.text.split(/\s+/).filter(Boolean).length;
819
1153
 
820
1154
  if (flags.json) {
821
- process.stdout.write(JSON.stringify({ text, duration: parseFloat(duration.toFixed(1)), latency, words: wordCount }) + '\n');
1155
+ const out = buildJsonOutput(
1156
+ { text: result.text, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
1157
+ { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
1158
+ );
1159
+ process.stdout.write(JSON.stringify(out) + '\n');
1160
+ } else if (flags.diarize && result.segments) {
1161
+ process.stdout.write(formatDiarizedText(result.segments) + '\n');
822
1162
  } else {
823
- process.stdout.write(text + '\n');
1163
+ process.stdout.write(result.text + '\n');
824
1164
  }
825
1165
 
826
1166
  return EXIT_OK;
@@ -831,8 +1171,123 @@ async function runOnce(flags) {
831
1171
  process.stderr.write(`Error: ${err.message}\n`);
832
1172
  }
833
1173
  return EXIT_TRANSCRIPTION;
834
- } finally {
835
- try { fs.unlinkSync(recFile); } catch {}
1174
+ }
1175
+ }
1176
+
1177
+ // ── Stream mode ──────────────────────────────────────────────────────────────
1178
+
1179
/**
 * Stream mode: record from the microphone via `rec` and transcribe the audio
 * in chunks, emitting one transcript per detected pause while recording
 * continues.
 *
 * A chunk is cut when the current buffer contains sound and has then been
 * silent for more than `flags.pause` seconds. Recording stops on SIGINT or,
 * when `flags.silence > 0`, once sound has been heard and more than
 * `flags.silence` seconds of silence follow. Audio still buffered when
 * recording stops is transcribed as a final chunk.
 *
 * Transcription requests run concurrently, but their output is written
 * strictly in chunk order.
 *
 * @param {object} flags - Parsed CLI flags.
 * @param {number} flags.pause - Seconds of silence that split a chunk.
 * @param {number} flags.silence - Seconds of silence that stop recording (0 disables auto-stop).
 * @param {boolean} flags.json - Emit one JSON object per chunk (JSON Lines).
 * @param {boolean} flags.noNewline - Join plain-text chunks with spaces instead of newlines.
 * @param {string} flags.timestamps - Timestamp granularity forwarded to the transcription call.
 * @param {boolean} flags.diarize - Diarization switch forwarded to the transcription call.
 * @returns {Promise<number>} EXIT_OK on completion, EXIT_TRANSCRIPTION when the recorder fails
 *   or an unexpected error occurs. Per-chunk transcription errors are reported on stderr and
 *   do not change the exit code.
 */
async function runStream(flags) {
  try {
    const recProc = spawn('rec', [
      '-q', '-r', '16000', '-c', '1', '-b', '16', '-t', 'raw', '-',
    ], {
      stdio: ['ignore', 'pipe', 'pipe'],
    });

    // Settles with the error if `rec` cannot be spawned (or signalled), or with
    // null once it has exited. Listening for 'error' right after spawn keeps a
    // missing `rec` binary from surfacing as an unhandled 'error' event.
    const recDone = new Promise((resolve) => {
      recProc.on('error', resolve);
      recProc.once('close', () => resolve(null));
    });

    // Drain stderr so the pipe never fills up; its content is ignored.
    recProc.stderr.on('data', () => {});

    let killed = false;
    const killRec = () => {
      if (killed) return;
      killed = true;
      recProc.kill('SIGTERM');
      process.stderr.write('\n');
    };
    process.on('SIGINT', killRec);

    let chunks = [];            // current chunk buffer (resets per pause)
    let chunkHasAudio = false;  // current chunk has sound (resets per pause)
    let heardSound = false;     // ever heard sound (never resets)
    let lastSoundTime = Date.now();
    let chunkStart = Date.now();
    let chunkIndex = 0;

    // Output of every chunk is appended to this chain so that transcripts are
    // written in recording order even when a later request finishes first.
    let outputChain = Promise.resolve();

    // Write one transcription result in the format selected by the flags.
    const writeResult = (result, idx, duration) => {
      if (!result.text) return;
      const separator = flags.noNewline ? ' ' : '\n';
      if (flags.json) {
        const wordCount = result.text.split(/\s+/).filter(Boolean).length;
        const out = buildJsonOutput(
          { text: result.text, chunk: idx, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
          { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
        );
        process.stdout.write(JSON.stringify(out) + '\n');
      } else if (flags.diarize && result.segments) {
        process.stdout.write(formatDiarizedText(result.segments) + separator);
      } else {
        process.stdout.write(result.text + separator);
      }
    };

    // Hand the buffered audio to the transcriber and start a fresh chunk.
    const flushChunk = () => {
      const batch = chunks;
      const duration = (Date.now() - chunkStart) / 1000;
      const idx = chunkIndex++;
      chunks = [];
      chunkHasAudio = false;
      chunkStart = Date.now();

      // Start the request immediately; the async wrapper turns a synchronous
      // throw into a rejection.
      const request = (async () => transcribeBuffer(batch, { timestamps: flags.timestamps, diarize: flags.diarize }))();
      // The real handler is attached only when this chunk's turn comes up in
      // the chain; mark the promise as handled until then.
      request.catch(() => {});

      outputChain = outputChain
        .then(() => request)
        .then((result) => writeResult(result, idx, duration))
        .catch((err) => {
          process.stderr.write(`Chunk ${idx} error: ${err.message}\n`);
        });
    };

    recProc.stdout.on('data', (chunk) => {
      chunks.push(chunk);
      if (peakAmplitude(chunk) > SILENCE_THRESHOLD) {
        chunkHasAudio = true;
        heardSound = true;
        lastSoundTime = Date.now();
      }
    });

    const checkTimer = setInterval(() => {
      const silenceMs = Date.now() - lastSoundTime;

      // Pause: send current chunk for transcription, keep recording
      if (chunkHasAudio && silenceMs > flags.pause * 1000 && chunks.length > 0) {
        flushChunk();
      }

      // Stop: full silence threshold reached
      if (flags.silence > 0 && heardSound && silenceMs > flags.silence * 1000) {
        killRec();
      }
    }, 100);

    let recError;
    try {
      recError = await recDone;
    } finally {
      // Always release the timer and the signal handler, including on failure,
      // so the process can exit and Ctrl+C regains its default behaviour.
      clearInterval(checkTimer);
      process.removeListener('SIGINT', killRec);
    }
    if (recError) throw recError;

    // Send any remaining audio that hasn't been sent yet
    if (chunks.length > 0 && chunkHasAudio) {
      flushChunk();
    }

    // Wait for every chunk (in-flight and final) to be written, in order
    await outputChain;

    // Final newline for --no-newline so shell prompt starts on a new line
    if (flags.noNewline && !flags.json) process.stdout.write('\n');

    return EXIT_OK;
  } catch (err) {
    process.stderr.write(`Error: ${err.message}\n`);
    return EXIT_TRANSCRIPTION;
  }
}
838
1293
 
@@ -863,6 +1318,14 @@ async function main() {
863
1318
  quiet: args.includes('--quiet') || args.includes('-q'),
864
1319
  noInput: args.includes('--no-input'),
865
1320
  setup: args.includes('--setup') || args[0] === 'setup',
1321
+ stream: args.includes('--stream'),
1322
+ silence: args.includes('--silence') ? (Number.isFinite(parseFloat(args[args.indexOf('--silence') + 1])) ? parseFloat(args[args.indexOf('--silence') + 1]) : 2.0) : 2.0,
1323
+ pause: args.includes('--pause') ? parseFloat(args[args.indexOf('--pause') + 1]) || 1.0 : 1.0,
1324
+ language: args.includes('--language') ? args[args.indexOf('--language') + 1] || '' : '',
1325
+ file: args.includes('--file') ? args[args.indexOf('--file') + 1] || '' : '',
1326
+ noNewline: args.includes('--no-newline') || args.includes('-n'),
1327
+ timestamps: args.includes('--timestamps') ? args[args.indexOf('--timestamps') + 1] || '' : '',
1328
+ diarize: args.includes('--diarize'),
866
1329
  };
867
1330
 
868
1331
  if (args.includes('--version')) {
@@ -903,6 +1366,14 @@ Options:
903
1366
  --update Update to latest version
904
1367
  --json Record once, output JSON to stdout
905
1368
  -q, --quiet Record once, print transcript to stdout
1369
+ --stream Stream transcription chunks on pauses
1370
+ --file <path> Transcribe an audio file (no mic needed)
1371
+ --silence <seconds> Silence duration before auto-stop (default: 2.0)
1372
+ --pause <seconds> Pause duration to split chunks (default: 1.0)
1373
+ --language <code> Language code, e.g. en, de, fr (default: auto)
1374
+ -n, --no-newline Join stream chunks without newlines
1375
+ --timestamps <granularity> Add timestamps: segment, word, or segment,word
1376
+ --diarize Enable speaker identification
906
1377
  --no-input Fail if config is missing (no wizard)
907
1378
  --no-color Disable colored output
908
1379
  --version Show version
@@ -920,8 +1391,15 @@ Examples:
920
1391
  dikt setup Reconfigure API key and model
921
1392
  dikt -q Record once, print transcript to stdout
922
1393
  dikt --json Record once, output JSON to stdout
1394
+ dikt -q --silence 5 Wait longer before auto-stopping
1395
+ dikt --stream Stream chunks as you speak
1396
+ dikt --stream --json Stream chunks as JSON Lines
923
1397
  dikt -q | claude Dictate a prompt to Claude Code
924
1398
  dikt update Update to the latest version
1399
+ dikt --file meeting.wav Transcribe an existing audio file
1400
+ dikt --stream --silence 0 Stream continuously until Ctrl+C
1401
+ dikt --stream -n Stream as continuous flowing text
1402
+ dikt -q --json --diarize Transcribe with speaker labels
925
1403
 
926
1404
  Environment variables:
927
1405
  DIKT_API_KEY Override API key from config
@@ -942,8 +1420,6 @@ Requires: sox (brew install sox)`);
942
1420
  process.exit(EXIT_OK);
943
1421
  }
944
1422
 
945
- checkSox();
946
-
947
1423
  // Load or setup config
948
1424
  if (flags.setup) {
949
1425
  checkTTY();
@@ -961,6 +1437,9 @@ Requires: sox (brew install sox)`);
961
1437
  }
962
1438
 
963
1439
  applyEnvOverrides(config);
1440
+ if (flags.language) config.language = flags.language;
1441
+ if (!flags.timestamps && config.timestamps) flags.timestamps = config.timestamps;
1442
+ if (!flags.diarize && config.diarize) flags.diarize = true;
964
1443
 
965
1444
  const validation = validateConfig(config);
966
1445
  if (!validation.valid) {
@@ -970,6 +1449,33 @@ Requires: sox (brew install sox)`);
970
1449
  process.exit(EXIT_CONFIG);
971
1450
  }
972
1451
 
1452
+ // Validate incompatible flag combinations
1453
+ const lang = config.language;
1454
+ if (lang && flags.timestamps) {
1455
+ process.stderr.write('Error: --timestamps and --language cannot be used together\n');
1456
+ process.exit(EXIT_CONFIG);
1457
+ }
1458
+ if (lang && flags.diarize) {
1459
+ process.stderr.write('Error: --diarize and --language cannot be used together\n');
1460
+ process.exit(EXIT_CONFIG);
1461
+ }
1462
+ if (flags.diarize && flags.stream) {
1463
+ process.stderr.write('Error: --diarize is not compatible with --stream, use -q --diarize instead\n');
1464
+ process.exit(EXIT_CONFIG);
1465
+ }
1466
+
1467
+ // File mode: transcribe an existing audio file (no sox needed)
1468
+ if (flags.file) {
1469
+ process.exit(await runFile(flags));
1470
+ }
1471
+
1472
+ checkSox();
1473
+
1474
+ // Stream mode: chunked transcription on pauses
1475
+ if (flags.stream) {
1476
+ process.exit(await runStream(flags));
1477
+ }
1478
+
973
1479
  // Single-shot mode: record once, output, exit
974
1480
  if (flags.json || flags.quiet) {
975
1481
  process.exit(await runOnce(flags));
@@ -978,6 +1484,10 @@ Requires: sox (brew install sox)`);
978
1484
  // Interactive TUI mode
979
1485
  checkTTY();
980
1486
 
1487
+ // Clear any setup wizard output before entering alt screen, so it doesn't
1488
+ // leak back when the alt screen exits.
1489
+ process.stdout.write(CLEAR_SCREEN);
1490
+
981
1491
  // Enter raw TUI mode (alternate screen buffer prevents scrollback corruption)
982
1492
  process.stdout.write(ALT_SCREEN_ON + HIDE_CURSOR + CLEAR_SCREEN);
983
1493
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "dikt",
3
- "version": "1.0.2",
3
+ "version": "1.1.0",
4
4
  "description": "Voice dictation for the terminal.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -17,7 +17,9 @@
17
17
  "cli",
18
18
  "terminal",
19
19
  "whisper",
20
- "mistral"
20
+ "mistral",
21
+ "diarization",
22
+ "voxtral"
21
23
  ],
22
24
  "author": "johxyz",
23
25
  "repository": {