npm - dikt - Versions diffs - 1.0.2 → 1.1.0 - Mend

dikt 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/cli.mjs +661 -151
package/package.json +4 -2

package/cli.mjs CHANGED Viewed

@@ -17,6 +17,9 @@ let DIM = `${ESC}2m`;
 let RED = `${ESC}31m`;
 let GREEN = `${ESC}32m`;
 let YELLOW = `${ESC}33m`;
+let BLUE = `${ESC}34m`;
+let MAGENTA = `${ESC}35m`;
+let CYAN = `${ESC}36m`;
 let GREY = `${ESC}90m`;
 let WHITE = `${ESC}37m`;
 let RED_BG = `${ESC}41m`;
@@ -29,14 +32,14 @@ const ALT_SCREEN_ON = `${ESC}?1049h`;
 const ALT_SCREEN_OFF = `${ESC}?1049l`;
 if (process.env.NO_COLOR != null || process.env.TERM === 'dumb' || process.argv.includes('--no-color')) {
-  RESET = BOLD = DIM = RED = GREEN = YELLOW = GREY = WHITE = RED_BG = '';
+  RESET = BOLD = DIM = RED = GREEN = YELLOW = BLUE = MAGENTA = CYAN = GREY = WHITE = RED_BG = '';
 }
 const moveTo = (row, col = 1) => `${ESC}${row};${col}H`;
 // ── Constants ─────────────────────────────────────────────────────────────────
-const VERSION = '1.0.2';
+const VERSION = '1.1.0';
 const CONFIG_BASE = process.env.XDG_CONFIG_HOME || path.join(os.homedir(), '.config');
 const CONFIG_DIR = path.join(CONFIG_BASE, 'dikt');
 const CONFIG_FILE = path.join(CONFIG_DIR, 'config.json');
@@ -88,51 +91,179 @@ function validateConfig(cfg) {
   return { valid: errors.length === 0, errors };
 }
-// ── Secret input ──────────────────────────────────────────────────────────────
+// ── Setup wizard (form-based) ─────────────────────────────────────────────────
+const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word', 'segment,word': 'both' };
+const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word', 'both': 'segment,word' };
+async function setupWizard() {
+  const existing = loadConfig() || {};
+  const fields = [
+    { key: 'apiKey', label: 'API key', type: 'secret', value: '', display: existing.apiKey ? '••••' + existing.apiKey.slice(-4) : '', fallback: existing.apiKey || '' },
+    { key: 'model', label: 'Model', type: 'text', value: '', display: existing.model || 'voxtral-mini-latest', fallback: existing.model || 'voxtral-mini-latest' },
+    { key: 'language', label: 'Language', type: 'text', value: '', display: existing.language || 'auto', fallback: existing.language || '' },
+    { key: 'temperature', label: 'Temperature', type: 'text', value: '', display: existing.temperature != null ? String(existing.temperature) : 'default', fallback: existing.temperature != null ? String(existing.temperature) : '' },
+    { key: 'contextBias', label: 'Context bias', type: 'text', value: '', display: existing.contextBias || '', fallback: existing.contextBias || '' },
+    { key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word', 'both'], idx: ['off', 'segment', 'word', 'both'].indexOf(TIMESTAMPS_DISPLAY[existing.timestamps || ''] || 'off') },
+    { key: 'diarize', label: 'Diarize', type: 'select', options: ['off', 'on'], idx: existing.diarize ? 1 : 0 },
+  ];
+  const LABEL_W = 15; // right-align labels to this width
+  let active = 0;
+  let editing = false; // true when typing into a text/secret field
+  let inputBuf = '';
+  function renderForm() {
+    let out = `\x1b[H\x1b[2J`; // move home + clear screen
+    out += `\n${BOLD} dikt — setup${RESET}\n`;
+    // Contextual hint
+    const f = fields[active];
+    if (f.type === 'select') {
+      out += `  ${DIM}Tab/arrows to change, Enter to confirm${RESET}\n`;
+    } else if (editing) {
+      out += `  ${DIM}Type to ${f.type === 'secret' ? 'enter' : 'change'}, Enter to confirm${RESET}\n`;
+    } else {
+      out += `  ${DIM}Enter to keep default, or start typing to change${RESET}\n`;
+    }
+    out += '\n';
+    for (let i = 0; i < fields.length; i++) {
+      const fi = fields[i];
+      const label = fi.label.padStart(LABEL_W);
+      const isActive = i === active;
+      const marker = isActive ? `${GREEN}>${RESET}` : ' ';
+      if (fi.type === 'select') {
+        const parts = fi.options.map((opt, j) => {
+          if (isActive) {
+            return j === fi.idx ? `${BOLD}${GREEN}${opt}${RESET}` : `${DIM}${opt}${RESET}`;
+          }
+          return j === fi.idx ? opt : `${DIM}${opt}${RESET}`;
+        });
+        out += `${marker} ${isActive ? BOLD : DIM}${label}${RESET}  ${parts.join('   ')}\n`;
+      } else {
+        let valueStr;
+        if (isActive && editing) {
+          valueStr = fi.type === 'secret'
+            ? `${GREEN}${'•'.repeat(inputBuf.length)}${RESET}█`
+            : `${GREEN}${inputBuf}${RESET}█`;
+        } else if (isActive && !editing) {
+          valueStr = `${DIM}${fi.display}${RESET}`;
+        } else {
+          // Show confirmed value or default
+          const show = fi.value || fi.display;
+          valueStr = fi.value
+            ? (fi.type === 'secret' ? '••••' + fi.value.slice(-4) : fi.value)
+            : `${DIM}${show}${RESET}`;
+        }
+        out += `${marker} ${isActive ? BOLD : DIM}${label}${RESET}  ${valueStr}\n`;
+      }
+    }
+    process.stderr.write(out);
+  }
-function readSecret(prompt) {
   return new Promise((resolve) => {
-    process.stderr.write(prompt);
     const { stdin } = process;
     stdin.setRawMode(true);
     stdin.resume();
     stdin.setEncoding('utf8');
-    let secret = '';
+    renderForm();
-    const cleanup = () => {
-      stdin.removeListener('data', onData);
-      stdin.setRawMode(false);
-      stdin.pause();
-    };
+    function advance() {
+      const f = fields[active];
+      // Commit text/secret field value
+      if (f.type !== 'select') {
+        if (inputBuf.trim()) {
+          f.value = inputBuf.trim();
+        } else {
+          f.value = f.fallback;
+        }
+        // Validate API key
+        if (f.key === 'apiKey' && !f.value) {
+          editing = false;
+          inputBuf = '';
+          renderForm();
+          process.stderr.write(`\n  ${RED}API key is required.${RESET}\n`);
+          return; // stay on this field
+        }
+        editing = false;
+        inputBuf = '';
+      }
+      active++;
+      if (active >= fields.length) {
+        // Save and exit
+        stdin.removeListener('data', onData);
+        stdin.setRawMode(false);
+        stdin.pause();
+        const ts = fields.find(f => f.key === 'timestamps');
+        const di = fields.find(f => f.key === 'diarize');
+        const tsValue = TIMESTAMPS_VALUE[ts.options[ts.idx]];
+        const diValue = di.options[di.idx] === 'on';
+        const lang = fields.find(f => f.key === 'language').value;
+        const tempVal = fields.find(f => f.key === 'temperature').value;
+        const cfg = {
+          apiKey: fields.find(f => f.key === 'apiKey').value,
+          model: fields.find(f => f.key === 'model').value,
+          language: lang === 'auto' ? '' : lang,
+          temperature: tempVal && tempVal !== 'default' ? parseFloat(tempVal) : null,
+          contextBias: fields.find(f => f.key === 'contextBias').value,
+          autoCopy: existing.autoCopy || false,
+          timestamps: tsValue,
+          diarize: diValue,
+        };
+        saveConfig(cfg);
+        process.stderr.write(`\n  ${GREEN}✓${RESET} Saved to ${DIM}${CONFIG_FILE}${RESET}\n\n`);
+        resolve(cfg);
+        return;
+      }
+      renderForm();
+    }
     const onData = (ch) => {
-      switch (ch) {
-        case '\n':
-        case '\r':
-        case '\u0004': // Ctrl+D
-          cleanup();
-          process.stderr.write('\n');
-          resolve(secret);
-          break;
-        case '\u0003': // Ctrl+C
-          cleanup();
-          process.stderr.write('\n');
-          process.exit(EXIT_CONFIG);
-          break;
-        case '\u007F': // Backspace (macOS)
-        case '\b':     // Backspace
-          if (secret.length > 0) {
-            secret = secret.slice(0, -1);
-            process.stderr.write('\b \b');
-          }
-          break;
-        default:
-          if (ch.charCodeAt(0) >= 32) {
-            secret += ch;
-            process.stderr.write('*'.repeat(ch.length));
+      const f = fields[active];
+      // Ctrl+C — exit
+      if (ch === '\u0003') {
+        stdin.removeListener('data', onData);
+        stdin.setRawMode(false);
+        stdin.pause();
+        process.stderr.write('\n');
+        process.exit(EXIT_CONFIG);
+      }
+      if (f.type === 'select') {
+        if (ch === '\t' || ch === '\x1b[C' || ch === '\x1b[B') { // Tab, Right, Down
+          f.idx = (f.idx + 1) % f.options.length;
+          renderForm();
+        } else if (ch === '\x1b[D' || ch === '\x1b[A') { // Left, Up
+          f.idx = (f.idx - 1 + f.options.length) % f.options.length;
+          renderForm();
+        } else if (ch === '\n' || ch === '\r') {
+          advance();
+        }
+      } else {
+        // text / secret field
+        if (ch === '\n' || ch === '\r') {
+          advance();
+        } else if (ch === '\u007F' || ch === '\b') { // Backspace
+          if (inputBuf.length > 0) {
+            inputBuf = inputBuf.slice(0, -1);
+            if (!inputBuf) editing = false;
+            renderForm();
           }
-          break;
+        } else if (ch.charCodeAt(0) >= 32 && !ch.startsWith('\x1b')) {
+          if (!editing) editing = true;
+          inputBuf += ch;
+          renderForm();
+        }
       }
     };
@@ -140,41 +271,6 @@ function readSecret(prompt) {
   });
 }
-// ── Setup wizard ──────────────────────────────────────────────────────────────
-async function setupWizard() {
-  const existing = loadConfig() || {};
-  process.stderr.write(`\n${BOLD} dikt — setup${RESET}\n`);
-  process.stderr.write(`  ${DIM}Press Enter to keep the default shown in brackets.${RESET}\n\n`);
-  const apiKey = (await readSecret(`  Mistral API key [${existing.apiKey ? '••••' + existing.apiKey.slice(-4) : ''}]: `)).trim()
-    || existing.apiKey || '';
-  if (!apiKey) {
-    process.stderr.write(`\n  ${RED}API key is required.${RESET}\n\n`);
-    process.exit(EXIT_CONFIG);
-  }
-  const rl = readline.createInterface({ input: process.stdin, output: process.stderr });
-  const ask = (q) => new Promise((res) => rl.question(q, res));
-  const model = (await ask(`  Model [${existing.model || 'voxtral-mini-latest'}]: `)).trim()
-    || existing.model || 'voxtral-mini-latest';
-  const language = (await ask(`  Language [${existing.language || 'auto'}]: `)).trim()
-    || existing.language || '';
-  const tempStr = (await ask(`  Temperature [${existing.temperature ?? 'default'}]: `)).trim();
-  const temperature = tempStr ? parseFloat(tempStr) : (existing.temperature ?? null);
-  const contextBias = (await ask(`  Context bias [${existing.contextBias || ''}]: `)).trim()
-    || existing.contextBias || '';
-  rl.close();
-  const cfg = { apiKey, model, language: language === 'auto' ? '' : language, temperature, contextBias, autoCopy: existing.autoCopy || false };
-  saveConfig(cfg);
-  process.stderr.write(`\n  ${GREEN}✓${RESET} Saved to ${DIM}${CONFIG_FILE}${RESET}\n\n`);
-  return cfg;
-}
 // ── Prerequisites ─────────────────────────────────────────────────────────────
 function checkSox() {
@@ -236,8 +332,13 @@ function getTermWidth() {
 function render() {
   const w = getTermWidth();
   const header = ` dikt`;
-  const right = `[?] [q]uit `;
-  const pad = Math.max(0, w - header.length - right.length);
+  const tags = [];
+  if (config.diarize) tags.push('diarize');
+  if (config.timestamps) tags.push('timestamps');
+  const tagStr = tags.length ? `  ${DIM}${tags.join(' · ')}${RESET}` : '';
+  const tagPlain = tags.length ? `  ${tags.join(' · ')}` : '';
+  const right = `[s]etup [?] [q]uit `;
+  const pad = Math.max(0, w - header.length - tagPlain.length - right.length);
   let out = moveTo(1);
@@ -251,7 +352,7 @@ function render() {
     out += CLEAR_LINE + '\n';
     out += renderHelp();
   } else {
-    out += CLEAR_LINE + BOLD + header + ' '.repeat(pad) + DIM + right + RESET + '\n';
+    out += CLEAR_LINE + BOLD + header + RESET + tagStr + ' '.repeat(pad) + DIM + right + RESET + '\n';
     out += CLEAR_LINE + ` ${'─'.repeat(Math.max(0, w - 2))}` + '\n';
     out += CLEAR_LINE + '\n';
     out += CLEAR_LINE + renderKeybar() + '\n';
@@ -270,7 +371,9 @@ function render() {
       const rows = process.stdout.rows || 24;
       const availableRows = rows - 9; // header(2) + blank + keybar + blank + status + blank + meta + cleardown
       if (availableRows > 0 && lines.length > availableRows) {
-        lines = lines.slice(lines.length - availableRows);
+        const hidden = lines.length - availableRows + 1; // +1 to make room for the hint
+        lines = lines.slice(lines.length - availableRows + 1);
+        lines.unshift(`   ${DIM}↑ ${hidden} more line${hidden === 1 ? '' : 's'} above${RESET}`);
       }
       for (const line of lines) {
         out += CLEAR_LINE + line + '\n';
@@ -326,6 +429,46 @@ function wrapTranscript(termWidth) {
   if (!text) return [];
   const indent = '   ';
   const maxLen = termWidth - indent.length - 1; // leave 1 col margin
+  // Diarized transcript: each line is already formatted with speaker labels + ANSI colors.
+  // Handle each speaker line independently — no quotes, just indent and wrap.
+  if (config.diarize && text.includes('\n')) {
+    const result = [];
+    for (const speakerLine of text.split('\n')) {
+      if (!speakerLine) continue;
+      // ANSI codes mess up length calculation — strip them for measuring
+      const plain = speakerLine.replace(/\x1b\[[0-9;]*m/g, '');
+      if (plain.length <= maxLen || maxLen < 10) {
+        result.push(`${indent}${speakerLine}`);
+      } else {
+        // Wrap long speaker lines: first line keeps the label, continuation lines get extra indent
+        const labelMatch = plain.match(/^([A-Z]\s{2})/);
+        const contIndent = labelMatch ? ' '.repeat(labelMatch[1].length) : '';
+        const words = speakerLine.split(/(\s+)/);
+        let cur = '';
+        let curPlain = '';
+        let first = true;
+        for (const word of words) {
+          const wordPlain = word.replace(/\x1b\[[0-9;]*m/g, '');
+          if (curPlain.length + wordPlain.length > maxLen && curPlain.length > 0) {
+            result.push(`${indent}${cur}`);
+            cur = first ? contIndent : '';
+            curPlain = first ? contIndent : '';
+            first = false;
+            const trimmed = word.replace(/^\s+/, '');
+            cur += trimmed;
+            curPlain += trimmed.replace(/\x1b\[[0-9;]*m/g, '');
+          } else {
+            cur += word;
+            curPlain += wordPlain;
+          }
+        }
+        if (cur) result.push(`${indent}${first ? '' : contIndent}${cur}`);
+      }
+    }
+    return result;
+  }
   if (maxLen < 10) return [`${indent}${text}`];
   const words = text.split(/(\s+)/);
@@ -564,60 +707,40 @@ async function transcribe(wavPath) {
   try {
     const blob = await fs.openAsBlob(wavPath, { type: 'audio/wav' });
     const file = new File([blob], 'recording.wav', { type: 'audio/wav' });
-    const fd = new FormData();
-    fd.append('file', file);
-    fd.append('model', config.model);
-    if (config.language) fd.append('language', config.language);
-    if (config.temperature != null) fd.append('temperature', String(config.temperature));
-    if (config.contextBias) fd.append('context_bias', config.contextBias);
     const t0 = Date.now();
-    const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
-      method: 'POST',
-      headers: { Authorization: `Bearer ${config.apiKey}` },
-      body: fd,
+    const result = await callTranscribeAPI(file, {
       signal: AbortSignal.timeout(30_000),
+      timestamps: config.timestamps || '',
+      diarize: config.diarize || false,
     });
     state.latency = Date.now() - t0;
-    if (!resp.ok) {
-      const raw = await resp.text().catch(() => '');
-      let msg;
-      try {
-        const e = JSON.parse(raw);
-        msg = e.message;
-        if (!msg && Array.isArray(e.detail)) {
-          msg = e.detail.map(d => [d.loc?.join('.'), d.msg].filter(Boolean).join(': ')).join('; ');
-        } else if (!msg && e.detail) {
-          msg = typeof e.detail === 'string' ? e.detail : JSON.stringify(e.detail);
-        }
-        if (!msg) msg = raw;
-      } catch {
-        msg = raw || `HTTP ${resp.status}`;
-      }
-      if (resp.status === 401) msg += ' — press [s] to reconfigure';
-      throw new Error(msg);
-    }
-    const data = await resp.json();
-    const text = (data.text || '').trim();
+    const text = result.text;
     if (!text) {
       state.mode = 'error';
       state.error = 'No speech detected';
     } else {
-      state.transcript = text;
+      // Format with speaker labels if diarization is active
+      if (config.diarize && result.segments) {
+        state.transcript = formatDiarizedText(result.segments, { color: true });
+      } else {
+        state.transcript = text;
+      }
       state.wordCount = text.split(/\s+/).filter(Boolean).length;
       state.mode = 'ready';
       // Push to history
-      state.history.unshift({ transcript: text, wordCount: state.wordCount, duration: state.duration, latency: state.latency });
+      state.history.unshift({ transcript: state.transcript, wordCount: state.wordCount, duration: state.duration, latency: state.latency });
       if (state.history.length > MAX_HISTORY) state.history.pop();
       state.historyIndex = -1;
     }
   } catch (err) {
     state.mode = 'error';
-    state.error = err.name === 'TimeoutError' ? 'Transcription timed out' : err.message;
+    let msg = err.name === 'TimeoutError' ? 'Transcription timed out' : err.message;
+    if (err.status === 401) msg += ' — press [s] to reconfigure';
+    state.error = msg;
   } finally {
     clearInterval(state.spinnerInterval);
     cleanupRecFile();
@@ -746,29 +869,265 @@ async function runSetup() {
   renderAll();
 }
+// ── Audio helpers ─────────────────────────────────────────────────────────────
+const SILENCE_THRESHOLD = Math.round(32768 * 0.01); // 1% of max 16-bit amplitude
+function createWavHeader(dataSize) {
+  const buf = Buffer.alloc(44);
+  buf.write('RIFF', 0);
+  buf.writeUInt32LE(36 + dataSize, 4);
+  buf.write('WAVE', 8);
+  buf.write('fmt ', 12);
+  buf.writeUInt32LE(16, 16);
+  buf.writeUInt16LE(1, 20);         // PCM
+  buf.writeUInt16LE(1, 22);         // mono
+  buf.writeUInt32LE(16000, 24);     // sample rate
+  buf.writeUInt32LE(32000, 28);     // byte rate (16000 * 1 * 2)
+  buf.writeUInt16LE(2, 32);         // block align
+  buf.writeUInt16LE(16, 34);        // bits per sample
+  buf.write('data', 36);
+  buf.writeUInt32LE(dataSize, 40);
+  return buf;
+}
+function peakAmplitude(chunk) {
+  let peak = 0;
+  for (let i = 0; i < chunk.length - 1; i += 2) {
+    const abs = Math.abs(chunk.readInt16LE(i));
+    if (abs > peak) peak = abs;
+  }
+  return peak;
+}
+function trimSilence(rawData) {
+  const SAMPLE_RATE = 16000;
+  const BYTES_PER_SAMPLE = 2;
+  const WINDOW_SAMPLES = Math.round(SAMPLE_RATE * 0.05); // 50ms windows
+  const WINDOW_BYTES = WINDOW_SAMPLES * BYTES_PER_SAMPLE;
+  const MAX_SILENCE_WINDOWS = Math.round(1.0 / 0.05); // 1 second = 20 windows
+  const PAD_WINDOWS = Math.round(0.1 / 0.05); // 100ms padding = 2 windows
+  const windows = [];
+  for (let offset = 0; offset + WINDOW_BYTES <= rawData.length; offset += WINDOW_BYTES) {
+    windows.push(rawData.subarray(offset, offset + WINDOW_BYTES));
+  }
+  // Include any trailing partial window
+  const remainder = rawData.length % WINDOW_BYTES;
+  if (remainder > 0) {
+    windows.push(rawData.subarray(rawData.length - remainder));
+  }
+  const output = [];
+  let silentCount = 0;
+  for (const win of windows) {
+    const peak = peakAmplitude(win);
+    if (peak < SILENCE_THRESHOLD) {
+      silentCount++;
+      if (silentCount <= MAX_SILENCE_WINDOWS) {
+        output.push(win);
+      } else if (silentCount === MAX_SILENCE_WINDOWS + 1) {
+        // Replace excess silence with padding
+        const padBytes = PAD_WINDOWS * WINDOW_BYTES;
+        output.push(Buffer.alloc(padBytes)); // zeros = silence
+      }
+      // else: skip (already added padding)
+    } else {
+      silentCount = 0;
+      output.push(win);
+    }
+  }
+  return Buffer.concat(output);
+}
+async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
+  const fd = new FormData();
+  fd.append('file', file);
+  fd.append('model', config.model);
+  if (config.language) fd.append('language', config.language);
+  if (config.temperature != null) fd.append('temperature', String(config.temperature));
+  if (config.contextBias) fd.append('context_bias', config.contextBias);
+  if (timestamps) {
+    for (const g of timestamps.split(',')) fd.append('timestamp_granularities[]', g.trim());
+  }
+  if (diarize) {
+    fd.append('diarize', 'true');
+    // API requires segment timestamps when diarize is enabled
+    if (!timestamps) fd.append('timestamp_granularities[]', 'segment');
+  }
+  const t0 = Date.now();
+  const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
+    method: 'POST',
+    headers: { Authorization: `Bearer ${config.apiKey}` },
+    body: fd,
+    signal: signal || AbortSignal.timeout(30_000),
+  });
+  const latency = Date.now() - t0;
+  if (!resp.ok) {
+    const raw = await resp.text().catch(() => '');
+    let msg;
+    try {
+      const e = JSON.parse(raw);
+      msg = e.message;
+      if (typeof msg === 'object' && msg !== null) msg = JSON.stringify(msg);
+      if (!msg && Array.isArray(e.detail)) {
+        msg = e.detail.map(d => [d.loc?.join('.'), d.msg].filter(Boolean).join(': ')).join('; ');
+      } else if (!msg && e.detail) {
+        msg = typeof e.detail === 'string' ? e.detail : JSON.stringify(e.detail);
+      }
+      if (!msg) msg = raw;
+    } catch {
+      msg = raw || `HTTP ${resp.status}`;
+    }
+    const err = new Error(msg);
+    err.status = resp.status;
+    throw err;
+  }
+  const data = await resp.json();
+  const text = (data.text || '').trim();
+  return { text, latency, segments: data.segments, words: data.words };
+}
+async function transcribeBuffer(rawChunks, { signal, timestamps, diarize } = {}) {
+  const rawData = Buffer.concat(rawChunks);
+  const trimmed = trimSilence(rawData);
+  const wavData = Buffer.concat([createWavHeader(trimmed.length), trimmed]);
+  const blob = new Blob([wavData], { type: 'audio/wav' });
+  const file = new File([blob], 'recording.wav', { type: 'audio/wav' });
+  return callTranscribeAPI(file, { signal, timestamps, diarize });
+}
+// ── Output formatting helpers ─────────────────────────────────────────────────
+const SPEAKER_COLORS = [GREEN, YELLOW, CYAN, MAGENTA, BLUE, RED];
+function formatDiarizedText(segments, { color = false } = {}) {
+  if (!segments || !segments.length) return '';
+  // Map speaker IDs to short letters (A, B, C, ...)
+  const speakerMap = new Map();
+  for (const s of segments) {
+    if (s.speaker_id != null && !speakerMap.has(s.speaker_id)) {
+      speakerMap.set(s.speaker_id, speakerMap.size);
+    }
+  }
+  // Merge consecutive segments from the same speaker
+  const merged = [];
+  for (const s of segments) {
+    const text = (s.text || '').trim();
+    if (!text) continue;
+    const last = merged[merged.length - 1];
+    if (last && last.speaker_id === s.speaker_id) {
+      last.text += ' ' + text;
+    } else {
+      merged.push({ speaker_id: s.speaker_id, text });
+    }
+  }
+  return merged.map(s => {
+    const idx = speakerMap.get(s.speaker_id) ?? 0;
+    const letter = String.fromCharCode(65 + idx); // A, B, C, ...
+    if (color) {
+      const c = SPEAKER_COLORS[idx % SPEAKER_COLORS.length];
+      return `${c}${BOLD}${letter}${RESET}  ${s.text}`;
+    }
+    return `${letter}  ${s.text}`;
+  }).join('\n');
+}
+function buildJsonOutput(base, { segments, words, timestamps, diarize } = {}) {
+  const out = { ...base, timestamp: new Date().toISOString() };
+  if ((timestamps || diarize) && segments) out.segments = segments;
+  if (timestamps && words) out.words = words;
+  return out;
+}
+// ── File mode ────────────────────────────────────────────────────────────────
+async function runFile(flags) {
+  try {
+    if (!flags.file || !fs.existsSync(flags.file)) {
+      process.stderr.write(`Error: file not found: ${flags.file}\n`);
+      return EXIT_TRANSCRIPTION;
+    }
+    const blob = await fs.openAsBlob(flags.file);
+    const ext = path.extname(flags.file).slice(1) || 'wav';
+    const mimeTypes = { wav: 'audio/wav', mp3: 'audio/mpeg', m4a: 'audio/mp4', ogg: 'audio/ogg', flac: 'audio/flac', webm: 'audio/webm' };
+    const mime = mimeTypes[ext] || 'audio/wav';
+    const file = new File([blob], path.basename(flags.file), { type: mime });
+    const result = await callTranscribeAPI(file, { timestamps: flags.timestamps, diarize: flags.diarize });
+    if (!result.text) {
+      process.stderr.write('No speech detected\n');
+      return EXIT_TRANSCRIPTION;
+    }
+    const wordCount = result.text.split(/\s+/).filter(Boolean).length;
+    if (flags.json) {
+      const out = buildJsonOutput(
+        { text: result.text, latency: result.latency, words: wordCount },
+        { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
+      );
+      process.stdout.write(JSON.stringify(out) + '\n');
+    } else if (flags.diarize && result.segments) {
+      process.stdout.write(formatDiarizedText(result.segments) + '\n');
+    } else {
+      process.stdout.write(result.text + '\n');
+    }
+    return EXIT_OK;
+  } catch (err) {
+    process.stderr.write(`Error: ${err.message}\n`);
+    return EXIT_TRANSCRIPTION;
+  }
+}
 // ── Single-shot mode ──────────────────────────────────────────────────────────
 async function runOnce(flags) {
-  const recFile = path.join(os.tmpdir(), `dikt-${Date.now()}.wav`);
   try {
-    // Record with silence detection via sox silence effect
+    // Record raw PCM to stdout — silence detection handled in Node.js
     const recProc = spawn('rec', [
-      '-q', '-r', '16000', '-c', '1', '-b', '16',
-      recFile,
-      'silence', '1', '0.1', '1%', '1', '2.0', '1%',
+      '-q', '-r', '16000', '-c', '1', '-b', '16', '-t', 'raw', '-',
     ], {
-      stdio: ['ignore', 'ignore', 'pipe'],
+      stdio: ['ignore', 'pipe', 'pipe'],
     });
     recProc.stderr.on('data', () => {});
-    // Ctrl+C stops recording gracefully
     const sigHandler = () => recProc.kill('SIGTERM');
     process.on('SIGINT', sigHandler);
+    const chunks = [];
+    let heardSound = false;
+    let lastSoundTime = Date.now();
     const recStart = Date.now();
+    recProc.stdout.on('data', (chunk) => {
+      chunks.push(chunk);
+      if (peakAmplitude(chunk) > SILENCE_THRESHOLD) {
+        heardSound = true;
+        lastSoundTime = Date.now();
+      }
+    });
+    const silenceTimer = setInterval(() => {
+      if (flags.silence > 0 && heardSound && Date.now() - lastSoundTime > flags.silence * 1000) {
+        recProc.kill('SIGTERM');
+      }
+    }, 100);
     await new Promise((resolve) => recProc.on('close', resolve));
+    clearInterval(silenceTimer);
     process.removeListener('SIGINT', sigHandler);
     const duration = (Date.now() - recStart) / 1000;
@@ -782,45 +1141,26 @@ async function runOnce(flags) {
     const abortHandler = () => ac.abort();
     process.on('SIGINT', abortHandler);
-    const blob = await fs.openAsBlob(recFile, { type: 'audio/wav' });
-    const file = new File([blob], 'recording.wav', { type: 'audio/wav' });
-    const fd = new FormData();
-    fd.append('file', file);
-    fd.append('model', config.model);
-    if (config.language) fd.append('language', config.language);
-    if (config.temperature != null) fd.append('temperature', String(config.temperature));
-    if (config.contextBias) fd.append('context_bias', config.contextBias);
-    const t0 = Date.now();
-    const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
-      method: 'POST',
-      headers: { Authorization: `Bearer ${config.apiKey}` },
-      body: fd,
-      signal: ac.signal,
-    });
-    const latency = Date.now() - t0;
+    const result = await transcribeBuffer(chunks, { signal: ac.signal, timestamps: flags.timestamps, diarize: flags.diarize });
     process.removeListener('SIGINT', abortHandler);
-    if (!resp.ok) {
-      const raw = await resp.text().catch(() => '');
-      process.stderr.write(`Error: ${raw || `HTTP ${resp.status}`}\n`);
-      return EXIT_TRANSCRIPTION;
-    }
-    const data = await resp.json();
-    const text = (data.text || '').trim();
-    if (!text) {
+    if (!result.text) {
       process.stderr.write('No speech detected\n');
       return EXIT_TRANSCRIPTION;
     }
-    const wordCount = text.split(/\s+/).filter(Boolean).length;
+    const wordCount = result.text.split(/\s+/).filter(Boolean).length;
     if (flags.json) {
-      process.stdout.write(JSON.stringify({ text, duration: parseFloat(duration.toFixed(1)), latency, words: wordCount }) + '\n');
+      const out = buildJsonOutput(
+        { text: result.text, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
+        { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
+      );
+      process.stdout.write(JSON.stringify(out) + '\n');
+    } else if (flags.diarize && result.segments) {
+      process.stdout.write(formatDiarizedText(result.segments) + '\n');
     } else {
-      process.stdout.write(text + '\n');
+      process.stdout.write(result.text + '\n');
     }
     return EXIT_OK;
@@ -831,8 +1171,123 @@ async function runOnce(flags) {
       process.stderr.write(`Error: ${err.message}\n`);
     }
     return EXIT_TRANSCRIPTION;
-  } finally {
-    try { fs.unlinkSync(recFile); } catch {}
+  }
+}
+// ── Stream mode ──────────────────────────────────────────────────────────────
+async function runStream(flags) {
+  try {
+    const recProc = spawn('rec', [
+      '-q', '-r', '16000', '-c', '1', '-b', '16', '-t', 'raw', '-',
+    ], {
+      stdio: ['ignore', 'pipe', 'pipe'],
+    });
+    recProc.stderr.on('data', () => {});
+    let killed = false;
+    const killRec = () => { if (!killed) { killed = true; recProc.kill('SIGTERM'); process.stderr.write('\n'); } };
+    process.on('SIGINT', killRec);
+    let chunks = [];          // current chunk buffer (resets per pause)
+    let chunkHasAudio = false; // current chunk has sound (resets per pause)
+    let heardSound = false;    // ever heard sound (never resets)
+    let lastSoundTime = Date.now();
+    let chunkStart = Date.now();
+    let chunkIndex = 0;
+    const pending = [];
+    recProc.stdout.on('data', (chunk) => {
+      chunks.push(chunk);
+      if (peakAmplitude(chunk) > SILENCE_THRESHOLD) {
+        chunkHasAudio = true;
+        heardSound = true;
+        lastSoundTime = Date.now();
+      }
+    });
+    const checkTimer = setInterval(() => {
+      const silenceMs = Date.now() - lastSoundTime;
+      // Pause: send current chunk for transcription, keep recording
+      if (chunkHasAudio && silenceMs > flags.pause * 1000 && chunks.length > 0) {
+        const batch = chunks;
+        const duration = (Date.now() - chunkStart) / 1000;
+        const idx = chunkIndex++;
+        chunks = [];
+        chunkHasAudio = false;
+        chunkStart = Date.now();
+        const p = transcribeBuffer(batch, { timestamps: flags.timestamps, diarize: flags.diarize })
+          .then((result) => {
+            if (!result.text) return;
+            const wordCount = result.text.split(/\s+/).filter(Boolean).length;
+            if (flags.json) {
+              const out = buildJsonOutput(
+                { text: result.text, chunk: idx, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
+                { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
+              );
+              process.stdout.write(JSON.stringify(out) + '\n');
+            } else if (flags.diarize && result.segments) {
+              const sep = flags.noNewline ? ' ' : '\n';
+              process.stdout.write(formatDiarizedText(result.segments) + sep);
+            } else {
+              process.stdout.write(result.text + (flags.noNewline ? ' ' : '\n'));
+            }
+          })
+          .catch((err) => {
+            process.stderr.write(`Chunk ${idx} error: ${err.message}\n`);
+          });
+        pending.push(p);
+      }
+      // Stop: full silence threshold reached
+      if (flags.silence > 0 && heardSound && silenceMs > flags.silence * 1000) {
+        killRec();
+      }
+    }, 100);
+    await new Promise((resolve) => recProc.on('close', resolve));
+    clearInterval(checkTimer);
+    process.removeListener('SIGINT', killRec);
+    // Send any remaining audio that hasn't been sent yet
+    if (chunks.length > 0 && chunkHasAudio) {
+      const duration = (Date.now() - chunkStart) / 1000;
+      const idx = chunkIndex++;
+      try {
+        const result = await transcribeBuffer(chunks, { timestamps: flags.timestamps, diarize: flags.diarize });
+        if (result.text) {
+          const wordCount = result.text.split(/\s+/).filter(Boolean).length;
+          if (flags.json) {
+            const out = buildJsonOutput(
+              { text: result.text, chunk: idx, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
+              { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
+            );
+            process.stdout.write(JSON.stringify(out) + '\n');
+          } else if (flags.diarize && result.segments) {
+            const sep = flags.noNewline ? ' ' : '\n';
+            process.stdout.write(formatDiarizedText(result.segments) + sep);
+          } else {
+            process.stdout.write(result.text + (flags.noNewline ? ' ' : '\n'));
+          }
+        }
+      } catch (err) {
+        process.stderr.write(`Chunk ${idx} error: ${err.message}\n`);
+      }
+    }
+    // Wait for any in-flight transcriptions to finish
+    await Promise.allSettled(pending);
+    // Final newline for --no-newline so shell prompt starts on a new line
+    if (flags.noNewline && !flags.json) process.stdout.write('\n');
+    return EXIT_OK;
+  } catch (err) {
+    process.stderr.write(`Error: ${err.message}\n`);
+    return EXIT_TRANSCRIPTION;
   }
 }
@@ -863,6 +1318,14 @@ async function main() {
     quiet: args.includes('--quiet') || args.includes('-q'),
     noInput: args.includes('--no-input'),
     setup: args.includes('--setup') || args[0] === 'setup',
+    stream: args.includes('--stream'),
+    silence: args.includes('--silence') ? (Number.isFinite(parseFloat(args[args.indexOf('--silence') + 1])) ? parseFloat(args[args.indexOf('--silence') + 1]) : 2.0) : 2.0,
+    pause: args.includes('--pause') ? parseFloat(args[args.indexOf('--pause') + 1]) || 1.0 : 1.0,
+    language: args.includes('--language') ? args[args.indexOf('--language') + 1] || '' : '',
+    file: args.includes('--file') ? args[args.indexOf('--file') + 1] || '' : '',
+    noNewline: args.includes('--no-newline') || args.includes('-n'),
+    timestamps: args.includes('--timestamps') ? args[args.indexOf('--timestamps') + 1] || '' : '',
+    diarize: args.includes('--diarize'),
   };
   if (args.includes('--version')) {
@@ -903,6 +1366,14 @@ Options:
   --update                   Update to latest version
   --json                     Record once, output JSON to stdout
   -q, --quiet                Record once, print transcript to stdout
+  --stream                   Stream transcription chunks on pauses
+  --file <path>              Transcribe an audio file (no mic needed)
+  --silence <seconds>        Silence duration before auto-stop (default: 2.0)
+  --pause <seconds>          Pause duration to split chunks (default: 1.0)
+  --language <code>          Language code, e.g. en, de, fr (default: auto)
+  -n, --no-newline           Join stream chunks without newlines
+  --timestamps <granularity> Add timestamps: segment, word, or segment,word
+  --diarize                  Enable speaker identification
   --no-input                 Fail if config is missing (no wizard)
   --no-color                 Disable colored output
   --version                  Show version
@@ -920,8 +1391,15 @@ Examples:
   dikt setup                 Reconfigure API key and model
   dikt -q                    Record once, print transcript to stdout
   dikt --json                Record once, output JSON to stdout
+  dikt -q --silence 5        Wait longer before auto-stopping
+  dikt --stream              Stream chunks as you speak
+  dikt --stream --json       Stream chunks as JSON Lines
   dikt -q | claude           Dictate a prompt to Claude Code
   dikt update                Update to the latest version
+  dikt --file meeting.wav    Transcribe an existing audio file
+  dikt --stream --silence 0  Stream continuously until Ctrl+C
+  dikt --stream -n           Stream as continuous flowing text
+  dikt -q --json --diarize   Transcribe with speaker labels
 Environment variables:
   DIKT_API_KEY               Override API key from config
@@ -942,8 +1420,6 @@ Requires: sox (brew install sox)`);
     process.exit(EXIT_OK);
   }
-  checkSox();
   // Load or setup config
   if (flags.setup) {
     checkTTY();
@@ -961,6 +1437,9 @@ Requires: sox (brew install sox)`);
   }
   applyEnvOverrides(config);
+  if (flags.language) config.language = flags.language;
+  if (!flags.timestamps && config.timestamps) flags.timestamps = config.timestamps;
+  if (!flags.diarize && config.diarize) flags.diarize = true;
   const validation = validateConfig(config);
   if (!validation.valid) {
@@ -970,6 +1449,33 @@ Requires: sox (brew install sox)`);
     process.exit(EXIT_CONFIG);
   }
+  // Validate incompatible flag combinations
+  const lang = config.language;
+  if (lang && flags.timestamps) {
+    process.stderr.write('Error: --timestamps and --language cannot be used together\n');
+    process.exit(EXIT_CONFIG);
+  }
+  if (lang && flags.diarize) {
+    process.stderr.write('Error: --diarize and --language cannot be used together\n');
+    process.exit(EXIT_CONFIG);
+  }
+  if (flags.diarize && flags.stream) {
+    process.stderr.write('Error: --diarize is not compatible with --stream, use -q --diarize instead\n');
+    process.exit(EXIT_CONFIG);
+  }
+  // File mode: transcribe an existing audio file (no sox needed)
+  if (flags.file) {
+    process.exit(await runFile(flags));
+  }
+  checkSox();
+  // Stream mode: chunked transcription on pauses
+  if (flags.stream) {
+    process.exit(await runStream(flags));
+  }
   // Single-shot mode: record once, output, exit
   if (flags.json || flags.quiet) {
     process.exit(await runOnce(flags));
@@ -978,6 +1484,10 @@ Requires: sox (brew install sox)`);
   // Interactive TUI mode
   checkTTY();
+  // Clear any setup wizard output before entering alt screen, so it doesn't
+  // leak back when the alt screen exits.
+  process.stdout.write(CLEAR_SCREEN);
   // Enter raw TUI mode (alternate screen buffer prevents scrollback corruption)
   process.stdout.write(ALT_SCREEN_ON + HIDE_CURSOR + CLEAR_SCREEN);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "dikt",
-  "version": "1.0.2",
+  "version": "1.1.0",
   "description": "Voice dictation for the terminal.",
   "type": "module",
   "bin": {
@@ -17,7 +17,9 @@
     "cli",
     "terminal",
     "whisper",
-    "mistral"
+    "mistral",
+    "diarization",
+    "voxtral"
   ],
   "author": "johxyz",
   "repository": {