dikt 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +88 -7
  2. package/cli.mjs +719 -51
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -10,7 +10,7 @@ Uses [Mistral's Voxtral](https://docs.mistral.ai/capabilities/audio/) for speech
10
10
  npm install -g dikt
11
11
  ```
12
12
 
13
- Requires [sox](https://sox.sourceforge.net/) for audio recording:
13
+ Requires [sox](https://sox.sourceforge.net/) for audio recording (not needed for `--file`):
14
14
 
15
15
  ```bash
16
16
  # macOS
@@ -23,6 +23,11 @@ sudo apt install sox
23
23
  sudo pacman -S sox
24
24
  ```
25
25
 
26
+ Optional dependencies for `--file` mode:
27
+
28
+ - [ffmpeg](https://ffmpeg.org/) — enables compression, chunked transcription of long files, and broader format support
29
+ - [yt-dlp](https://github.com/yt-dlp/yt-dlp) — enables transcribing audio from URLs (YouTube, podcasts, etc.)
30
+
26
31
  ## Setup
27
32
 
28
33
  On first run, dikt will prompt you for your Mistral API key and model preferences:
@@ -55,12 +60,6 @@ This opens an interactive TUI where you can record, transcribe, and copy text.
55
60
  | `?` | Show keybindings |
56
61
  | `q` | Quit |
57
62
 
58
- ### Update
59
-
60
- ```
61
- dikt update
62
- ```
63
-
64
63
  ### Single-shot mode
65
64
 
66
65
  ```bash
@@ -72,6 +71,88 @@ dikt --json
72
71
 
73
72
  # Pipe to another tool
74
73
  dikt -q | claude
74
+
75
+ # Wait longer before auto-stopping
76
+ dikt -q --silence 5
77
+ ```
78
+
79
+ ### Stream mode
80
+
81
+ Continuously transcribe, emitting chunks on pauses:
82
+
83
+ ```bash
84
+ dikt --stream
85
+
86
+ # Stream as JSON Lines
87
+ dikt --stream --json
88
+
89
+ # Stream as continuous flowing text
90
+ dikt --stream -n
91
+
92
+ # Stream continuously until Ctrl+C
93
+ dikt --stream --silence 0
94
+ ```
95
+
96
+ ### File mode
97
+
98
+ Transcribe an existing audio file (wav, mp3, m4a, flac, ogg, webm, aac, wma, and more):
99
+
100
+ ```bash
101
+ dikt --file meeting.wav
102
+
103
+ # Save to a file (.json auto-enables JSON output)
104
+ dikt --file meeting.wav -o transcript.json
105
+ dikt --file meeting.wav -o transcript.txt
106
+
107
+ # With JSON output
108
+ dikt --file recording.mp3 --json
109
+
110
+ # Transcribe from a URL (requires yt-dlp)
111
+ dikt --file https://youtube.com/watch?v=VIDEO_ID
112
+ dikt --file https://youtube.com/watch?v=VIDEO_ID -o transcript.txt
113
+ ```
114
+
115
+ ### Speaker identification & timestamps
116
+
117
+ ```bash
118
+ # Speaker labels
119
+ dikt -q --diarize
120
+
121
+ # Timestamps
122
+ dikt -q --timestamps segment
123
+ dikt -q --timestamps word
124
+ dikt --file lecture.mp3 --timestamps segment
125
+
126
+ # Combined with JSON
127
+ dikt -q --json --diarize
128
+ ```
129
+
130
+ ### Options
131
+
132
+ | Flag | Description |
133
+ |---|---|
134
+ | `--file <path\|url>` | Transcribe audio file or URL (via yt-dlp) |
135
+ | `-o`, `--output <path>` | Write output to file (`.json` auto-enables JSON) |
136
+ | `--stream` | Stream transcription chunks on pauses |
137
+ | `--json` | Output JSON (single-shot or stream) |
138
+ | `-q`, `--quiet` | Record once, print transcript to stdout |
139
+ | `--silence <seconds>` | Silence duration before auto-stop (default: 2.0) |
140
+ | `--pause <seconds>` | Pause duration to split stream chunks (default: 1.0) |
141
+ | `--language <code>` | Language code, e.g. en, de, fr (default: auto) |
142
+ | `--timestamps <granularity>` | Add timestamps: segment or word |
143
+ | `--diarize` | Enable speaker identification |
144
+ | `-n`, `--no-newline` | Join stream chunks without newlines |
145
+ | `--no-color` | Disable colored output |
146
+ | `--no-input` | Fail if config is missing (no wizard) |
147
+ | `--setup` | Run setup wizard |
148
+ | `--update` | Update to latest version |
149
+ | `--version` | Show version |
150
+ | `-h`, `--help` | Show help |
151
+
152
+ ### Update
153
+
154
+ ```
155
+ dikt update
75
156
  ```
76
157
 
77
158
  ## Environment variables
package/cli.mjs CHANGED
@@ -6,7 +6,10 @@ import fs from 'node:fs';
6
6
  import path from 'node:path';
7
7
  import os from 'node:os';
8
8
  import readline from 'node:readline';
9
- import { spawn, execFileSync } from 'node:child_process';
9
+ import { spawn, execFileSync, execFile as execFileCb } from 'node:child_process';
10
+ import { promisify } from 'node:util';
11
+ const execFileAsync = promisify(execFileCb);
12
+ import https from 'node:https';
10
13
 
11
14
  // ── ANSI helpers ──────────────────────────────────────────────────────────────
12
15
 
@@ -37,9 +40,16 @@ if (process.env.NO_COLOR != null || process.env.TERM === 'dumb' || process.argv.
37
40
 
38
41
  const moveTo = (row, col = 1) => `${ESC}${row};${col}H`;
39
42
 
43
/**
 * Format a byte count as a short human-readable size.
 *
 * Values below 1024 are printed as whole bytes; larger values are printed
 * with one decimal in KB, MB or GB (GB is the largest unit used).
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} e.g. "512 B", "1.5 KB", "3.2 MB", "1.0 GB".
 */
function formatFileSize(bytes) {
  if (bytes < 1024) return `${bytes} B`;

  const units = ['KB', 'MB', 'GB'];
  let value = bytes / 1024;
  let unit = 0;
  // Promote based on the *rounded* value so a size just under a unit boundary
  // prints as "1.0 MB" rather than "1024.0 KB".
  while (unit < units.length - 1 && Number(value.toFixed(1)) >= 1024) {
    value /= 1024;
    unit++;
  }
  return `${value.toFixed(1)} ${units[unit]}`;
}
49
+
40
50
  // ── Constants ─────────────────────────────────────────────────────────────────
41
51
 
42
- const VERSION = '1.1.2';
52
+ const VERSION = '1.3.0';
43
53
  const CONFIG_BASE = process.env.XDG_CONFIG_HOME || path.join(os.homedir(), '.config');
44
54
  const CONFIG_DIR = path.join(CONFIG_BASE, 'dikt');
45
55
  const CONFIG_FILE = path.join(CONFIG_DIR, 'config.json');
@@ -47,6 +57,56 @@ const MAX_HISTORY = 10;
47
57
  const MIN_RECORDING_MS = 500;
48
58
  const COST_PER_MIN = 0.003;
49
59
  const SPINNER = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
60
// Chunked-transcription tuning (all durations in seconds).
const TARGET_CHUNK_SEC = 270; // aim for ~4.5 minute chunks
const CHUNK_MIN_SEC = 360; // files of 6 minutes or less are not chunked
const SPLIT_SEARCH_SEC = 30; // look ±30s around the target for a silent split point
const MIN_CHUNK_SEC = 30; // chunks shorter than this get merged into a neighbor
const MAX_PARALLEL = 4; // upper bound on concurrent API requests

// File extension (lowercase, no dot) → MIME type sent with the upload.
const MIME_TYPES = {
  wav: 'audio/wav',
  mp3: 'audio/mpeg',
  ogg: 'audio/ogg',
  flac: 'audio/flac',
  opus: 'audio/ogg',
  webm: 'audio/webm',
  m4a: 'audio/mp4',
  aac: 'audio/aac',
  wma: 'audio/x-ms-wma',
  aif: 'audio/aiff',
  aiff: 'audio/aiff',
  mp4: 'audio/mp4',
  oga: 'audio/ogg',
  amr: 'audio/amr',
  caf: 'audio/x-caf',
};

// Lossless / uncompressed formats that are worth re-encoding before upload.
const COMPRESSIBLE = new Set(['wav', 'flac', 'aiff', 'aif', 'raw', 'caf']);
67
+
68
/**
 * Create a progress indicator that writes to stderr.
 *
 * When stderr is a TTY the current message is redrawn in place, prefixed with
 * an animated spinner frame, every 80ms. Otherwise each message is written
 * once on its own line with no animation.
 *
 * @returns {{start(msg: string): void, update(msg: string): void, stop(finalMsg?: string): void}}
 *   `start`/`update` show a message (and (re)start the animation on a TTY);
 *   `stop` halts the animation, clears the line on a TTY, and prints
 *   `finalMsg` on its own line when one is given.
 */
function createStderrSpinner() {
  const FRAME_MS = 80;
  const isTTY = process.stderr.isTTY;
  let frame = 0;
  let interval = null;
  let currentMsg = '';

  const render = () => {
    const sp = SPINNER[frame++ % SPINNER.length];
    process.stderr.write(`\r${CLEAR_LINE}${YELLOW}${sp}${RESET} ${currentMsg}`);
  };

  const haltTimer = () => {
    if (interval) {
      clearInterval(interval);
      interval = null;
    }
  };

  // Shared by start() and update(): always drop the previous timer before
  // creating a new one. Without this, calling start() twice would orphan the
  // first interval, which stop() could then never clear. Restarting also
  // prevents queued callbacks from firing right after synchronous work.
  const show = (msg) => {
    currentMsg = msg;
    if (!isTTY) {
      process.stderr.write(`${msg}\n`);
      return;
    }
    haltTimer();
    render();
    interval = setInterval(render, FRAME_MS);
  };

  return {
    start(msg) {
      show(msg);
    },
    update(msg) {
      show(msg);
    },
    stop(finalMsg) {
      haltTimer();
      if (isTTY) process.stderr.write(`\r${CLEAR_LINE}`);
      if (finalMsg) process.stderr.write(`${finalMsg}\n`);
    },
  };
}
50
110
 
51
111
  const EXIT_OK = 0;
52
112
  const EXIT_DEPENDENCY = 1;
@@ -93,8 +153,8 @@ function validateConfig(cfg) {
93
153
 
94
154
  // ── Setup wizard (form-based) ─────────────────────────────────────────────────
95
155
 
96
- const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word', 'segment,word': 'both' };
97
- const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word', 'both': 'segment,word' };
156
+ const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word' };
157
+ const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word' };
98
158
 
99
159
  async function setupWizard() {
100
160
  const existing = loadConfig() || {};
@@ -105,7 +165,7 @@ async function setupWizard() {
105
165
  { key: 'language', label: 'Language', type: 'text', value: '', display: existing.language || 'auto', fallback: existing.language || '' },
106
166
  { key: 'temperature', label: 'Temperature', type: 'text', value: '', display: existing.temperature != null ? String(existing.temperature) : 'default', fallback: existing.temperature != null ? String(existing.temperature) : '' },
107
167
  { key: 'contextBias', label: 'Context bias', type: 'text', value: '', display: existing.contextBias || '', fallback: existing.contextBias || '' },
108
- { key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word', 'both'], idx: ['off', 'segment', 'word', 'both'].indexOf(TIMESTAMPS_DISPLAY[existing.timestamps || ''] || 'off') },
168
+ { key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word'], idx: ['off', 'segment', 'word'].indexOf(TIMESTAMPS_DISPLAY[existing.timestamps || ''] || 'off') },
109
169
  { key: 'diarize', label: 'Diarize', type: 'select', options: ['off', 'on'], idx: existing.diarize ? 1 : 0 },
110
170
  ];
111
171
 
@@ -576,6 +636,7 @@ function copy(text) {
576
636
  }
577
637
 
578
638
  const proc = spawn(cmd[0], cmd.slice(1), { stdio: ['pipe', 'ignore', 'ignore'] });
639
+ proc.on('error', () => {}); // swallow — clipboard is best-effort
579
640
  proc.stdin.end(text);
580
641
 
581
642
  state.mode = 'copied';
@@ -869,6 +930,9 @@ async function runSetup() {
869
930
  config = await setupWizard();
870
931
  applyEnvOverrides(config);
871
932
 
933
+ state.mode = state.transcript ? 'ready' : 'idle';
934
+ state.error = '';
935
+
872
936
  process.stdin.resume();
873
937
  process.stdin.setRawMode(true);
874
938
  process.stdin.on('keypress', handleKey);
@@ -949,7 +1013,7 @@ function trimSilence(rawData) {
949
1013
  return Buffer.concat(output);
950
1014
  }
951
1015
 
952
- async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
1016
+ async function callTranscribeAPI(file, { signal, timestamps, diarize, onProgress } = {}) {
953
1017
  const fd = new FormData();
954
1018
  fd.append('file', file);
955
1019
  fd.append('model', config.model);
@@ -957,7 +1021,7 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
957
1021
  if (config.temperature != null) fd.append('temperature', String(config.temperature));
958
1022
  if (config.contextBias) fd.append('context_bias', config.contextBias);
959
1023
  if (timestamps) {
960
- for (const g of timestamps.split(',')) fd.append('timestamp_granularities[]', g.trim());
1024
+ fd.append('timestamp_granularities[]', timestamps);
961
1025
  }
962
1026
  if (diarize) {
963
1027
  fd.append('diarize', 'true');
@@ -965,17 +1029,68 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
965
1029
  if (!timestamps) fd.append('timestamp_granularities[]', 'segment');
966
1030
  }
967
1031
 
968
- const t0 = Date.now();
969
- const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
1032
+ // Use Request to serialize FormData into multipart body,
1033
+ // then send via node:https which has no hardcoded headersTimeout
1034
+ // (Node's built-in fetch/undici has a 300s headersTimeout that
1035
+ // cannot be configured without importing undici as a dependency).
1036
+ const req = new Request('https://api.mistral.ai/v1/audio/transcriptions', {
970
1037
  method: 'POST',
971
1038
  headers: { Authorization: `Bearer ${config.apiKey}` },
972
1039
  body: fd,
973
- signal: signal || AbortSignal.timeout(30_000),
1040
+ });
1041
+ const contentType = req.headers.get('content-type');
1042
+ const body = Buffer.from(await req.arrayBuffer());
1043
+
1044
+ const t0 = Date.now();
1045
+ const { status, raw } = await new Promise((resolve, reject) => {
1046
+ const hreq = https.request('https://api.mistral.ai/v1/audio/transcriptions', {
1047
+ method: 'POST',
1048
+ headers: {
1049
+ 'Authorization': `Bearer ${config.apiKey}`,
1050
+ 'Content-Type': contentType,
1051
+ 'Content-Length': body.length,
1052
+ },
1053
+ }, (res) => {
1054
+ const chunks = [];
1055
+ res.on('data', (c) => chunks.push(c));
1056
+ res.on('end', () => resolve({ status: res.statusCode, raw: Buffer.concat(chunks).toString() }));
1057
+ res.on('error', reject);
1058
+ });
1059
+
1060
+ hreq.on('error', (err) => {
1061
+ const ne = new Error(`Network error: ${err.message}`);
1062
+ ne.networkError = true;
1063
+ reject(ne);
1064
+ });
1065
+
1066
+ const abortSig = signal || AbortSignal.timeout(30_000);
1067
+ if (abortSig.aborted) { hreq.destroy(); reject(new DOMException('The operation was aborted', 'AbortError')); return; }
1068
+ abortSig.addEventListener('abort', () => {
1069
+ hreq.destroy();
1070
+ reject(abortSig.reason instanceof DOMException ? abortSig.reason
1071
+ : new DOMException('The operation was aborted', 'AbortError'));
1072
+ }, { once: true });
1073
+
1074
+ // Write body in chunks to enable upload progress tracking
1075
+ const CHUNK_SIZE = 256 * 1024;
1076
+ let written = 0;
1077
+ const total = body.length;
1078
+ const writeChunks = () => {
1079
+ while (written < total) {
1080
+ const end = Math.min(written + CHUNK_SIZE, total);
1081
+ const ok = hreq.write(body.subarray(written, end));
1082
+ written = end;
1083
+ if (onProgress) onProgress(written, total);
1084
+ if (!ok) { hreq.once('drain', writeChunks); return; }
1085
+ }
1086
+ if (onProgress) onProgress(-1, total); // upload done, server processing
1087
+ hreq.end();
1088
+ };
1089
+ writeChunks();
974
1090
  });
975
1091
  const latency = Date.now() - t0;
976
1092
 
977
- if (!resp.ok) {
978
- const raw = await resp.text().catch(() => '');
1093
+ if (status < 200 || status >= 300) {
979
1094
  let msg;
980
1095
  try {
981
1096
  const e = JSON.parse(raw);
@@ -988,14 +1103,14 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
988
1103
  }
989
1104
  if (!msg) msg = raw;
990
1105
  } catch {
991
- msg = raw || `HTTP ${resp.status}`;
1106
+ msg = raw || `HTTP ${status}`;
992
1107
  }
993
1108
  const err = new Error(msg);
994
- err.status = resp.status;
1109
+ err.status = status;
995
1110
  throw err;
996
1111
  }
997
1112
 
998
- const data = await resp.json();
1113
+ const data = JSON.parse(raw);
999
1114
  const text = (data.text || '').trim();
1000
1115
  return { text, latency, segments: data.segments, words: data.words };
1001
1116
  }
@@ -1055,22 +1170,296 @@ function buildJsonOutput(base, { segments, words, timestamps, diarize } = {}) {
1055
1170
  return out;
1056
1171
  }
1057
1172
 
1173
+ // ── File optimization helpers ────────────────────────────────────────────────
1174
+
1175
// Memoized probe result; stays undefined until the first check has run.
let _ffmpegAvail;

/**
 * Report whether both `ffmpeg` and `ffprobe` can be executed.
 *
 * Each tool is run once with `-version`; the outcome is cached so later
 * calls do not spawn processes again.
 *
 * @returns {boolean} true only if both commands ran without throwing.
 */
function ffmpegAvailable() {
  if (_ffmpegAvail === undefined) {
    let found = true;
    try {
      for (const tool of ['ffmpeg', 'ffprobe']) {
        execFileSync(tool, ['-version'], { stdio: 'pipe' });
      }
    } catch {
      found = false;
    }
    _ffmpegAvail = found;
  }
  return _ffmpegAvail;
}
1185
+
1186
// Memoized probe result; stays undefined until the first check has run.
let _ytdlpAvail;

/**
 * Report whether `yt-dlp` can be executed.
 *
 * Runs `yt-dlp --version` once and caches the outcome for later calls.
 *
 * @returns {boolean} true if the command ran without throwing.
 */
function ytdlpAvailable() {
  if (_ytdlpAvail === undefined) {
    let found = true;
    try {
      execFileSync('yt-dlp', ['--version'], { stdio: 'pipe' });
    } catch {
      found = false;
    }
    _ytdlpAvail = found;
  }
  return _ytdlpAvail;
}
1193
+
1194
/**
 * Download the audio track of a URL into the OS temp directory using yt-dlp.
 *
 * yt-dlp is asked to extract audio as opus; progress lines from its output
 * are forwarded to the spinner. Pressing Ctrl+C (SIGINT) while the download
 * runs kills the child process and rejects.
 *
 * @param {string} url - URL handed to yt-dlp as-is.
 * @param {{update(msg: string): void}} spinner - Receives progress messages.
 * @returns {Promise<string>} Absolute path of the downloaded file.
 * @throws {Error} (as a rejection) when yt-dlp cannot be started, is aborted,
 *   exits with a non-zero code, or produces no output file. Partial files are
 *   removed on failure.
 */
function downloadWithYtdlp(url, spinner) {
  const tmpBase = path.join(os.tmpdir(), `dikt-ytdlp-${process.pid}-${Date.now()}`);
  const outTemplate = `${tmpBase}.%(ext)s`;
  const dir = path.dirname(tmpBase);
  const prefix = path.basename(tmpBase);

  // Files yt-dlp wrote for this download: "<prefix>.<something>".
  const listOutputs = () => fs.readdirSync(dir).filter((f) => f.startsWith(prefix) && f.length > prefix.length);

  const cleanupPartial = () => {
    try {
      for (const f of listOutputs()) {
        try { fs.unlinkSync(path.join(dir, f)); } catch { /* best-effort cleanup */ }
      }
    } catch { /* temp dir unreadable — nothing to clean */ }
  };

  return new Promise((resolve, reject) => {
    const proc = spawn('yt-dlp', [
      '-x', '--audio-format', 'opus', '--audio-quality', '48K',
      '-o', outTemplate, '--no-playlist', '--newline', url,
    ], { stdio: ['ignore', 'pipe', 'pipe'] });

    let aborted = false;
    let settled = false;
    const onSigint = () => { aborted = true; proc.kill(); };
    process.on('SIGINT', onSigint);

    // Single exit path for every failure, so the SIGINT listener is always
    // detached and partial files are always removed, exactly once.
    const fail = (err) => {
      if (settled) return;
      settled = true;
      process.removeListener('SIGINT', onSigint);
      cleanupPartial();
      reject(err);
    };

    // Without this listener a spawn failure (e.g. yt-dlp missing from PATH)
    // surfaces as an unhandled 'error' event instead of a rejection.
    proc.on('error', (err) => {
      fail(new Error(`Failed to run yt-dlp: ${err.message}`, { cause: err }));
    });

    let lastErr = '';
    const parseOutput = (chunk) => {
      const lines = chunk.toString().split('\n');
      for (const line of lines) {
        if (!line.trim()) continue;
        const dl = line.match(/\[download\]\s+([\d.]+)%/);
        if (dl) { spinner.update(`Downloading... ${Math.round(parseFloat(dl[1]))}%`); continue; }
        if (/\[ExtractAudio\]/.test(line)) { spinner.update('Converting audio...'); continue; }
        if (/\[download\]\s+Destination:/.test(line)) { spinner.update('Downloading...'); continue; }
      }
    };
    proc.stdout.on('data', parseOutput);
    proc.stderr.on('data', (chunk) => {
      // Remember the last stderr line to use as the failure message.
      lastErr = chunk.toString().trim().split('\n').pop();
      parseOutput(chunk);
    });

    proc.on('close', (code) => {
      if (settled) return;
      if (aborted) { fail(new Error('Download aborted')); return; }
      if (code !== 0) { fail(new Error(lastErr || `yt-dlp exited with code ${code}`)); return; }

      settled = true;
      process.removeListener('SIGINT', onSigint);
      // yt-dlp may produce a different extension than requested; find the actual file
      try {
        const match = listOutputs()[0];
        if (!match) { reject(new Error('yt-dlp produced no output file')); return; }
        resolve(path.join(dir, match));
      } catch (err) {
        reject(err);
      }
    });
  });
}
1250
+
1251
/**
 * Ask ffprobe for the duration of an audio file.
 *
 * @param {string} filePath - Path passed to ffprobe.
 * @returns {number} Duration in seconds as reported by ffprobe's
 *   `format=duration` entry, or 0 when ffprobe fails or its output does not
 *   parse to a non-zero number.
 */
function getAudioDuration(filePath) {
  const probeArgs = ['-v', 'quiet', '-show_entries', 'format=duration', '-of', 'csv=p=0', filePath];

  let stdout;
  try {
    stdout = execFileSync('ffprobe', probeArgs, { stdio: 'pipe', encoding: 'utf8' });
  } catch {
    return 0;
  }

  const seconds = parseFloat(stdout.trim());
  return seconds || 0;
}
1257
+
1258
/**
 * Re-encode an audio file to a 48 kbit/s `.ogg` file in the OS temp directory.
 *
 * libopus is tried first, then libvorbis. An attempt counts as successful
 * only if ffmpeg exits cleanly and the output file is non-empty; otherwise
 * the output is deleted and the next codec is tried.
 *
 * @param {string} inputPath - Source audio file.
 * @returns {Promise<string|null>} Path of the compressed file, or null if
 *   neither codec produced usable output.
 */
async function compressAudio(inputPath) {
  const stem = path.basename(inputPath, path.extname(inputPath));
  const outPath = `${path.join(os.tmpdir(), `dikt-${process.pid}-${Date.now()}-${stem}`)}.ogg`;

  const discardOutput = () => {
    try { fs.unlinkSync(outPath); } catch {}
  };

  for (const codec of ['libopus', 'libvorbis']) {
    const ffmpegArgs = ['-i', inputPath, '-c:a', codec, '-b:a', '48k', '-y', '-v', 'quiet', outPath];
    try {
      await execFileAsync('ffmpeg', ffmpegArgs, { stdio: 'pipe' });
      if (fs.statSync(outPath).size > 0) return outPath;
    } catch {
      // ffmpeg failed or wrote nothing readable — fall through to cleanup
    }
    discardOutput();
  }

  return null;
}
1270
+
1271
/**
 * Pick a split time near `targetSec` that falls inside a stretch of silence.
 *
 * ffmpeg decodes a window of ±SPLIT_SEARCH_SEC around the target to 16 kHz
 * mono 16-bit PCM. The PCM is scanned in 50ms windows, and the longest run of
 * windows whose peak is below SILENCE_THRESHOLD is located (the earliest run
 * wins ties). If that run spans at least 10 windows (500ms), its midpoint is
 * returned.
 *
 * @param {string} filePath - Audio file to analyze.
 * @param {number} targetSec - Desired split position in seconds.
 * @returns {Promise<number>} Split position in seconds; `targetSec` itself
 *   when no long-enough silence is found or the analysis fails.
 */
async function findSilenceSplitPoint(filePath, targetSec) {
  const SAMPLE_RATE = 16000;
  const BYTES_PER_SEC = SAMPLE_RATE * 2; // 16-bit mono
  const WINDOW_BYTES = Math.round(SAMPLE_RATE * 0.05) * 2; // 50ms of samples
  const MIN_SILENT_WINDOWS = 10; // 500ms — avoids splitting mid-word

  const startSec = Math.max(0, targetSec - SPLIT_SEARCH_SEC);
  const durSec = SPLIT_SEARCH_SEC * 2;

  try {
    // Decode only the search window to raw PCM on stdout.
    const { stdout: raw } = await execFileAsync('ffmpeg', [
      '-ss', String(startSec), '-t', String(durSec), '-i', filePath,
      '-f', 's16le', '-ar', '16000', '-ac', '1', '-v', 'quiet', '-',
    ], { encoding: 'buffer', maxBuffer: BYTES_PER_SEC * durSec + 4096 });

    let best = { offset: -1, len: 0 };
    let run = { offset: -1, len: 0 };
    // End the current silent run, keeping it only if strictly longer than the best so far.
    const closeRun = () => {
      if (run.len > best.len) best = run;
      run = { offset: -1, len: 0 };
    };

    const windowCount = Math.floor(raw.length / WINDOW_BYTES);
    for (let w = 0; w < windowCount; w++) {
      const offset = w * WINDOW_BYTES;
      const peak = peakAmplitude(raw.subarray(offset, offset + WINDOW_BYTES));
      if (peak >= SILENCE_THRESHOLD) {
        closeRun();
        continue;
      }
      if (run.offset === -1) run.offset = offset;
      run.len++;
    }
    closeRun(); // a silent run may extend to the end of the buffer

    if (best.len >= MIN_SILENT_WINDOWS) {
      const centerBytes = best.offset + Math.floor(best.len / 2) * WINDOW_BYTES;
      return startSec + centerBytes / BYTES_PER_SEC;
    }
  } catch {}

  return targetSec; // fallback: no silence found, split at target
}
1307
+
1308
/**
 * Normalize the transcript text of one chunk.
 *
 * The API sometimes spontaneously returns `[PRINT_WORDLEVEL_TIME]` markup.
 * When that marker is present, the marker itself and any `<MM:SS.fff>` /
 * `</MM:SS.fff>` tags are removed and whitespace runs are collapsed to a
 * single space. Text without the marker is only trimmed.
 *
 * @param {string} t - Raw chunk text (may be empty or missing).
 * @returns {string} Cleaned text; '' for falsy input.
 */
function cleanChunkText(t) {
  if (!t) return '';
  if (!t.includes('[PRINT_WORDLEVEL_TIME]')) return t.trim();

  return t
    .replace(/\[PRINT_WORDLEVEL_TIME\]/g, '')
    .replace(/<\/?\d{2}:\d{2}\.\d+>/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
1318
+
1319
/**
 * Combine per-chunk transcription results into one result.
 *
 * Chunks do not overlap, so text is simply cleaned and joined with spaces,
 * and segment/word timestamps are shifted by the chunk's start offset and
 * rounded to one decimal.
 *
 * @param {Array<{text: string, latency: number, segments?: object[], words?: object[]}>} results
 *   Results in chunk order.
 * @param {number[]} splitPoints - Offset in seconds added to chunk i's timestamps.
 * @returns {{text: string, latency: number, segments: object[]|undefined, words: object[]|undefined}}
 *   `latency` is the largest chunk latency; `segments` / `words` are
 *   undefined when no chunk supplied any.
 */
function mergeChunkResults(results, splitPoints) {
  const toTenth = (n) => Math.round(n * 10) / 10;
  const shifted = (item, by) => ({ ...item, start: toTenth(item.start + by), end: toTenth(item.end + by) });

  const joined = results
    .map((chunk) => cleanChunkText(chunk.text))
    .filter(Boolean)
    .join(' ');
  // Fix missing spaces after punctuation (API omits leading spaces on some segments)
  const text = joined.replace(/([.!?,])([A-Za-z])/g, '$1 $2');

  const segments = [];
  const words = [];
  let latency = 0;

  results.forEach((chunk, idx) => {
    const offset = splitPoints[idx];
    if (chunk.latency > latency) latency = chunk.latency;
    if (chunk.segments) {
      for (const seg of chunk.segments) segments.push(shifted(seg, offset));
    }
    if (chunk.words) {
      for (const word of chunk.words) words.push(shifted(word, offset));
    }
  });

  return {
    text,
    latency,
    segments: segments.length ? segments : undefined,
    words: words.length ? words : undefined,
  };
}
1353
+
1354
/**
 * Map `fn` over `items` with at most `concurrency` calls in flight.
 *
 * Results are returned in input order regardless of completion order.
 *
 * @param {Array} items - Inputs to process.
 * @param {(item: any, index: number) => Promise<any>} fn - Async mapper.
 * @param {number} concurrency - Maximum simultaneous calls. Values below 1
 *   (or non-numeric values) are treated as 1, so work is never silently
 *   skipped.
 * @returns {Promise<Array>} Results, index-aligned with `items`.
 * @throws Rejects with the first error thrown by `fn`. Once a call has
 *   failed, no further items are started; calls already in flight are left
 *   to finish on their own.
 */
async function parallelMap(items, fn, concurrency) {
  const results = new Array(items.length);

  // Clamp to [1, items.length]: a limit of 0/NaN would otherwise create no
  // workers and return an array of holes without running anything.
  const requested = Number(concurrency);
  const limit = Math.min(items.length, Number.isNaN(requested) ? 1 : Math.max(1, Math.floor(requested)));

  let next = 0;
  let failed = false;
  const worker = async () => {
    while (!failed && next < items.length) {
      const i = next++;
      try {
        results[i] = await fn(items[i], i);
      } catch (err) {
        failed = true; // tell sibling workers to stop picking up new items
        throw err;
      }
    }
  };

  await Promise.all(Array.from({ length: limit }, () => worker()));
  return results;
}
1361
+
1058
1362
  // ── File mode ────────────────────────────────────────────────────────────────
1059
1363
 
1060
1364
  async function runFile(flags) {
1365
+ const spinner = createStderrSpinner();
1366
+ let fileSize = 0;
1367
+ let transcribeTimer = null;
1368
+ const tempFiles = [];
1369
+
1061
1370
  try {
1062
- if (!flags.file || !fs.existsSync(flags.file)) {
1371
+ const isURL = /^https?:\/\//i.test(flags.file);
1372
+
1373
+ if (isURL) {
1374
+ if (!ytdlpAvailable()) {
1375
+ process.stderr.write(`\n${RED}${BOLD} yt-dlp not found.${RESET}\n\n`);
1376
+ process.stderr.write(` yt-dlp is required to download audio from URLs. Install it:\n\n`);
1377
+ if (process.platform === 'darwin') {
1378
+ process.stderr.write(` ${BOLD}brew install yt-dlp${RESET}\n\n`);
1379
+ } else if (process.platform === 'win32') {
1380
+ process.stderr.write(` ${BOLD}choco install yt-dlp${RESET} or ${BOLD}scoop install yt-dlp${RESET}\n\n`);
1381
+ } else {
1382
+ process.stderr.write(` ${BOLD}sudo apt install yt-dlp${RESET} (Debian/Ubuntu)\n`);
1383
+ process.stderr.write(` ${BOLD}pip install yt-dlp${RESET} (any platform)\n\n`);
1384
+ }
1385
+ return EXIT_DEPENDENCY;
1386
+ }
1387
+ spinner.start('Downloading audio...');
1388
+ try {
1389
+ const downloaded = await downloadWithYtdlp(flags.file, spinner);
1390
+ tempFiles.push(downloaded);
1391
+ flags = { ...flags, file: downloaded };
1392
+ } catch (err) {
1393
+ spinner.stop();
1394
+ process.stderr.write(`Error downloading: ${err.message}\n`);
1395
+ return EXIT_TRANSCRIPTION;
1396
+ }
1397
+ spinner.update('Processing audio...');
1398
+ } else if (!flags.file || !fs.existsSync(flags.file)) {
1063
1399
  process.stderr.write(`Error: file not found: ${flags.file}\n`);
1064
1400
  return EXIT_TRANSCRIPTION;
1401
+ } else {
1402
+ spinner.start('Reading file...');
1403
+ }
1404
+ fileSize = fs.statSync(flags.file).size;
1405
+ const ext = path.extname(flags.file).slice(1).toLowerCase() || 'wav';
1406
+
1407
+ // Check if ffmpeg is available for chunking / compression optimizations
1408
+ const hasFFmpeg = ffmpegAvailable();
1409
+ const duration = hasFFmpeg ? getAudioDuration(flags.file) : 0;
1410
+ const canChunk = hasFFmpeg && !flags.diarize && duration > CHUNK_MIN_SEC;
1411
+
1412
+ if (canChunk) {
1413
+ spinner.stop();
1414
+ return await runFileChunked(flags, { fileSize, duration });
1415
+ }
1416
+
1417
+ // Compress uncompressed formats (wav/flac → ogg) for faster upload
1418
+ let uploadPath = flags.file;
1419
+ let uploadExt = ext;
1420
+ if (hasFFmpeg && COMPRESSIBLE.has(ext)) {
1421
+ spinner.update('Compressing...');
1422
+ const compressed = await compressAudio(flags.file);
1423
+ if (compressed) {
1424
+ const newSize = fs.statSync(compressed).size;
1425
+ if (newSize < fileSize) {
1426
+ tempFiles.push(compressed);
1427
+ uploadPath = compressed;
1428
+ uploadExt = path.extname(compressed).slice(1);
1429
+ spinner.update(`Compressed ${formatFileSize(fileSize)} → ${formatFileSize(newSize)}`);
1430
+ } else {
1431
+ try { fs.unlinkSync(compressed); } catch {}
1432
+ }
1433
+ }
1065
1434
  }
1066
1435
 
1067
- const blob = await fs.openAsBlob(flags.file);
1068
- const ext = path.extname(flags.file).slice(1) || 'wav';
1069
- const mimeTypes = { wav: 'audio/wav', mp3: 'audio/mpeg', m4a: 'audio/mp4', ogg: 'audio/ogg', flac: 'audio/flac', webm: 'audio/webm' };
1070
- const mime = mimeTypes[ext] || 'audio/wav';
1071
- const file = new File([blob], path.basename(flags.file), { type: mime });
1436
+ const blob = await fs.openAsBlob(uploadPath);
1437
+ const mime = MIME_TYPES[uploadExt] || 'application/octet-stream';
1438
+ const file = new File([blob], path.basename(uploadPath), { type: mime });
1439
+
1440
+ spinner.update(`Uploading to API... (${formatFileSize(blob.size)})`);
1441
+
1442
+ const ac = new AbortController();
1443
+ const abortHandler = () => { spinner.stop('Aborting...'); ac.abort(); };
1444
+ process.on('SIGINT', abortHandler);
1445
+
1446
+ const onProgress = (sent, total) => {
1447
+ if (sent === -1) {
1448
+ const t0 = Date.now();
1449
+ const elapsed = () => { const s = Math.floor((Date.now() - t0) / 1000); return `${Math.floor(s / 60)}:${String(s % 60).padStart(2, '0')}`; };
1450
+ spinner.update(`Transcribing... ${DIM}(${elapsed()})${RESET}`);
1451
+ transcribeTimer = setInterval(() => spinner.update(`Transcribing... ${DIM}(${elapsed()})${RESET}`), 1000);
1452
+ } else {
1453
+ const pct = Math.round((sent / total) * 100);
1454
+ spinner.update(`Uploading ${pct}% (${formatFileSize(sent)} / ${formatFileSize(total)})`);
1455
+ }
1456
+ };
1457
+
1458
+ const result = await callTranscribeAPI(file, { signal: ac.signal, timestamps: flags.timestamps, diarize: flags.diarize, onProgress });
1459
+ if (transcribeTimer) clearInterval(transcribeTimer);
1460
+ process.removeListener('SIGINT', abortHandler);
1072
1461
 
1073
- const result = await callTranscribeAPI(file, { timestamps: flags.timestamps, diarize: flags.diarize });
1462
+ spinner.stop(`${GREEN}Done${RESET} (${(result.latency / 1000).toFixed(1)}s)`);
1074
1463
 
1075
1464
  if (!result.text) {
1076
1465
  process.stderr.write('No speech detected\n');
@@ -1079,22 +1468,202 @@ async function runFile(flags) {
1079
1468
 
1080
1469
  const wordCount = result.text.split(/\s+/).filter(Boolean).length;
1081
1470
 
1471
+ let output;
1082
1472
  if (flags.json) {
1083
1473
  const out = buildJsonOutput(
1084
1474
  { text: result.text, latency: result.latency, words: wordCount },
1085
1475
  { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
1086
1476
  );
1087
- process.stdout.write(JSON.stringify(out) + '\n');
1477
+ output = JSON.stringify(out, null, flags.output ? 2 : 0) + '\n';
1088
1478
  } else if (flags.diarize && result.segments) {
1089
- process.stdout.write(formatDiarizedText(result.segments) + '\n');
1479
+ output = formatDiarizedText(result.segments) + '\n';
1090
1480
  } else {
1091
- process.stdout.write(result.text + '\n');
1481
+ output = result.text + '\n';
1482
+ }
1483
+
1484
+ if (flags.output) {
1485
+ fs.writeFileSync(flags.output, output);
1486
+ process.stderr.write(`Saved to ${flags.output}\n`);
1487
+ } else {
1488
+ process.stdout.write(output);
1092
1489
  }
1093
1490
 
1094
1491
  return EXIT_OK;
1095
1492
  } catch (err) {
1096
- process.stderr.write(`Error: ${err.message}\n`);
1493
+ if (transcribeTimer) clearInterval(transcribeTimer);
1494
+ spinner.stop();
1495
+
1496
+ if (err.name === 'AbortError') {
1497
+ process.stderr.write('Aborted\n');
1498
+ return EXIT_TRANSCRIPTION;
1499
+ }
1500
+
1501
+ const parts = [`Error: ${err.message}`];
1502
+ if (fileSize) parts.push(` File: ${flags.file} (${formatFileSize(fileSize)})`);
1503
+
1504
+ if (err.networkError) {
1505
+ parts.push(' Hint: check your network connection and try again');
1506
+ } else if (err.status === 401) {
1507
+ parts.push(' Hint: invalid API key — run `dikt setup` to reconfigure');
1508
+ } else if (err.status === 413) {
1509
+ parts.push(' Hint: file is too large for the API — try a shorter recording');
1510
+ } else if (err.status === 429) {
1511
+ parts.push(' Hint: rate limited — wait a moment and try again');
1512
+ } else if (err.status >= 500) {
1513
+ parts.push(' Hint: Mistral API server error — try again later');
1514
+ }
1515
+
1516
+ process.stderr.write(parts.join('\n') + '\n');
1097
1517
  return EXIT_TRANSCRIPTION;
1518
+ } finally {
1519
+ for (const f of tempFiles) { try { fs.unlinkSync(f); } catch {} }
1520
+ }
1521
+ }
1522
+
1523
+ async function runFileChunked(flags, { fileSize, duration }) {
1524
+ const spinner = createStderrSpinner();
1525
+ const tempFiles = [];
1526
+ const t0 = Date.now();
1527
+ let progressTimer = null;
1528
+ let abortHandler = null;
1529
+
1530
+ try {
1531
+ // Find optimal split points at silence boundaries
1532
+ const numTargetChunks = Math.ceil(duration / TARGET_CHUNK_SEC);
1533
+ spinner.start('Analyzing audio for split points...');
1534
+
1535
+ const splitPoints = [0];
1536
+ for (let i = 1; i < numTargetChunks; i++) {
1537
+ spinner.update(`Finding split point ${i}/${numTargetChunks - 1}...`);
1538
+ splitPoints.push(await findSilenceSplitPoint(flags.file, i * TARGET_CHUNK_SEC));
1539
+ }
1540
+ splitPoints.push(duration);
1541
+
1542
+ // Merge tiny trailing chunks (< MIN_CHUNK_SEC) into the previous one
1543
+ for (let i = splitPoints.length - 2; i > 0; i--) {
1544
+ if (splitPoints[i + 1] - splitPoints[i] < MIN_CHUNK_SEC) {
1545
+ splitPoints.splice(i, 1);
1546
+ }
1547
+ }
1548
+
1549
+ const numChunks = splitPoints.length - 1;
1550
+
1551
+ // Split audio and compress each chunk
1552
+ const chunkBase = path.join(os.tmpdir(), `dikt-${process.pid}-${Date.now()}`);
1553
+ const uploadPaths = [];
1554
+
1555
+ for (let i = 0; i < numChunks; i++) {
1556
+ spinner.update(`Preparing chunk ${i + 1}/${numChunks}...`);
1557
+ const start = splitPoints[i];
1558
+ const dur = splitPoints[i + 1] - start;
1559
+ const oggPath = `${chunkBase}-${i}.ogg`;
1560
+ try {
1561
+ await execFileAsync('ffmpeg', ['-ss', String(start), '-t', String(dur), '-i', flags.file, '-c:a', 'libopus', '-b:a', '48k', '-y', '-v', 'quiet', oggPath], { stdio: 'pipe' });
1562
+ if (fs.statSync(oggPath).size > 0) {
1563
+ tempFiles.push(oggPath);
1564
+ uploadPaths.push(oggPath);
1565
+ } else { throw new Error('empty output'); }
1566
+ } catch {
1567
+ try { fs.unlinkSync(oggPath); } catch {}
1568
+ const wavPath = `${chunkBase}-${i}.wav`;
1569
+ await execFileAsync('ffmpeg', ['-ss', String(start), '-t', String(dur), '-i', flags.file, '-y', '-v', 'quiet', wavPath], { stdio: 'pipe' });
1570
+ if (!fs.statSync(wavPath).size) throw new Error(`ffmpeg produced empty chunk ${i}`);
1571
+ tempFiles.push(wavPath);
1572
+ uploadPaths.push(wavPath);
1573
+ }
1574
+ }
1575
+
1576
+ const totalUploadSize = uploadPaths.reduce((sum, p) => sum + fs.statSync(p).size, 0);
1577
+ spinner.update(`Compressed → ${formatFileSize(totalUploadSize)} total`);
1578
+
1579
+ // Abort handling
1580
+ const ac = new AbortController();
1581
+ abortHandler = () => { spinner.stop('Aborting...'); ac.abort(); };
1582
+ process.on('SIGINT', abortHandler);
1583
+
1584
+ // Transcribe chunks in parallel
1585
+ let completed = 0;
1586
+ const elapsed = () => {
1587
+ const s = Math.floor((Date.now() - t0) / 1000);
1588
+ return `${Math.floor(s / 60)}:${String(s % 60).padStart(2, '0')}`;
1589
+ };
1590
+ spinner.update(`Transcribing ${numChunks} chunks... ${DIM}(${elapsed()})${RESET}`);
1591
+ progressTimer = setInterval(() => {
1592
+ spinner.update(`Transcribing... ${completed}/${numChunks} done ${DIM}(${elapsed()})${RESET}`);
1593
+ }, 1000);
1594
+
1595
+ const chunkIndices = Array.from({ length: numChunks }, (_, i) => i);
1596
+ const results = await parallelMap(chunkIndices, async (i) => {
1597
+ const uploadPath = uploadPaths[i];
1598
+ const ext = path.extname(uploadPath).slice(1);
1599
+ const blob = await fs.openAsBlob(uploadPath);
1600
+ const file = new File([blob], `chunk-${i}.${ext}`, { type: MIME_TYPES[ext] || 'audio/wav' });
1601
+ const result = await callTranscribeAPI(file, { signal: ac.signal, timestamps: flags.timestamps });
1602
+ completed++;
1603
+ return result;
1604
+ }, MAX_PARALLEL);
1605
+
1606
+ clearInterval(progressTimer); progressTimer = null;
1607
+ process.removeListener('SIGINT', abortHandler); abortHandler = null;
1608
+
1609
+ // Merge results — no overlap, just concatenate text and offset timestamps
1610
+ const merged = mergeChunkResults(results, splitPoints);
1611
+ const totalLatency = Date.now() - t0;
1612
+ spinner.stop(`${GREEN}Done${RESET} (${(totalLatency / 1000).toFixed(1)}s, ${numChunks} chunks)`);
1613
+
1614
+ if (!merged.text) {
1615
+ process.stderr.write('No speech detected\n');
1616
+ return EXIT_TRANSCRIPTION;
1617
+ }
1618
+
1619
+ const wordCount = merged.text.split(/\s+/).filter(Boolean).length;
1620
+
1621
+ let output;
1622
+ if (flags.json) {
1623
+ const out = buildJsonOutput(
1624
+ { text: merged.text, latency: totalLatency, words: wordCount },
1625
+ { segments: merged.segments, words: merged.words, timestamps: flags.timestamps, diarize: false },
1626
+ );
1627
+ output = JSON.stringify(out, null, flags.output ? 2 : 0) + '\n';
1628
+ } else {
1629
+ output = merged.text + '\n';
1630
+ }
1631
+
1632
+ if (flags.output) {
1633
+ fs.writeFileSync(flags.output, output);
1634
+ process.stderr.write(`Saved to ${flags.output}\n`);
1635
+ } else {
1636
+ process.stdout.write(output);
1637
+ }
1638
+
1639
+ return EXIT_OK;
1640
+ } catch (err) {
1641
+ spinner.stop();
1642
+
1643
+ if (err.name === 'AbortError') {
1644
+ process.stderr.write('Aborted\n');
1645
+ return EXIT_TRANSCRIPTION;
1646
+ }
1647
+
1648
+ const parts = [`Error: ${err.message}`];
1649
+ if (fileSize) parts.push(` File: ${flags.file} (${formatFileSize(fileSize)})`);
1650
+
1651
+ if (err.networkError) {
1652
+ parts.push(' Hint: check your network connection and try again');
1653
+ } else if (err.status === 401) {
1654
+ parts.push(' Hint: invalid API key — run `dikt setup` to reconfigure');
1655
+ } else if (err.status === 429) {
1656
+ parts.push(' Hint: rate limited — wait a moment and try again');
1657
+ } else if (err.status >= 500) {
1658
+ parts.push(' Hint: Mistral API server error — try again later');
1659
+ }
1660
+
1661
+ process.stderr.write(parts.join('\n') + '\n');
1662
+ return EXIT_TRANSCRIPTION;
1663
+ } finally {
1664
+ if (progressTimer) clearInterval(progressTimer);
1665
+ if (abortHandler) process.removeListener('SIGINT', abortHandler);
1666
+ for (const f of tempFiles) { try { fs.unlinkSync(f); } catch {} }
1098
1667
  }
1099
1668
  }
1100
1669
 
@@ -1158,16 +1727,24 @@ async function runOnce(flags) {
1158
1727
 
1159
1728
  const wordCount = result.text.split(/\s+/).filter(Boolean).length;
1160
1729
 
1730
+ let output;
1161
1731
  if (flags.json) {
1162
1732
  const out = buildJsonOutput(
1163
1733
  { text: result.text, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
1164
1734
  { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
1165
1735
  );
1166
- process.stdout.write(JSON.stringify(out) + '\n');
1736
+ output = JSON.stringify(out, null, flags.output ? 2 : 0) + '\n';
1167
1737
  } else if (flags.diarize && result.segments) {
1168
- process.stdout.write(formatDiarizedText(result.segments) + '\n');
1738
+ output = formatDiarizedText(result.segments) + '\n';
1739
+ } else {
1740
+ output = result.text + '\n';
1741
+ }
1742
+
1743
+ if (flags.output) {
1744
+ fs.writeFileSync(flags.output, output);
1745
+ process.stderr.write(`Saved to ${flags.output}\n`);
1169
1746
  } else {
1170
- process.stdout.write(result.text + '\n');
1747
+ process.stdout.write(output);
1171
1748
  }
1172
1749
 
1173
1750
  return EXIT_OK;
@@ -1204,6 +1781,7 @@ async function runStream(flags) {
1204
1781
  let chunkStart = Date.now();
1205
1782
  let chunkIndex = 0;
1206
1783
  const pending = [];
1784
+ const outputParts = []; // collect output for --output
1207
1785
 
1208
1786
  recProc.stdout.on('data', (chunk) => {
1209
1787
  chunks.push(chunk);
@@ -1230,17 +1808,23 @@ async function runStream(flags) {
1230
1808
  .then((result) => {
1231
1809
  if (!result.text) return;
1232
1810
  const wordCount = result.text.split(/\s+/).filter(Boolean).length;
1811
+ let chunk_output;
1233
1812
  if (flags.json) {
1234
1813
  const out = buildJsonOutput(
1235
1814
  { text: result.text, chunk: idx, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
1236
1815
  { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
1237
1816
  );
1238
- process.stdout.write(JSON.stringify(out) + '\n');
1817
+ chunk_output = JSON.stringify(out, null, flags.output ? 2 : 0) + '\n';
1239
1818
  } else if (flags.diarize && result.segments) {
1240
1819
  const sep = flags.noNewline ? ' ' : '\n';
1241
- process.stdout.write(formatDiarizedText(result.segments) + sep);
1820
+ chunk_output = formatDiarizedText(result.segments) + sep;
1821
+ } else {
1822
+ chunk_output = result.text + (flags.noNewline ? ' ' : '\n');
1823
+ }
1824
+ if (flags.output) {
1825
+ outputParts[idx] = chunk_output;
1242
1826
  } else {
1243
- process.stdout.write(result.text + (flags.noNewline ? ' ' : '\n'));
1827
+ process.stdout.write(chunk_output);
1244
1828
  }
1245
1829
  })
1246
1830
  .catch((err) => {
@@ -1267,17 +1851,23 @@ async function runStream(flags) {
1267
1851
  const result = await transcribeBuffer(chunks, { timestamps: flags.timestamps, diarize: flags.diarize });
1268
1852
  if (result.text) {
1269
1853
  const wordCount = result.text.split(/\s+/).filter(Boolean).length;
1854
+ let chunk_output;
1270
1855
  if (flags.json) {
1271
1856
  const out = buildJsonOutput(
1272
1857
  { text: result.text, chunk: idx, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
1273
1858
  { segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
1274
1859
  );
1275
- process.stdout.write(JSON.stringify(out) + '\n');
1860
+ chunk_output = JSON.stringify(out, null, flags.output ? 2 : 0) + '\n';
1276
1861
  } else if (flags.diarize && result.segments) {
1277
1862
  const sep = flags.noNewline ? ' ' : '\n';
1278
- process.stdout.write(formatDiarizedText(result.segments) + sep);
1863
+ chunk_output = formatDiarizedText(result.segments) + sep;
1864
+ } else {
1865
+ chunk_output = result.text + (flags.noNewline ? ' ' : '\n');
1866
+ }
1867
+ if (flags.output) {
1868
+ outputParts[idx] = chunk_output;
1279
1869
  } else {
1280
- process.stdout.write(result.text + (flags.noNewline ? ' ' : '\n'));
1870
+ process.stdout.write(chunk_output);
1281
1871
  }
1282
1872
  }
1283
1873
  } catch (err) {
@@ -1288,8 +1878,13 @@ async function runStream(flags) {
1288
1878
  // Wait for any in-flight transcriptions to finish
1289
1879
  await Promise.allSettled(pending);
1290
1880
 
1881
+ if (flags.output && outputParts.length) {
1882
+ fs.writeFileSync(flags.output, outputParts.filter(Boolean).join(''));
1883
+ process.stderr.write(`Saved to ${flags.output}\n`);
1884
+ }
1885
+
1291
1886
  // Final newline for --no-newline so shell prompt starts on a new line
1292
- if (flags.noNewline && !flags.json) process.stdout.write('\n');
1887
+ if (!flags.output && flags.noNewline && !flags.json) process.stdout.write('\n');
1293
1888
 
1294
1889
  return EXIT_OK;
1295
1890
  } catch (err) {
@@ -1318,6 +1913,26 @@ function quit() {
1318
1913
 
1319
1914
  // ── Main ──────────────────────────────────────────────────────────────────────
1320
1915
 
1916
+ function flagVal(args, name, hint, { valid, numeric } = {}) {
1917
+ const i = args.indexOf(name);
1918
+ if (i === -1) return '';
1919
+ const v = args[i + 1];
1920
+ if (!v || v.startsWith('-')) {
1921
+ const h = hint ? ` (${hint})` : '';
1922
+ process.stderr.write(`Error: ${name} requires a value${h}\n`);
1923
+ process.exit(EXIT_CONFIG);
1924
+ }
1925
+ if (valid && !valid.includes(v)) {
1926
+ process.stderr.write(`Error: invalid value for ${name}: '${v}' (${hint})\n`);
1927
+ process.exit(EXIT_CONFIG);
1928
+ }
1929
+ if (numeric && !Number.isFinite(parseFloat(v))) {
1930
+ process.stderr.write(`Error: ${name} must be a number\n`);
1931
+ process.exit(EXIT_CONFIG);
1932
+ }
1933
+ return v;
1934
+ }
1935
+
1321
1936
  async function main() {
1322
1937
  const args = process.argv.slice(2);
1323
1938
  const flags = {
@@ -1326,15 +1941,42 @@ async function main() {
1326
1941
  noInput: args.includes('--no-input'),
1327
1942
  setup: args.includes('--setup') || args[0] === 'setup',
1328
1943
  stream: args.includes('--stream'),
1329
- silence: args.includes('--silence') ? (Number.isFinite(parseFloat(args[args.indexOf('--silence') + 1])) ? parseFloat(args[args.indexOf('--silence') + 1]) : 2.0) : 2.0,
1330
- pause: args.includes('--pause') ? parseFloat(args[args.indexOf('--pause') + 1]) || 1.0 : 1.0,
1331
- language: args.includes('--language') ? args[args.indexOf('--language') + 1] || '' : '',
1332
- file: args.includes('--file') ? args[args.indexOf('--file') + 1] || '' : '',
1944
+ silence: args.includes('--silence') ? parseFloat(flagVal(args, '--silence', 'seconds', { numeric: true })) : 2.0,
1945
+ pause: args.includes('--pause') ? parseFloat(flagVal(args, '--pause', 'seconds', { numeric: true })) : 1.0,
1946
+ language: flagVal(args, '--language', 'e.g. en, de, fr'),
1947
+ file: flagVal(args, '--file', 'path to audio file'),
1333
1948
  noNewline: args.includes('--no-newline') || args.includes('-n'),
1334
- timestamps: args.includes('--timestamps') ? args[args.indexOf('--timestamps') + 1] || '' : '',
1949
+ timestamps: flagVal(args, '--timestamps', 'segment or word', { valid: ['segment', 'word'] }),
1335
1950
  diarize: args.includes('--diarize'),
1951
+ output: flagVal(args, '--output', 'path') || flagVal(args, '-o', 'path'),
1336
1952
  };
1337
1953
 
1954
+ // Reject unknown flags and arguments
1955
+ const knownFlags = new Set([
1956
+ '--json', '--quiet', '-q', '--no-input', '--setup', '--stream',
1957
+ '--no-newline', '-n', '--diarize', '--version', '--update',
1958
+ '--help', '-h', '--no-color',
1959
+ '--silence', '--pause', '--language', '--file', '--timestamps',
1960
+ '--output', '-o',
1961
+ ]);
1962
+ const knownCommands = new Set(['setup', 'update']);
1963
+ const valueTakers = new Set(['--silence', '--pause', '--language', '--file', '--timestamps', '--output', '-o']);
1964
+ for (let i = 0; i < args.length; i++) {
1965
+ const a = args[i];
1966
+ if (a.startsWith('-')) {
1967
+ if (!knownFlags.has(a)) {
1968
+ process.stderr.write(`Unknown flag: ${a}\nRun dikt --help for usage.\n`);
1969
+ process.exit(EXIT_CONFIG);
1970
+ }
1971
+ if (valueTakers.has(a)) i++; // skip value
1972
+ } else if (knownCommands.has(a)) {
1973
+ // ok — subcommand
1974
+ } else {
1975
+ process.stderr.write(`Unexpected argument: ${a}\nRun dikt --help for usage.\n`);
1976
+ process.exit(EXIT_CONFIG);
1977
+ }
1978
+ }
1979
+
1338
1980
  if (args.includes('--version')) {
1339
1981
  console.log(`dikt v${VERSION}`);
1340
1982
  process.exit(EXIT_OK);
@@ -1374,12 +2016,13 @@ Options:
1374
2016
  --json Record once, output JSON to stdout
1375
2017
  -q, --quiet Record once, print transcript to stdout
1376
2018
  --stream Stream transcription chunks on pauses
1377
- --file <path> Transcribe an audio file (no mic needed)
2019
+ --file <path|url> Transcribe audio file or URL (via yt-dlp)
2020
+ -o, --output <path> Write output to file (.json auto-enables JSON)
1378
2021
  --silence <seconds> Silence duration before auto-stop (default: 2.0)
1379
2022
  --pause <seconds> Pause duration to split chunks (default: 1.0)
1380
2023
  --language <code> Language code, e.g. en, de, fr (default: auto)
1381
2024
  -n, --no-newline Join stream chunks without newlines
1382
- --timestamps <granularity> Add timestamps: segment, word, or segment,word
2025
+ --timestamps <granularity> Add timestamps: segment or word
1383
2026
  --diarize Enable speaker identification
1384
2027
  --no-input Fail if config is missing (no wizard)
1385
2028
  --no-color Disable colored output
@@ -1404,6 +2047,9 @@ Examples:
1404
2047
  dikt -q | claude Dictate a prompt to Claude Code
1405
2048
  dikt update Update to the latest version
1406
2049
  dikt --file meeting.wav Transcribe an existing audio file
2050
+ dikt --file a.wav -o a.json Transcribe to a JSON file
2051
+ dikt --file a.wav -o a.txt Transcribe to a text file
2052
+ dikt --file https://youtube.com/watch?v=ID Transcribe from URL
1407
2053
  dikt --stream --silence 0 Stream continuously until Ctrl+C
1408
2054
  dikt --stream -n Stream as continuous flowing text
1409
2055
  dikt -q --json --diarize Transcribe with speaker labels
@@ -1417,13 +2063,13 @@ Environment variables:
1417
2063
 
1418
2064
  Exit codes:
1419
2065
  0 Success
1420
- 1 Missing dependency (sox)
2066
+ 1 Missing dependency (sox/ffmpeg)
1421
2067
  2 Not a terminal
1422
2068
  3 Configuration error
1423
2069
  4 Transcription error
1424
2070
 
1425
2071
  Config: ${CONFIG_DIR}/config.json
1426
- Requires: sox (brew install sox)`);
2072
+ Requires: sox (recording), ffmpeg (--file optimization), yt-dlp (URLs, optional)`);
1427
2073
  process.exit(EXIT_OK);
1428
2074
  }
1429
2075
 
@@ -1445,8 +2091,12 @@ Requires: sox (brew install sox)`);
1445
2091
 
1446
2092
  applyEnvOverrides(config);
1447
2093
  if (flags.language) config.language = flags.language;
1448
- if (!flags.timestamps && config.timestamps) flags.timestamps = config.timestamps;
2094
+ if (!flags.timestamps && config.timestamps) {
2095
+ // Migrate legacy 'segment,word' → 'word' (combined option removed)
2096
+ flags.timestamps = config.timestamps === 'segment,word' ? 'word' : config.timestamps;
2097
+ }
1449
2098
  if (!flags.diarize && config.diarize) flags.diarize = true;
2099
+ if (flags.output && flags.output.endsWith('.json')) flags.json = true;
1450
2100
 
1451
2101
  const validation = validateConfig(config);
1452
2102
  if (!validation.valid) {
@@ -1457,14 +2107,27 @@ Requires: sox (brew install sox)`);
1457
2107
  }
1458
2108
 
1459
2109
  // Validate incompatible flag combinations
2110
+ // Only error when both sides are CLI-passed. When one comes from config,
2111
+ // let the explicit CLI flag win and silently drop the config value.
2112
+ const cliLanguage = args.includes('--language');
2113
+ const cliTimestamps = args.includes('--timestamps');
2114
+ const cliDiarize = args.includes('--diarize');
1460
2115
  const lang = config.language;
1461
2116
  if (lang && flags.timestamps) {
1462
- process.stderr.write('Error: --timestamps and --language cannot be used together\n');
1463
- process.exit(EXIT_CONFIG);
2117
+ if (cliLanguage && cliTimestamps) {
2118
+ process.stderr.write('Error: --timestamps and --language cannot be used together\n');
2119
+ process.exit(EXIT_CONFIG);
2120
+ }
2121
+ if (cliLanguage) flags.timestamps = '';
2122
+ else config.language = '';
1464
2123
  }
1465
2124
  if (lang && flags.diarize) {
1466
- process.stderr.write('Error: --diarize and --language cannot be used together\n');
1467
- process.exit(EXIT_CONFIG);
2125
+ if (cliLanguage && cliDiarize) {
2126
+ process.stderr.write('Error: --diarize and --language cannot be used together\n');
2127
+ process.exit(EXIT_CONFIG);
2128
+ }
2129
+ if (cliLanguage) flags.diarize = false;
2130
+ else config.language = '';
1468
2131
  }
1469
2132
  if (flags.diarize && flags.stream) {
1470
2133
  process.stderr.write('Error: --diarize is not compatible with --stream, use -q --diarize instead\n');
@@ -1488,6 +2151,11 @@ Requires: sox (brew install sox)`);
1488
2151
  process.exit(await runOnce(flags));
1489
2152
  }
1490
2153
 
2154
+ // Warn about flags that don't apply to interactive mode
2155
+ if (flags.output) {
2156
+ process.stderr.write(`Warning: --output is ignored in interactive mode. Use with --file, -q, or --stream.\n`);
2157
+ }
2158
+
1491
2159
  // Interactive TUI mode
1492
2160
  checkTTY();
1493
2161
 
@@ -1514,7 +2182,7 @@ Requires: sox (brew install sox)`);
1514
2182
  }
1515
2183
 
1516
2184
  main().catch((err) => {
1517
- process.stdout.write(SHOW_CURSOR + ALT_SCREEN_OFF);
2185
+ if (process.stdout.isTTY) process.stdout.write(SHOW_CURSOR + ALT_SCREEN_OFF);
1518
2186
  console.error(err);
1519
2187
  process.exit(EXIT_DEPENDENCY);
1520
2188
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "dikt",
3
- "version": "1.1.2",
3
+ "version": "1.3.0",
4
4
  "description": "Voice dictation for the terminal.",
5
5
  "type": "module",
6
6
  "bin": {