dikt 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +13 -4
  2. package/cli.mjs +594 -33
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -23,6 +23,11 @@ sudo apt install sox
23
23
  sudo pacman -S sox
24
24
  ```
25
25
 
26
+ Optional dependencies for `--file` mode:
27
+
28
+ - [ffmpeg](https://ffmpeg.org/) — enables compression, chunked transcription of long files, and broader format support
29
+ - [yt-dlp](https://github.com/yt-dlp/yt-dlp) — enables transcribing audio from URLs (YouTube, podcasts, etc.)
30
+
26
31
  ## Setup
27
32
 
28
33
  On first run, dikt will prompt you for your Mistral API key and model preferences:
@@ -90,7 +95,7 @@ dikt --stream --silence 0
90
95
 
91
96
  ### File mode
92
97
 
93
- Transcribe an existing audio file (wav, mp3, m4a, flac, ogg, webm no sox needed):
98
+ Transcribe an existing audio file (wav, mp3, m4a, flac, ogg, webm, aac, wma, and more):
94
99
 
95
100
  ```bash
96
101
  dikt --file meeting.wav
@@ -101,6 +106,10 @@ dikt --file meeting.wav -o transcript.txt
101
106
 
102
107
  # With JSON output
103
108
  dikt --file recording.mp3 --json
109
+
110
+ # Transcribe from a URL (requires yt-dlp)
111
+ dikt --file https://youtube.com/watch?v=VIDEO_ID
112
+ dikt --file https://youtube.com/watch?v=VIDEO_ID -o transcript.txt
104
113
  ```
105
114
 
106
115
  ### Speaker identification & timestamps
@@ -112,7 +121,7 @@ dikt -q --diarize
112
121
  # Timestamps
113
122
  dikt -q --timestamps segment
114
123
  dikt -q --timestamps word
115
- dikt -q --timestamps segment,word
124
+ dikt --file lecture.mp3 --timestamps segment
116
125
 
117
126
  # Combined with JSON
118
127
  dikt -q --json --diarize
@@ -122,7 +131,7 @@ dikt -q --json --diarize
122
131
 
123
132
  | Flag | Description |
124
133
  |---|---|
125
- | `--file <path>` | Transcribe an audio file (no mic needed) |
134
+ | `--file <path\|url>` | Transcribe audio file or URL (via yt-dlp) |
126
135
  | `-o`, `--output <path>` | Write output to file (`.json` auto-enables JSON) |
127
136
  | `--stream` | Stream transcription chunks on pauses |
128
137
  | `--json` | Output JSON (single-shot or stream) |
@@ -130,7 +139,7 @@ dikt -q --json --diarize
130
139
  | `--silence <seconds>` | Silence duration before auto-stop (default: 2.0) |
131
140
  | `--pause <seconds>` | Pause duration to split stream chunks (default: 1.0) |
132
141
  | `--language <code>` | Language code, e.g. en, de, fr (default: auto) |
133
- | `--timestamps <granularity>` | Add timestamps: segment, word, or segment,word |
142
+ | `--timestamps <granularity>` | Add timestamps: segment or word |
134
143
  | `--diarize` | Enable speaker identification |
135
144
  | `-n`, `--no-newline` | Join stream chunks without newlines |
136
145
  | `--no-color` | Disable colored output |
package/cli.mjs CHANGED
@@ -6,7 +6,10 @@ import fs from 'node:fs';
6
6
  import path from 'node:path';
7
7
  import os from 'node:os';
8
8
  import readline from 'node:readline';
9
- import { spawn, execFileSync } from 'node:child_process';
9
+ import { spawn, execFileSync, execFile as execFileCb } from 'node:child_process';
10
+ import { promisify } from 'node:util';
11
+ const execFileAsync = promisify(execFileCb);
12
+ import https from 'node:https';
10
13
 
11
14
  // ── ANSI helpers ──────────────────────────────────────────────────────────────
12
15
 
@@ -37,9 +40,16 @@ if (process.env.NO_COLOR != null || process.env.TERM === 'dumb' || process.argv.
37
40
 
38
41
  const moveTo = (row, col = 1) => `${ESC}${row};${col}H`;
39
42
 
43
+ function formatFileSize(bytes) {
44
+ if (bytes < 1024) return `${bytes} B`;
45
+ if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
46
+ if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
47
+ return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
48
+ }
49
+
40
50
  // ── Constants ─────────────────────────────────────────────────────────────────
41
51
 
42
- const VERSION = '1.2.0';
52
+ const VERSION = '1.4.0';
43
53
  const CONFIG_BASE = process.env.XDG_CONFIG_HOME || path.join(os.homedir(), '.config');
44
54
  const CONFIG_DIR = path.join(CONFIG_BASE, 'dikt');
45
55
  const CONFIG_FILE = path.join(CONFIG_DIR, 'config.json');
@@ -47,6 +57,56 @@ const MAX_HISTORY = 10;
47
57
  const MIN_RECORDING_MS = 500;
48
58
  const COST_PER_MIN = 0.003;
49
59
  const SPINNER = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
60
+ const TARGET_CHUNK_SEC = 270; // ~4.5 min target chunk size
61
+ const CHUNK_MIN_SEC = 360; // only chunk files longer than 6 minutes
62
+ const SPLIT_SEARCH_SEC = 30; // search ±30s around target for silence split point
63
+ const MIN_CHUNK_SEC = 30; // merge chunks shorter than this into neighbor
64
+ const MAX_PARALLEL = 4; // max concurrent API requests
65
+ const MIME_TYPES = { wav: 'audio/wav', mp3: 'audio/mpeg', ogg: 'audio/ogg', flac: 'audio/flac', opus: 'audio/ogg', webm: 'audio/webm', m4a: 'audio/mp4', aac: 'audio/aac', wma: 'audio/x-ms-wma', aif: 'audio/aiff', aiff: 'audio/aiff', mp4: 'audio/mp4', oga: 'audio/ogg', amr: 'audio/amr', caf: 'audio/x-caf' };
66
+ const COMPRESSIBLE = new Set(['wav', 'flac', 'aiff', 'aif', 'raw', 'caf']); // lossless formats worth re-encoding
67
+
68
+ function createStderrSpinner() {
69
+ let frame = 0;
70
+ let interval = null;
71
+ let currentMsg = '';
72
+ const isTTY = process.stderr.isTTY;
73
+ const render = () => {
74
+ const sp = SPINNER[frame++ % SPINNER.length];
75
+ process.stderr.write(`\r${CLEAR_LINE}${YELLOW}${sp}${RESET} ${currentMsg}`);
76
+ };
77
+
78
+ return {
79
+ start(msg) {
80
+ currentMsg = msg;
81
+ if (isTTY) {
82
+ render();
83
+ interval = setInterval(render, 80);
84
+ } else {
85
+ process.stderr.write(`${currentMsg}\n`);
86
+ }
87
+ },
88
+ update(msg) {
89
+ currentMsg = msg;
90
+ if (isTTY) {
91
+ // Restart interval — prevents queued callbacks from firing after sync calls
92
+ if (interval) { clearInterval(interval); }
93
+ render();
94
+ interval = setInterval(render, 80);
95
+ } else {
96
+ process.stderr.write(`${msg}\n`);
97
+ }
98
+ },
99
+ stop(finalMsg) {
100
+ if (interval) { clearInterval(interval); interval = null; }
101
+ if (isTTY) {
102
+ process.stderr.write(`\r${CLEAR_LINE}`);
103
+ if (finalMsg) process.stderr.write(`${finalMsg}\n`);
104
+ } else if (finalMsg) {
105
+ process.stderr.write(`${finalMsg}\n`);
106
+ }
107
+ },
108
+ };
109
+ }
50
110
 
51
111
  const EXIT_OK = 0;
52
112
  const EXIT_DEPENDENCY = 1;
@@ -93,8 +153,8 @@ function validateConfig(cfg) {
93
153
 
94
154
  // ── Setup wizard (form-based) ─────────────────────────────────────────────────
95
155
 
96
- const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word', 'segment,word': 'both' };
97
- const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word', 'both': 'segment,word' };
156
+ const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word' };
157
+ const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word' };
98
158
 
99
159
  async function setupWizard() {
100
160
  const existing = loadConfig() || {};
@@ -105,7 +165,7 @@ async function setupWizard() {
105
165
  { key: 'language', label: 'Language', type: 'text', value: '', display: existing.language || 'auto', fallback: existing.language || '' },
106
166
  { key: 'temperature', label: 'Temperature', type: 'text', value: '', display: existing.temperature != null ? String(existing.temperature) : 'default', fallback: existing.temperature != null ? String(existing.temperature) : '' },
107
167
  { key: 'contextBias', label: 'Context bias', type: 'text', value: '', display: existing.contextBias || '', fallback: existing.contextBias || '' },
108
- { key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word', 'both'], idx: ['off', 'segment', 'word', 'both'].indexOf(TIMESTAMPS_DISPLAY[existing.timestamps || ''] || 'off') },
168
+ { key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word'], idx: ['off', 'segment', 'word'].indexOf(TIMESTAMPS_DISPLAY[existing.timestamps || ''] || 'off') },
109
169
  { key: 'diarize', label: 'Diarize', type: 'select', options: ['off', 'on'], idx: existing.diarize ? 1 : 0 },
110
170
  ];
111
171
 
@@ -398,13 +458,22 @@ function renderKeybar() {
398
458
  return ` ${DIM}[SPACE]${RESET} Record ${copyKey}${autoCopyKey}${histKey}${retryKey}`.trimEnd();
399
459
  }
400
460
 
461
+ function formatDuration(seconds) {
462
+ if (seconds < 60) return `${seconds.toFixed(1)}s`;
463
+ const m = Math.floor(seconds / 60);
464
+ const s = (seconds % 60).toFixed(1).padStart(4, '0');
465
+ if (m < 60) return `${m}m ${s}s`;
466
+ const h = Math.floor(m / 60);
467
+ const rm = String(m % 60).padStart(2, '0');
468
+ return `${h}h ${rm}m ${s}s`;
469
+ }
470
+
401
471
  function renderStatus() {
402
472
  switch (state.mode) {
403
473
  case 'idle':
404
474
  return ` ${GREY}● Idle${RESET}`;
405
475
  case 'recording': {
406
- const secs = state.duration.toFixed(1);
407
- return ` ${RED}${BOLD}● Recording${RESET} ${RED}${secs}s${RESET}`;
476
+ return ` ${RED}${BOLD}● Recording${RESET} ${RED}${formatDuration(state.duration)}${RESET}`;
408
477
  }
409
478
  case 'transcribing': {
410
479
  const sp = SPINNER[state.spinnerFrame % SPINNER.length];
@@ -498,7 +567,7 @@ function renderMeta() {
498
567
  const cost = (state.duration / 60 * COST_PER_MIN).toFixed(4);
499
568
  const latencyStr = state.latency ? `${(state.latency / 1000).toFixed(1)}s` : '—';
500
569
  const histLabel = state.historyIndex >= 0 ? ` · history ${state.historyIndex + 1}/${state.history.length}` : '';
501
- return ` ${DIM}${state.wordCount} words · ${state.duration.toFixed(1)}s · latency ${latencyStr} · $${cost}${histLabel}${RESET}`;
570
+ return ` ${DIM}${state.wordCount} words · ${formatDuration(state.duration)} · latency ${latencyStr} · $${cost}${histLabel}${RESET}`;
502
571
  }
503
572
 
504
573
  function renderHelp() {
@@ -953,7 +1022,7 @@ function trimSilence(rawData) {
953
1022
  return Buffer.concat(output);
954
1023
  }
955
1024
 
956
- async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
1025
+ async function callTranscribeAPI(file, { signal, timestamps, diarize, onProgress } = {}) {
957
1026
  const fd = new FormData();
958
1027
  fd.append('file', file);
959
1028
  fd.append('model', config.model);
@@ -961,7 +1030,7 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
961
1030
  if (config.temperature != null) fd.append('temperature', String(config.temperature));
962
1031
  if (config.contextBias) fd.append('context_bias', config.contextBias);
963
1032
  if (timestamps) {
964
- for (const g of timestamps.split(',')) fd.append('timestamp_granularities[]', g.trim());
1033
+ fd.append('timestamp_granularities[]', timestamps);
965
1034
  }
966
1035
  if (diarize) {
967
1036
  fd.append('diarize', 'true');
@@ -969,17 +1038,68 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
969
1038
  if (!timestamps) fd.append('timestamp_granularities[]', 'segment');
970
1039
  }
971
1040
 
972
- const t0 = Date.now();
973
- const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
1041
+ // Use Request to serialize FormData into multipart body,
1042
+ // then send via node:https which has no hardcoded headersTimeout
1043
+ // (Node's built-in fetch/undici has a 300s headersTimeout that
1044
+ // cannot be configured without importing undici as a dependency).
1045
+ const req = new Request('https://api.mistral.ai/v1/audio/transcriptions', {
974
1046
  method: 'POST',
975
1047
  headers: { Authorization: `Bearer ${config.apiKey}` },
976
1048
  body: fd,
977
- signal: signal || AbortSignal.timeout(30_000),
1049
+ });
1050
+ const contentType = req.headers.get('content-type');
1051
+ const body = Buffer.from(await req.arrayBuffer());
1052
+
1053
+ const t0 = Date.now();
1054
+ const { status, raw } = await new Promise((resolve, reject) => {
1055
+ const hreq = https.request('https://api.mistral.ai/v1/audio/transcriptions', {
1056
+ method: 'POST',
1057
+ headers: {
1058
+ 'Authorization': `Bearer ${config.apiKey}`,
1059
+ 'Content-Type': contentType,
1060
+ 'Content-Length': body.length,
1061
+ },
1062
+ }, (res) => {
1063
+ const chunks = [];
1064
+ res.on('data', (c) => chunks.push(c));
1065
+ res.on('end', () => resolve({ status: res.statusCode, raw: Buffer.concat(chunks).toString() }));
1066
+ res.on('error', reject);
1067
+ });
1068
+
1069
+ hreq.on('error', (err) => {
1070
+ const ne = new Error(`Network error: ${err.message}`);
1071
+ ne.networkError = true;
1072
+ reject(ne);
1073
+ });
1074
+
1075
+ const abortSig = signal || AbortSignal.timeout(30_000);
1076
+ if (abortSig.aborted) { hreq.destroy(); reject(new DOMException('The operation was aborted', 'AbortError')); return; }
1077
+ abortSig.addEventListener('abort', () => {
1078
+ hreq.destroy();
1079
+ reject(abortSig.reason instanceof DOMException ? abortSig.reason
1080
+ : new DOMException('The operation was aborted', 'AbortError'));
1081
+ }, { once: true });
1082
+
1083
+ // Write body in chunks to enable upload progress tracking
1084
+ const CHUNK_SIZE = 256 * 1024;
1085
+ let written = 0;
1086
+ const total = body.length;
1087
+ const writeChunks = () => {
1088
+ while (written < total) {
1089
+ const end = Math.min(written + CHUNK_SIZE, total);
1090
+ const ok = hreq.write(body.subarray(written, end));
1091
+ written = end;
1092
+ if (onProgress) onProgress(written, total);
1093
+ if (!ok) { hreq.once('drain', writeChunks); return; }
1094
+ }
1095
+ if (onProgress) onProgress(-1, total); // upload done, server processing
1096
+ hreq.end();
1097
+ };
1098
+ writeChunks();
978
1099
  });
979
1100
  const latency = Date.now() - t0;
980
1101
 
981
- if (!resp.ok) {
982
- const raw = await resp.text().catch(() => '');
1102
+ if (status < 200 || status >= 300) {
983
1103
  let msg;
984
1104
  try {
985
1105
  const e = JSON.parse(raw);
@@ -992,14 +1112,14 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
992
1112
  }
993
1113
  if (!msg) msg = raw;
994
1114
  } catch {
995
- msg = raw || `HTTP ${resp.status}`;
1115
+ msg = raw || `HTTP ${status}`;
996
1116
  }
997
1117
  const err = new Error(msg);
998
- err.status = resp.status;
1118
+ err.status = status;
999
1119
  throw err;
1000
1120
  }
1001
1121
 
1002
- const data = await resp.json();
1122
+ const data = JSON.parse(raw);
1003
1123
  const text = (data.text || '').trim();
1004
1124
  return { text, latency, segments: data.segments, words: data.words };
1005
1125
  }
@@ -1059,28 +1179,297 @@ function buildJsonOutput(base, { segments, words, timestamps, diarize } = {}) {
1059
1179
  return out;
1060
1180
  }
1061
1181
 
1182
+ // ── File optimization helpers ────────────────────────────────────────────────
1183
+
1184
+ let _ffmpegAvail;
1185
+ function ffmpegAvailable() {
1186
+ if (_ffmpegAvail !== undefined) return _ffmpegAvail;
1187
+ try {
1188
+ execFileSync('ffmpeg', ['-version'], { stdio: 'pipe' });
1189
+ execFileSync('ffprobe', ['-version'], { stdio: 'pipe' });
1190
+ _ffmpegAvail = true;
1191
+ } catch { _ffmpegAvail = false; }
1192
+ return _ffmpegAvail;
1193
+ }
1194
+
1195
+ let _ytdlpAvail;
1196
+ function ytdlpAvailable() {
1197
+ if (_ytdlpAvail !== undefined) return _ytdlpAvail;
1198
+ try { execFileSync('yt-dlp', ['--version'], { stdio: 'pipe' }); _ytdlpAvail = true; }
1199
+ catch { _ytdlpAvail = false; }
1200
+ return _ytdlpAvail;
1201
+ }
1202
+
1203
+ function downloadWithYtdlp(url, spinner) {
1204
+ const tmpBase = path.join(os.tmpdir(), `dikt-ytdlp-${process.pid}-${Date.now()}`);
1205
+ const outTemplate = `${tmpBase}.%(ext)s`;
1206
+
1207
+ return new Promise((resolve, reject) => {
1208
+ const proc = spawn('yt-dlp', [
1209
+ '-x', '--audio-format', 'opus', '--audio-quality', '48K',
1210
+ '-o', outTemplate, '--no-playlist', '--newline', url,
1211
+ ], { stdio: ['ignore', 'pipe', 'pipe'] });
1212
+
1213
+ const cleanupPartial = () => {
1214
+ const dir = path.dirname(tmpBase);
1215
+ const prefix = path.basename(tmpBase);
1216
+ try {
1217
+ for (const f of fs.readdirSync(dir)) {
1218
+ if (f.startsWith(prefix) && f.length > prefix.length) try { fs.unlinkSync(path.join(dir, f)); } catch {}
1219
+ }
1220
+ } catch {}
1221
+ };
1222
+
1223
+ let aborted = false;
1224
+ const onSigint = () => { aborted = true; proc.kill(); };
1225
+ process.on('SIGINT', onSigint);
1226
+
1227
+ let lastErr = '';
1228
+ const parseOutput = (chunk) => {
1229
+ const lines = chunk.toString().split('\n');
1230
+ for (const line of lines) {
1231
+ if (!line.trim()) continue;
1232
+ const dl = line.match(/\[download\]\s+([\d.]+)%/);
1233
+ if (dl) { spinner.update(`Downloading... ${Math.round(parseFloat(dl[1]))}%`); continue; }
1234
+ if (/\[ExtractAudio\]/.test(line)) { spinner.update('Converting audio...'); continue; }
1235
+ if (/\[download\]\s+Destination:/.test(line)) { spinner.update('Downloading...'); continue; }
1236
+ }
1237
+ };
1238
+ proc.stdout.on('data', parseOutput);
1239
+ proc.stderr.on('data', (chunk) => {
1240
+ lastErr = chunk.toString().trim().split('\n').pop();
1241
+ parseOutput(chunk);
1242
+ });
1243
+
1244
+ proc.on('close', (code) => {
1245
+ process.removeListener('SIGINT', onSigint);
1246
+ if (aborted) { cleanupPartial(); return reject(new Error('Download aborted')); }
1247
+ if (code !== 0) { cleanupPartial(); return reject(new Error(lastErr || `yt-dlp exited with code ${code}`)); }
1248
+ // yt-dlp may produce a different extension than requested; find the actual file
1249
+ const dir = path.dirname(tmpBase);
1250
+ const prefix = path.basename(tmpBase);
1251
+ try {
1252
+ const match = fs.readdirSync(dir).find(f => f.startsWith(prefix) && f.length > prefix.length);
1253
+ if (!match) return reject(new Error('yt-dlp produced no output file'));
1254
+ resolve(path.join(dir, match));
1255
+ } catch (err) { reject(err); }
1256
+ });
1257
+ });
1258
+ }
1259
+
1260
+ function getAudioDuration(filePath) {
1261
+ try {
1262
+ const out = execFileSync('ffprobe', ['-v', 'quiet', '-show_entries', 'format=duration', '-of', 'csv=p=0', filePath], { stdio: 'pipe', encoding: 'utf8' });
1263
+ return parseFloat(out.trim()) || 0;
1264
+ } catch { return 0; }
1265
+ }
1266
+
1267
+ async function compressAudio(inputPath) {
1268
+ const base = path.join(os.tmpdir(), `dikt-${process.pid}-${Date.now()}-${path.basename(inputPath, path.extname(inputPath))}`);
1269
+ for (const codec of ['libopus', 'libvorbis']) {
1270
+ const outPath = `${base}.ogg`;
1271
+ try {
1272
+ await execFileAsync('ffmpeg', ['-i', inputPath, '-c:a', codec, '-b:a', '48k', '-y', '-v', 'quiet', outPath], { stdio: 'pipe' });
1273
+ if (fs.statSync(outPath).size > 0) return outPath;
1274
+ try { fs.unlinkSync(outPath); } catch {}
1275
+ } catch { try { fs.unlinkSync(outPath); } catch {} }
1276
+ }
1277
+ return null;
1278
+ }
1279
+
1280
+ async function findSilenceSplitPoint(filePath, targetSec) {
1281
+ const startSec = Math.max(0, targetSec - SPLIT_SEARCH_SEC);
1282
+ const durSec = SPLIT_SEARCH_SEC * 2;
1283
+
1284
+ try {
1285
+ // Extract a small window of raw PCM around the target for silence analysis
1286
+ const { stdout: raw } = await execFileAsync('ffmpeg', [
1287
+ '-ss', String(startSec), '-t', String(durSec), '-i', filePath,
1288
+ '-f', 's16le', '-ar', '16000', '-ac', '1', '-v', 'quiet', '-',
1289
+ ], { encoding: 'buffer', maxBuffer: 16000 * 2 * durSec + 4096 });
1290
+
1291
+ // Scan for silence in 50ms windows
1292
+ const WINDOW_BYTES = Math.round(16000 * 0.05) * 2; // 50ms at 16kHz 16-bit mono
1293
+ let bestOffset = -1, bestLen = 0;
1294
+ let runStart = -1, runLen = 0;
1295
+
1296
+ for (let offset = 0; offset + WINDOW_BYTES <= raw.length; offset += WINDOW_BYTES) {
1297
+ const peak = peakAmplitude(raw.subarray(offset, offset + WINDOW_BYTES));
1298
+ if (peak < SILENCE_THRESHOLD) {
1299
+ if (runStart === -1) runStart = offset;
1300
+ runLen++;
1301
+ } else {
1302
+ if (runLen > bestLen) { bestOffset = runStart; bestLen = runLen; }
1303
+ runStart = -1; runLen = 0;
1304
+ }
1305
+ }
1306
+ if (runLen > bestLen) { bestOffset = runStart; bestLen = runLen; }
1307
+
1308
+ if (bestLen >= 10) { // at least 500ms of silence (avoids mid-word splits)
1309
+ const centerBytes = bestOffset + Math.floor(bestLen / 2) * WINDOW_BYTES;
1310
+ return startSec + centerBytes / (16000 * 2);
1311
+ }
1312
+ } catch {}
1313
+
1314
+ return targetSec; // fallback: no silence found, split at target
1315
+ }
1316
+
1317
+ function cleanChunkText(t) {
1318
+ if (!t) return '';
1319
+ // Strip [PRINT_WORDLEVEL_TIME] markup the API sometimes spontaneously returns
1320
+ if (t.includes('[PRINT_WORDLEVEL_TIME]')) {
1321
+ t = t.replace(/\[PRINT_WORDLEVEL_TIME\]/g, '');
1322
+ t = t.replace(/<\/?\d{2}:\d{2}\.\d+>/g, '');
1323
+ t = t.replace(/\s+/g, ' ');
1324
+ }
1325
+ return t.trim();
1326
+ }
1327
+
1328
+ function mergeChunkResults(results, splitPoints) {
1329
+ // No overlap — just concatenate text, offset timestamps
1330
+ let text = results.map(r => cleanChunkText(r.text)).filter(Boolean).join(' ');
1331
+ // Fix missing spaces after punctuation (API omits leading spaces on some segments)
1332
+ text = text.replace(/([.!?,])([A-Za-z])/g, '$1 $2');
1333
+ let maxLatency = 0;
1334
+ const allSegments = [];
1335
+ const allWords = [];
1336
+
1337
+ const round1 = (n) => Math.round(n * 10) / 10;
1338
+ for (let i = 0; i < results.length; i++) {
1339
+ const r = results[i];
1340
+ const offset = splitPoints[i];
1341
+ if (r.latency > maxLatency) maxLatency = r.latency;
1342
+
1343
+ if (r.segments) {
1344
+ for (const seg of r.segments) {
1345
+ allSegments.push({ ...seg, start: round1(seg.start + offset), end: round1(seg.end + offset) });
1346
+ }
1347
+ }
1348
+ if (r.words) {
1349
+ for (const w of r.words) {
1350
+ allWords.push({ ...w, start: round1(w.start + offset), end: round1(w.end + offset) });
1351
+ }
1352
+ }
1353
+ }
1354
+
1355
+ return {
1356
+ text,
1357
+ latency: maxLatency,
1358
+ segments: allSegments.length ? allSegments : undefined,
1359
+ words: allWords.length ? allWords : undefined,
1360
+ };
1361
+ }
1362
+
1363
+ async function parallelMap(items, fn, concurrency) {
1364
+ const results = new Array(items.length);
1365
+ let next = 0;
1366
+ const worker = async () => { while (next < items.length) { const i = next++; results[i] = await fn(items[i], i); } };
1367
+ await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, () => worker()));
1368
+ return results;
1369
+ }
1370
+
1062
1371
  // ── File mode ────────────────────────────────────────────────────────────────
1063
1372
 
1064
1373
  async function runFile(flags) {
1374
+ const spinner = createStderrSpinner();
1375
+ let fileSize = 0;
1376
+ let transcribeTimer = null;
1377
+ const tempFiles = [];
1378
+
1065
1379
  try {
1066
- if (!flags.file || !fs.existsSync(flags.file)) {
1380
+ const isURL = /^https?:\/\//i.test(flags.file);
1381
+
1382
+ if (isURL) {
1383
+ if (!ytdlpAvailable()) {
1384
+ process.stderr.write(`\n${RED}${BOLD} yt-dlp not found.${RESET}\n\n`);
1385
+ process.stderr.write(` yt-dlp is required to download audio from URLs. Install it:\n\n`);
1386
+ if (process.platform === 'darwin') {
1387
+ process.stderr.write(` ${BOLD}brew install yt-dlp${RESET}\n\n`);
1388
+ } else if (process.platform === 'win32') {
1389
+ process.stderr.write(` ${BOLD}choco install yt-dlp${RESET} or ${BOLD}scoop install yt-dlp${RESET}\n\n`);
1390
+ } else {
1391
+ process.stderr.write(` ${BOLD}sudo apt install yt-dlp${RESET} (Debian/Ubuntu)\n`);
1392
+ process.stderr.write(` ${BOLD}pip install yt-dlp${RESET} (any platform)\n\n`);
1393
+ }
1394
+ return EXIT_DEPENDENCY;
1395
+ }
1396
+ spinner.start('Downloading audio...');
1397
+ try {
1398
+ const downloaded = await downloadWithYtdlp(flags.file, spinner);
1399
+ tempFiles.push(downloaded);
1400
+ flags = { ...flags, file: downloaded };
1401
+ } catch (err) {
1402
+ spinner.stop();
1403
+ process.stderr.write(`Error downloading: ${err.message}\n`);
1404
+ return EXIT_TRANSCRIPTION;
1405
+ }
1406
+ spinner.update('Processing audio...');
1407
+ } else if (!flags.file || !fs.existsSync(flags.file)) {
1067
1408
  process.stderr.write(`Error: file not found: ${flags.file}\n`);
1068
1409
  return EXIT_TRANSCRIPTION;
1410
+ } else {
1411
+ spinner.start('Reading file...');
1069
1412
  }
1413
+ fileSize = fs.statSync(flags.file).size;
1414
+ const ext = path.extname(flags.file).slice(1).toLowerCase() || 'wav';
1070
1415
 
1071
- const blob = await fs.openAsBlob(flags.file);
1072
- const ext = path.extname(flags.file).slice(1) || 'wav';
1073
- const mimeTypes = { wav: 'audio/wav', mp3: 'audio/mpeg', m4a: 'audio/mp4', ogg: 'audio/ogg', flac: 'audio/flac', webm: 'audio/webm' };
1074
- const mime = mimeTypes[ext] || 'audio/wav';
1075
- const file = new File([blob], path.basename(flags.file), { type: mime });
1416
+ // Check if ffmpeg is available for chunking / compression optimizations
1417
+ const hasFFmpeg = ffmpegAvailable();
1418
+ const duration = hasFFmpeg ? getAudioDuration(flags.file) : 0;
1419
+ const canChunk = hasFFmpeg && !flags.diarize && duration > CHUNK_MIN_SEC;
1420
+
1421
+ if (canChunk) {
1422
+ spinner.stop();
1423
+ return await runFileChunked(flags, { fileSize, duration });
1424
+ }
1425
+
1426
+ // Compress uncompressed formats (wav/flac → ogg) for faster upload
1427
+ let uploadPath = flags.file;
1428
+ let uploadExt = ext;
1429
+ if (hasFFmpeg && COMPRESSIBLE.has(ext)) {
1430
+ spinner.update('Compressing...');
1431
+ const compressed = await compressAudio(flags.file);
1432
+ if (compressed) {
1433
+ const newSize = fs.statSync(compressed).size;
1434
+ if (newSize < fileSize) {
1435
+ tempFiles.push(compressed);
1436
+ uploadPath = compressed;
1437
+ uploadExt = path.extname(compressed).slice(1);
1438
+ spinner.update(`Compressed ${formatFileSize(fileSize)} → ${formatFileSize(newSize)}`);
1439
+ } else {
1440
+ try { fs.unlinkSync(compressed); } catch {}
1441
+ }
1442
+ }
1443
+ }
1444
+
1445
+ const blob = await fs.openAsBlob(uploadPath);
1446
+ const mime = MIME_TYPES[uploadExt] || 'application/octet-stream';
1447
+ const file = new File([blob], path.basename(uploadPath), { type: mime });
1448
+
1449
+ spinner.update(`Uploading to API... (${formatFileSize(blob.size)})`);
1076
1450
 
1077
1451
  const ac = new AbortController();
1078
- const abortHandler = () => ac.abort();
1452
+ const abortHandler = () => { spinner.stop('Aborting...'); ac.abort(); };
1079
1453
  process.on('SIGINT', abortHandler);
1080
1454
 
1081
- const result = await callTranscribeAPI(file, { signal: ac.signal, timestamps: flags.timestamps, diarize: flags.diarize });
1455
+ const onProgress = (sent, total) => {
1456
+ if (sent === -1) {
1457
+ const t0 = Date.now();
1458
+ const elapsed = () => { const s = Math.floor((Date.now() - t0) / 1000); return `${Math.floor(s / 60)}:${String(s % 60).padStart(2, '0')}`; };
1459
+ spinner.update(`Transcribing... ${DIM}(${elapsed()})${RESET}`);
1460
+ transcribeTimer = setInterval(() => spinner.update(`Transcribing... ${DIM}(${elapsed()})${RESET}`), 1000);
1461
+ } else {
1462
+ const pct = Math.round((sent / total) * 100);
1463
+ spinner.update(`Uploading ${pct}% (${formatFileSize(sent)} / ${formatFileSize(total)})`);
1464
+ }
1465
+ };
1466
+
1467
+ const result = await callTranscribeAPI(file, { signal: ac.signal, timestamps: flags.timestamps, diarize: flags.diarize, onProgress });
1468
+ if (transcribeTimer) clearInterval(transcribeTimer);
1082
1469
  process.removeListener('SIGINT', abortHandler);
1083
1470
 
1471
+ spinner.stop(`${GREEN}Done${RESET} (${(result.latency / 1000).toFixed(1)}s)`);
1472
+
1084
1473
  if (!result.text) {
1085
1474
  process.stderr.write('No speech detected\n');
1086
1475
  return EXIT_TRANSCRIPTION;
@@ -1110,12 +1499,180 @@ async function runFile(flags) {
1110
1499
 
1111
1500
  return EXIT_OK;
1112
1501
  } catch (err) {
1502
+ if (transcribeTimer) clearInterval(transcribeTimer);
1503
+ spinner.stop();
1504
+
1113
1505
  if (err.name === 'AbortError') {
1114
1506
  process.stderr.write('Aborted\n');
1115
1507
  return EXIT_TRANSCRIPTION;
1116
1508
  }
1117
- process.stderr.write(`Error: ${err.message}\n`);
1509
+
1510
+ const parts = [`Error: ${err.message}`];
1511
+ if (fileSize) parts.push(` File: ${flags.file} (${formatFileSize(fileSize)})`);
1512
+
1513
+ if (err.networkError) {
1514
+ parts.push(' Hint: check your network connection and try again');
1515
+ } else if (err.status === 401) {
1516
+ parts.push(' Hint: invalid API key — run `dikt setup` to reconfigure');
1517
+ } else if (err.status === 413) {
1518
+ parts.push(' Hint: file is too large for the API — try a shorter recording');
1519
+ } else if (err.status === 429) {
1520
+ parts.push(' Hint: rate limited — wait a moment and try again');
1521
+ } else if (err.status >= 500) {
1522
+ parts.push(' Hint: Mistral API server error — try again later');
1523
+ }
1524
+
1525
+ process.stderr.write(parts.join('\n') + '\n');
1118
1526
  return EXIT_TRANSCRIPTION;
1527
+ } finally {
1528
+ for (const f of tempFiles) { try { fs.unlinkSync(f); } catch {} }
1529
+ }
1530
+ }
1531
+
1532
+ async function runFileChunked(flags, { fileSize, duration }) {
1533
+ const spinner = createStderrSpinner();
1534
+ const tempFiles = [];
1535
+ const t0 = Date.now();
1536
+ let progressTimer = null;
1537
+ let abortHandler = null;
1538
+
1539
+ try {
1540
+ // Find optimal split points at silence boundaries
1541
+ const numTargetChunks = Math.ceil(duration / TARGET_CHUNK_SEC);
1542
+ spinner.start('Analyzing audio for split points...');
1543
+
1544
+ const splitPoints = [0];
1545
+ for (let i = 1; i < numTargetChunks; i++) {
1546
+ spinner.update(`Finding split point ${i}/${numTargetChunks - 1}...`);
1547
+ splitPoints.push(await findSilenceSplitPoint(flags.file, i * TARGET_CHUNK_SEC));
1548
+ }
1549
+ splitPoints.push(duration);
1550
+
1551
+ // Merge tiny trailing chunks (< MIN_CHUNK_SEC) into the previous one
1552
+ for (let i = splitPoints.length - 2; i > 0; i--) {
1553
+ if (splitPoints[i + 1] - splitPoints[i] < MIN_CHUNK_SEC) {
1554
+ splitPoints.splice(i, 1);
1555
+ }
1556
+ }
1557
+
1558
+ const numChunks = splitPoints.length - 1;
1559
+
1560
+ // Split audio and compress each chunk
1561
+ const chunkBase = path.join(os.tmpdir(), `dikt-${process.pid}-${Date.now()}`);
1562
+ const uploadPaths = [];
1563
+
1564
+ for (let i = 0; i < numChunks; i++) {
1565
+ spinner.update(`Preparing chunk ${i + 1}/${numChunks}...`);
1566
+ const start = splitPoints[i];
1567
+ const dur = splitPoints[i + 1] - start;
1568
+ const oggPath = `${chunkBase}-${i}.ogg`;
1569
+ try {
1570
+ await execFileAsync('ffmpeg', ['-ss', String(start), '-t', String(dur), '-i', flags.file, '-c:a', 'libopus', '-b:a', '48k', '-y', '-v', 'quiet', oggPath], { stdio: 'pipe' });
1571
+ if (fs.statSync(oggPath).size > 0) {
1572
+ tempFiles.push(oggPath);
1573
+ uploadPaths.push(oggPath);
1574
+ } else { throw new Error('empty output'); }
1575
+ } catch {
1576
+ try { fs.unlinkSync(oggPath); } catch {}
1577
+ const wavPath = `${chunkBase}-${i}.wav`;
1578
+ await execFileAsync('ffmpeg', ['-ss', String(start), '-t', String(dur), '-i', flags.file, '-y', '-v', 'quiet', wavPath], { stdio: 'pipe' });
1579
+ if (!fs.statSync(wavPath).size) throw new Error(`ffmpeg produced empty chunk ${i}`);
1580
+ tempFiles.push(wavPath);
1581
+ uploadPaths.push(wavPath);
1582
+ }
1583
+ }
1584
+
1585
+ const totalUploadSize = uploadPaths.reduce((sum, p) => sum + fs.statSync(p).size, 0);
1586
+ spinner.update(`Compressed → ${formatFileSize(totalUploadSize)} total`);
1587
+
1588
+ // Abort handling
1589
+ const ac = new AbortController();
1590
+ abortHandler = () => { spinner.stop('Aborting...'); ac.abort(); };
1591
+ process.on('SIGINT', abortHandler);
1592
+
1593
+ // Transcribe chunks in parallel
1594
+ let completed = 0;
1595
+ const elapsed = () => {
1596
+ const s = Math.floor((Date.now() - t0) / 1000);
1597
+ return `${Math.floor(s / 60)}:${String(s % 60).padStart(2, '0')}`;
1598
+ };
1599
+ spinner.update(`Transcribing ${numChunks} chunks... ${DIM}(${elapsed()})${RESET}`);
1600
+ progressTimer = setInterval(() => {
1601
+ spinner.update(`Transcribing... ${completed}/${numChunks} done ${DIM}(${elapsed()})${RESET}`);
1602
+ }, 1000);
1603
+
1604
+ const chunkIndices = Array.from({ length: numChunks }, (_, i) => i);
1605
+ const results = await parallelMap(chunkIndices, async (i) => {
1606
+ const uploadPath = uploadPaths[i];
1607
+ const ext = path.extname(uploadPath).slice(1);
1608
+ const blob = await fs.openAsBlob(uploadPath);
1609
+ const file = new File([blob], `chunk-${i}.${ext}`, { type: MIME_TYPES[ext] || 'audio/wav' });
1610
+ const result = await callTranscribeAPI(file, { signal: ac.signal, timestamps: flags.timestamps });
1611
+ completed++;
1612
+ return result;
1613
+ }, MAX_PARALLEL);
1614
+
1615
+ clearInterval(progressTimer); progressTimer = null;
1616
+ process.removeListener('SIGINT', abortHandler); abortHandler = null;
1617
+
1618
+ // Merge results — no overlap, just concatenate text and offset timestamps
1619
+ const merged = mergeChunkResults(results, splitPoints);
1620
+ const totalLatency = Date.now() - t0;
1621
+ spinner.stop(`${GREEN}Done${RESET} (${(totalLatency / 1000).toFixed(1)}s, ${numChunks} chunks)`);
1622
+
1623
+ if (!merged.text) {
1624
+ process.stderr.write('No speech detected\n');
1625
+ return EXIT_TRANSCRIPTION;
1626
+ }
1627
+
1628
+ const wordCount = merged.text.split(/\s+/).filter(Boolean).length;
1629
+
1630
+ let output;
1631
+ if (flags.json) {
1632
+ const out = buildJsonOutput(
1633
+ { text: merged.text, latency: totalLatency, words: wordCount },
1634
+ { segments: merged.segments, words: merged.words, timestamps: flags.timestamps, diarize: false },
1635
+ );
1636
+ output = JSON.stringify(out, null, flags.output ? 2 : 0) + '\n';
1637
+ } else {
1638
+ output = merged.text + '\n';
1639
+ }
1640
+
1641
+ if (flags.output) {
1642
+ fs.writeFileSync(flags.output, output);
1643
+ process.stderr.write(`Saved to ${flags.output}\n`);
1644
+ } else {
1645
+ process.stdout.write(output);
1646
+ }
1647
+
1648
+ return EXIT_OK;
1649
+ } catch (err) {
1650
+ spinner.stop();
1651
+
1652
+ if (err.name === 'AbortError') {
1653
+ process.stderr.write('Aborted\n');
1654
+ return EXIT_TRANSCRIPTION;
1655
+ }
1656
+
1657
+ const parts = [`Error: ${err.message}`];
1658
+ if (fileSize) parts.push(` File: ${flags.file} (${formatFileSize(fileSize)})`);
1659
+
1660
+ if (err.networkError) {
1661
+ parts.push(' Hint: check your network connection and try again');
1662
+ } else if (err.status === 401) {
1663
+ parts.push(' Hint: invalid API key — run `dikt setup` to reconfigure');
1664
+ } else if (err.status === 429) {
1665
+ parts.push(' Hint: rate limited — wait a moment and try again');
1666
+ } else if (err.status >= 500) {
1667
+ parts.push(' Hint: Mistral API server error — try again later');
1668
+ }
1669
+
1670
+ process.stderr.write(parts.join('\n') + '\n');
1671
+ return EXIT_TRANSCRIPTION;
1672
+ } finally {
1673
+ if (progressTimer) clearInterval(progressTimer);
1674
+ if (abortHandler) process.removeListener('SIGINT', abortHandler);
1675
+ for (const f of tempFiles) { try { fs.unlinkSync(f); } catch {} }
1119
1676
  }
1120
1677
  }
1121
1678
 
@@ -1398,7 +1955,7 @@ async function main() {
1398
1955
  language: flagVal(args, '--language', 'e.g. en, de, fr'),
1399
1956
  file: flagVal(args, '--file', 'path to audio file'),
1400
1957
  noNewline: args.includes('--no-newline') || args.includes('-n'),
1401
- timestamps: flagVal(args, '--timestamps', 'segment, word, or segment,word', { valid: ['segment', 'word', 'segment,word'] }),
1958
+ timestamps: flagVal(args, '--timestamps', 'segment or word', { valid: ['segment', 'word'] }),
1402
1959
  diarize: args.includes('--diarize'),
1403
1960
  output: flagVal(args, '--output', 'path') || flagVal(args, '-o', 'path'),
1404
1961
  };
@@ -1468,13 +2025,13 @@ Options:
1468
2025
  --json Record once, output JSON to stdout
1469
2026
  -q, --quiet Record once, print transcript to stdout
1470
2027
  --stream Stream transcription chunks on pauses
1471
- --file <path> Transcribe an audio file (no mic needed)
2028
+ --file <path|url> Transcribe audio file or URL (via yt-dlp)
1472
2029
  -o, --output <path> Write output to file (.json auto-enables JSON)
1473
2030
  --silence <seconds> Silence duration before auto-stop (default: 2.0)
1474
2031
  --pause <seconds> Pause duration to split chunks (default: 1.0)
1475
2032
  --language <code> Language code, e.g. en, de, fr (default: auto)
1476
2033
  -n, --no-newline Join stream chunks without newlines
1477
- --timestamps <granularity> Add timestamps: segment, word, or segment,word
2034
+ --timestamps <granularity> Add timestamps: segment or word
1478
2035
  --diarize Enable speaker identification
1479
2036
  --no-input Fail if config is missing (no wizard)
1480
2037
  --no-color Disable colored output
@@ -1501,6 +2058,7 @@ Examples:
1501
2058
  dikt --file meeting.wav Transcribe an existing audio file
1502
2059
  dikt --file a.wav -o a.json Transcribe to a JSON file
1503
2060
  dikt --file a.wav -o a.txt Transcribe to a text file
2061
+ dikt --file https://youtube.com/watch?v=ID Transcribe from URL
1504
2062
  dikt --stream --silence 0 Stream continuously until Ctrl+C
1505
2063
  dikt --stream -n Stream as continuous flowing text
1506
2064
  dikt -q --json --diarize Transcribe with speaker labels
@@ -1514,13 +2072,13 @@ Environment variables:
1514
2072
 
1515
2073
  Exit codes:
1516
2074
  0 Success
1517
- 1 Missing dependency (sox)
2075
+ 1 Missing dependency (sox/ffmpeg)
1518
2076
  2 Not a terminal
1519
2077
  3 Configuration error
1520
2078
  4 Transcription error
1521
2079
 
1522
2080
  Config: ${CONFIG_DIR}/config.json
1523
- Requires: sox (brew install sox)`);
2081
+ Requires: sox (recording), ffmpeg (--file optimization), yt-dlp (URLs, optional)`);
1524
2082
  process.exit(EXIT_OK);
1525
2083
  }
1526
2084
 
@@ -1542,7 +2100,10 @@ Requires: sox (brew install sox)`);
1542
2100
 
1543
2101
  applyEnvOverrides(config);
1544
2102
  if (flags.language) config.language = flags.language;
1545
- if (!flags.timestamps && config.timestamps) flags.timestamps = config.timestamps;
2103
+ if (!flags.timestamps && config.timestamps) {
2104
+ // Migrate legacy 'segment,word' → 'word' (combined option removed)
2105
+ flags.timestamps = config.timestamps === 'segment,word' ? 'word' : config.timestamps;
2106
+ }
1546
2107
  if (!flags.diarize && config.diarize) flags.diarize = true;
1547
2108
  if (flags.output && flags.output.endsWith('.json')) flags.json = true;
1548
2109
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "dikt",
3
- "version": "1.2.0",
3
+ "version": "1.4.0",
4
4
  "description": "Voice dictation for the terminal.",
5
5
  "type": "module",
6
6
  "bin": {