dikt 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +13 -4
  2. package/cli.mjs +582 -30
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -23,6 +23,11 @@ sudo apt install sox
23
23
  sudo pacman -S sox
24
24
  ```
25
25
 
26
+ Optional dependencies for `--file` mode:
27
+
28
+ - [ffmpeg](https://ffmpeg.org/) — enables compression, chunked transcription of long files, and broader format support
29
+ - [yt-dlp](https://github.com/yt-dlp/yt-dlp) — enables transcribing audio from URLs (YouTube, podcasts, etc.)
30
+
26
31
  ## Setup
27
32
 
28
33
  On first run, dikt will prompt you for your Mistral API key and model preferences:
@@ -90,7 +95,7 @@ dikt --stream --silence 0
90
95
 
91
96
  ### File mode
92
97
 
93
- Transcribe an existing audio file (wav, mp3, m4a, flac, ogg, webm no sox needed):
98
+ Transcribe an existing audio file (wav, mp3, m4a, flac, ogg, webm, aac, wma, and more):
94
99
 
95
100
  ```bash
96
101
  dikt --file meeting.wav
@@ -101,6 +106,10 @@ dikt --file meeting.wav -o transcript.txt
101
106
 
102
107
  # With JSON output
103
108
  dikt --file recording.mp3 --json
109
+
110
+ # Transcribe from a URL (requires yt-dlp)
111
+ dikt --file https://youtube.com/watch?v=VIDEO_ID
112
+ dikt --file https://youtube.com/watch?v=VIDEO_ID -o transcript.txt
104
113
  ```
105
114
 
106
115
  ### Speaker identification & timestamps
@@ -112,7 +121,7 @@ dikt -q --diarize
112
121
  # Timestamps
113
122
  dikt -q --timestamps segment
114
123
  dikt -q --timestamps word
115
- dikt -q --timestamps segment,word
124
+ dikt --file lecture.mp3 --timestamps segment
116
125
 
117
126
  # Combined with JSON
118
127
  dikt -q --json --diarize
@@ -122,7 +131,7 @@ dikt -q --json --diarize
122
131
 
123
132
  | Flag | Description |
124
133
  |---|---|
125
- | `--file <path>` | Transcribe an audio file (no mic needed) |
134
+ | `--file <path\|url>` | Transcribe audio file or URL (via yt-dlp) |
126
135
  | `-o`, `--output <path>` | Write output to file (`.json` auto-enables JSON) |
127
136
  | `--stream` | Stream transcription chunks on pauses |
128
137
  | `--json` | Output JSON (single-shot or stream) |
@@ -130,7 +139,7 @@ dikt -q --json --diarize
130
139
  | `--silence <seconds>` | Silence duration before auto-stop (default: 2.0) |
131
140
  | `--pause <seconds>` | Pause duration to split stream chunks (default: 1.0) |
132
141
  | `--language <code>` | Language code, e.g. en, de, fr (default: auto) |
133
- | `--timestamps <granularity>` | Add timestamps: segment, word, or segment,word |
142
+ | `--timestamps <granularity>` | Add timestamps: segment or word |
134
143
  | `--diarize` | Enable speaker identification |
135
144
  | `-n`, `--no-newline` | Join stream chunks without newlines |
136
145
  | `--no-color` | Disable colored output |
package/cli.mjs CHANGED
@@ -6,7 +6,10 @@ import fs from 'node:fs';
6
6
  import path from 'node:path';
7
7
  import os from 'node:os';
8
8
  import readline from 'node:readline';
9
- import { spawn, execFileSync } from 'node:child_process';
9
+ import { spawn, execFileSync, execFile as execFileCb } from 'node:child_process';
10
+ import { promisify } from 'node:util';
11
+ const execFileAsync = promisify(execFileCb);
12
+ import https from 'node:https';
10
13
 
11
14
  // ── ANSI helpers ──────────────────────────────────────────────────────────────
12
15
 
@@ -37,9 +40,16 @@ if (process.env.NO_COLOR != null || process.env.TERM === 'dumb' || process.argv.
37
40
 
38
41
  const moveTo = (row, col = 1) => `${ESC}${row};${col}H`;
39
42
 
43
+ function formatFileSize(bytes) {
44
+ if (bytes < 1024) return `${bytes} B`;
45
+ if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
46
+ if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
47
+ return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
48
+ }
49
+
40
50
  // ── Constants ─────────────────────────────────────────────────────────────────
41
51
 
42
- const VERSION = '1.2.0';
52
+ const VERSION = '1.3.0';
43
53
  const CONFIG_BASE = process.env.XDG_CONFIG_HOME || path.join(os.homedir(), '.config');
44
54
  const CONFIG_DIR = path.join(CONFIG_BASE, 'dikt');
45
55
  const CONFIG_FILE = path.join(CONFIG_DIR, 'config.json');
@@ -47,6 +57,56 @@ const MAX_HISTORY = 10;
47
57
  const MIN_RECORDING_MS = 500;
48
58
  const COST_PER_MIN = 0.003;
49
59
  const SPINNER = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
60
+ const TARGET_CHUNK_SEC = 270; // ~4.5 min target chunk size
61
+ const CHUNK_MIN_SEC = 360; // only chunk files longer than 6 minutes
62
+ const SPLIT_SEARCH_SEC = 30; // search ±30s around target for silence split point
63
+ const MIN_CHUNK_SEC = 30; // merge chunks shorter than this into neighbor
64
+ const MAX_PARALLEL = 4; // max concurrent API requests
65
+ const MIME_TYPES = { wav: 'audio/wav', mp3: 'audio/mpeg', ogg: 'audio/ogg', flac: 'audio/flac', opus: 'audio/ogg', webm: 'audio/webm', m4a: 'audio/mp4', aac: 'audio/aac', wma: 'audio/x-ms-wma', aif: 'audio/aiff', aiff: 'audio/aiff', mp4: 'audio/mp4', oga: 'audio/ogg', amr: 'audio/amr', caf: 'audio/x-caf' };
66
+ const COMPRESSIBLE = new Set(['wav', 'flac', 'aiff', 'aif', 'raw', 'caf']); // lossless formats worth re-encoding
67
+
68
+ function createStderrSpinner() {
69
+ let frame = 0;
70
+ let interval = null;
71
+ let currentMsg = '';
72
+ const isTTY = process.stderr.isTTY;
73
+ const render = () => {
74
+ const sp = SPINNER[frame++ % SPINNER.length];
75
+ process.stderr.write(`\r${CLEAR_LINE}${YELLOW}${sp}${RESET} ${currentMsg}`);
76
+ };
77
+
78
+ return {
79
+ start(msg) {
80
+ currentMsg = msg;
81
+ if (isTTY) {
82
+ render();
83
+ interval = setInterval(render, 80);
84
+ } else {
85
+ process.stderr.write(`${currentMsg}\n`);
86
+ }
87
+ },
88
+ update(msg) {
89
+ currentMsg = msg;
90
+ if (isTTY) {
91
+ // Restart interval — prevents queued callbacks from firing after sync calls
92
+ if (interval) { clearInterval(interval); }
93
+ render();
94
+ interval = setInterval(render, 80);
95
+ } else {
96
+ process.stderr.write(`${msg}\n`);
97
+ }
98
+ },
99
+ stop(finalMsg) {
100
+ if (interval) { clearInterval(interval); interval = null; }
101
+ if (isTTY) {
102
+ process.stderr.write(`\r${CLEAR_LINE}`);
103
+ if (finalMsg) process.stderr.write(`${finalMsg}\n`);
104
+ } else if (finalMsg) {
105
+ process.stderr.write(`${finalMsg}\n`);
106
+ }
107
+ },
108
+ };
109
+ }
50
110
 
51
111
  const EXIT_OK = 0;
52
112
  const EXIT_DEPENDENCY = 1;
@@ -93,8 +153,8 @@ function validateConfig(cfg) {
93
153
 
94
154
  // ── Setup wizard (form-based) ─────────────────────────────────────────────────
95
155
 
96
- const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word', 'segment,word': 'both' };
97
- const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word', 'both': 'segment,word' };
156
+ const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word' };
157
+ const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word' };
98
158
 
99
159
  async function setupWizard() {
100
160
  const existing = loadConfig() || {};
@@ -105,7 +165,7 @@ async function setupWizard() {
105
165
  { key: 'language', label: 'Language', type: 'text', value: '', display: existing.language || 'auto', fallback: existing.language || '' },
106
166
  { key: 'temperature', label: 'Temperature', type: 'text', value: '', display: existing.temperature != null ? String(existing.temperature) : 'default', fallback: existing.temperature != null ? String(existing.temperature) : '' },
107
167
  { key: 'contextBias', label: 'Context bias', type: 'text', value: '', display: existing.contextBias || '', fallback: existing.contextBias || '' },
108
- { key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word', 'both'], idx: ['off', 'segment', 'word', 'both'].indexOf(TIMESTAMPS_DISPLAY[existing.timestamps || ''] || 'off') },
168
+ { key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word'], idx: ['off', 'segment', 'word'].indexOf(TIMESTAMPS_DISPLAY[existing.timestamps || ''] || 'off') },
109
169
  { key: 'diarize', label: 'Diarize', type: 'select', options: ['off', 'on'], idx: existing.diarize ? 1 : 0 },
110
170
  ];
111
171
 
@@ -953,7 +1013,7 @@ function trimSilence(rawData) {
953
1013
  return Buffer.concat(output);
954
1014
  }
955
1015
 
956
- async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
1016
+ async function callTranscribeAPI(file, { signal, timestamps, diarize, onProgress } = {}) {
957
1017
  const fd = new FormData();
958
1018
  fd.append('file', file);
959
1019
  fd.append('model', config.model);
@@ -961,7 +1021,7 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
961
1021
  if (config.temperature != null) fd.append('temperature', String(config.temperature));
962
1022
  if (config.contextBias) fd.append('context_bias', config.contextBias);
963
1023
  if (timestamps) {
964
- for (const g of timestamps.split(',')) fd.append('timestamp_granularities[]', g.trim());
1024
+ fd.append('timestamp_granularities[]', timestamps);
965
1025
  }
966
1026
  if (diarize) {
967
1027
  fd.append('diarize', 'true');
@@ -969,17 +1029,68 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
969
1029
  if (!timestamps) fd.append('timestamp_granularities[]', 'segment');
970
1030
  }
971
1031
 
972
- const t0 = Date.now();
973
- const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
1032
+ // Use Request to serialize FormData into multipart body,
1033
+ // then send via node:https which has no hardcoded headersTimeout
1034
+ // (Node's built-in fetch/undici has a 300s headersTimeout that
1035
+ // cannot be configured without importing undici as a dependency).
1036
+ const req = new Request('https://api.mistral.ai/v1/audio/transcriptions', {
974
1037
  method: 'POST',
975
1038
  headers: { Authorization: `Bearer ${config.apiKey}` },
976
1039
  body: fd,
977
- signal: signal || AbortSignal.timeout(30_000),
1040
+ });
1041
+ const contentType = req.headers.get('content-type');
1042
+ const body = Buffer.from(await req.arrayBuffer());
1043
+
1044
+ const t0 = Date.now();
1045
+ const { status, raw } = await new Promise((resolve, reject) => {
1046
+ const hreq = https.request('https://api.mistral.ai/v1/audio/transcriptions', {
1047
+ method: 'POST',
1048
+ headers: {
1049
+ 'Authorization': `Bearer ${config.apiKey}`,
1050
+ 'Content-Type': contentType,
1051
+ 'Content-Length': body.length,
1052
+ },
1053
+ }, (res) => {
1054
+ const chunks = [];
1055
+ res.on('data', (c) => chunks.push(c));
1056
+ res.on('end', () => resolve({ status: res.statusCode, raw: Buffer.concat(chunks).toString() }));
1057
+ res.on('error', reject);
1058
+ });
1059
+
1060
+ hreq.on('error', (err) => {
1061
+ const ne = new Error(`Network error: ${err.message}`);
1062
+ ne.networkError = true;
1063
+ reject(ne);
1064
+ });
1065
+
1066
+ const abortSig = signal || AbortSignal.timeout(30_000);
1067
+ if (abortSig.aborted) { hreq.destroy(); reject(new DOMException('The operation was aborted', 'AbortError')); return; }
1068
+ abortSig.addEventListener('abort', () => {
1069
+ hreq.destroy();
1070
+ reject(abortSig.reason instanceof DOMException ? abortSig.reason
1071
+ : new DOMException('The operation was aborted', 'AbortError'));
1072
+ }, { once: true });
1073
+
1074
+ // Write body in chunks to enable upload progress tracking
1075
+ const CHUNK_SIZE = 256 * 1024;
1076
+ let written = 0;
1077
+ const total = body.length;
1078
+ const writeChunks = () => {
1079
+ while (written < total) {
1080
+ const end = Math.min(written + CHUNK_SIZE, total);
1081
+ const ok = hreq.write(body.subarray(written, end));
1082
+ written = end;
1083
+ if (onProgress) onProgress(written, total);
1084
+ if (!ok) { hreq.once('drain', writeChunks); return; }
1085
+ }
1086
+ if (onProgress) onProgress(-1, total); // upload done, server processing
1087
+ hreq.end();
1088
+ };
1089
+ writeChunks();
978
1090
  });
979
1091
  const latency = Date.now() - t0;
980
1092
 
981
- if (!resp.ok) {
982
- const raw = await resp.text().catch(() => '');
1093
+ if (status < 200 || status >= 300) {
983
1094
  let msg;
984
1095
  try {
985
1096
  const e = JSON.parse(raw);
@@ -992,14 +1103,14 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
992
1103
  }
993
1104
  if (!msg) msg = raw;
994
1105
  } catch {
995
- msg = raw || `HTTP ${resp.status}`;
1106
+ msg = raw || `HTTP ${status}`;
996
1107
  }
997
1108
  const err = new Error(msg);
998
- err.status = resp.status;
1109
+ err.status = status;
999
1110
  throw err;
1000
1111
  }
1001
1112
 
1002
- const data = await resp.json();
1113
+ const data = JSON.parse(raw);
1003
1114
  const text = (data.text || '').trim();
1004
1115
  return { text, latency, segments: data.segments, words: data.words };
1005
1116
  }
@@ -1059,28 +1170,297 @@ function buildJsonOutput(base, { segments, words, timestamps, diarize } = {}) {
1059
1170
  return out;
1060
1171
  }
1061
1172
 
1173
+ // ── File optimization helpers ────────────────────────────────────────────────
1174
+
1175
+ let _ffmpegAvail;
1176
+ function ffmpegAvailable() {
1177
+ if (_ffmpegAvail !== undefined) return _ffmpegAvail;
1178
+ try {
1179
+ execFileSync('ffmpeg', ['-version'], { stdio: 'pipe' });
1180
+ execFileSync('ffprobe', ['-version'], { stdio: 'pipe' });
1181
+ _ffmpegAvail = true;
1182
+ } catch { _ffmpegAvail = false; }
1183
+ return _ffmpegAvail;
1184
+ }
1185
+
1186
+ let _ytdlpAvail;
1187
+ function ytdlpAvailable() {
1188
+ if (_ytdlpAvail !== undefined) return _ytdlpAvail;
1189
+ try { execFileSync('yt-dlp', ['--version'], { stdio: 'pipe' }); _ytdlpAvail = true; }
1190
+ catch { _ytdlpAvail = false; }
1191
+ return _ytdlpAvail;
1192
+ }
1193
+
1194
+ function downloadWithYtdlp(url, spinner) {
1195
+ const tmpBase = path.join(os.tmpdir(), `dikt-ytdlp-${process.pid}-${Date.now()}`);
1196
+ const outTemplate = `${tmpBase}.%(ext)s`;
1197
+
1198
+ return new Promise((resolve, reject) => {
1199
+ const proc = spawn('yt-dlp', [
1200
+ '-x', '--audio-format', 'opus', '--audio-quality', '48K',
1201
+ '-o', outTemplate, '--no-playlist', '--newline', url,
1202
+ ], { stdio: ['ignore', 'pipe', 'pipe'] });
1203
+
1204
+ const cleanupPartial = () => {
1205
+ const dir = path.dirname(tmpBase);
1206
+ const prefix = path.basename(tmpBase);
1207
+ try {
1208
+ for (const f of fs.readdirSync(dir)) {
1209
+ if (f.startsWith(prefix) && f.length > prefix.length) try { fs.unlinkSync(path.join(dir, f)); } catch {}
1210
+ }
1211
+ } catch {}
1212
+ };
1213
+
1214
+ let aborted = false;
1215
+ const onSigint = () => { aborted = true; proc.kill(); };
1216
+ process.on('SIGINT', onSigint);
1217
+
1218
+ let lastErr = '';
1219
+ const parseOutput = (chunk) => {
1220
+ const lines = chunk.toString().split('\n');
1221
+ for (const line of lines) {
1222
+ if (!line.trim()) continue;
1223
+ const dl = line.match(/\[download\]\s+([\d.]+)%/);
1224
+ if (dl) { spinner.update(`Downloading... ${Math.round(parseFloat(dl[1]))}%`); continue; }
1225
+ if (/\[ExtractAudio\]/.test(line)) { spinner.update('Converting audio...'); continue; }
1226
+ if (/\[download\]\s+Destination:/.test(line)) { spinner.update('Downloading...'); continue; }
1227
+ }
1228
+ };
1229
+ proc.stdout.on('data', parseOutput);
1230
+ proc.stderr.on('data', (chunk) => {
1231
+ lastErr = chunk.toString().trim().split('\n').pop();
1232
+ parseOutput(chunk);
1233
+ });
1234
+
1235
+ proc.on('close', (code) => {
1236
+ process.removeListener('SIGINT', onSigint);
1237
+ if (aborted) { cleanupPartial(); return reject(new Error('Download aborted')); }
1238
+ if (code !== 0) { cleanupPartial(); return reject(new Error(lastErr || `yt-dlp exited with code ${code}`)); }
1239
+ // yt-dlp may produce a different extension than requested; find the actual file
1240
+ const dir = path.dirname(tmpBase);
1241
+ const prefix = path.basename(tmpBase);
1242
+ try {
1243
+ const match = fs.readdirSync(dir).find(f => f.startsWith(prefix) && f.length > prefix.length);
1244
+ if (!match) return reject(new Error('yt-dlp produced no output file'));
1245
+ resolve(path.join(dir, match));
1246
+ } catch (err) { reject(err); }
1247
+ });
1248
+ });
1249
+ }
1250
+
1251
+ function getAudioDuration(filePath) {
1252
+ try {
1253
+ const out = execFileSync('ffprobe', ['-v', 'quiet', '-show_entries', 'format=duration', '-of', 'csv=p=0', filePath], { stdio: 'pipe', encoding: 'utf8' });
1254
+ return parseFloat(out.trim()) || 0;
1255
+ } catch { return 0; }
1256
+ }
1257
+
1258
+ async function compressAudio(inputPath) {
1259
+ const base = path.join(os.tmpdir(), `dikt-${process.pid}-${Date.now()}-${path.basename(inputPath, path.extname(inputPath))}`);
1260
+ for (const codec of ['libopus', 'libvorbis']) {
1261
+ const outPath = `${base}.ogg`;
1262
+ try {
1263
+ await execFileAsync('ffmpeg', ['-i', inputPath, '-c:a', codec, '-b:a', '48k', '-y', '-v', 'quiet', outPath], { stdio: 'pipe' });
1264
+ if (fs.statSync(outPath).size > 0) return outPath;
1265
+ try { fs.unlinkSync(outPath); } catch {}
1266
+ } catch { try { fs.unlinkSync(outPath); } catch {} }
1267
+ }
1268
+ return null;
1269
+ }
1270
+
1271
+ async function findSilenceSplitPoint(filePath, targetSec) {
1272
+ const startSec = Math.max(0, targetSec - SPLIT_SEARCH_SEC);
1273
+ const durSec = SPLIT_SEARCH_SEC * 2;
1274
+
1275
+ try {
1276
+ // Extract a small window of raw PCM around the target for silence analysis
1277
+ const { stdout: raw } = await execFileAsync('ffmpeg', [
1278
+ '-ss', String(startSec), '-t', String(durSec), '-i', filePath,
1279
+ '-f', 's16le', '-ar', '16000', '-ac', '1', '-v', 'quiet', '-',
1280
+ ], { encoding: 'buffer', maxBuffer: 16000 * 2 * durSec + 4096 });
1281
+
1282
+ // Scan for silence in 50ms windows
1283
+ const WINDOW_BYTES = Math.round(16000 * 0.05) * 2; // 50ms at 16kHz 16-bit mono
1284
+ let bestOffset = -1, bestLen = 0;
1285
+ let runStart = -1, runLen = 0;
1286
+
1287
+ for (let offset = 0; offset + WINDOW_BYTES <= raw.length; offset += WINDOW_BYTES) {
1288
+ const peak = peakAmplitude(raw.subarray(offset, offset + WINDOW_BYTES));
1289
+ if (peak < SILENCE_THRESHOLD) {
1290
+ if (runStart === -1) runStart = offset;
1291
+ runLen++;
1292
+ } else {
1293
+ if (runLen > bestLen) { bestOffset = runStart; bestLen = runLen; }
1294
+ runStart = -1; runLen = 0;
1295
+ }
1296
+ }
1297
+ if (runLen > bestLen) { bestOffset = runStart; bestLen = runLen; }
1298
+
1299
+ if (bestLen >= 10) { // at least 500ms of silence (avoids mid-word splits)
1300
+ const centerBytes = bestOffset + Math.floor(bestLen / 2) * WINDOW_BYTES;
1301
+ return startSec + centerBytes / (16000 * 2);
1302
+ }
1303
+ } catch {}
1304
+
1305
+ return targetSec; // fallback: no silence found, split at target
1306
+ }
1307
+
1308
+ function cleanChunkText(t) {
1309
+ if (!t) return '';
1310
+ // Strip [PRINT_WORDLEVEL_TIME] markup the API sometimes spontaneously returns
1311
+ if (t.includes('[PRINT_WORDLEVEL_TIME]')) {
1312
+ t = t.replace(/\[PRINT_WORDLEVEL_TIME\]/g, '');
1313
+ t = t.replace(/<\/?\d{2}:\d{2}\.\d+>/g, '');
1314
+ t = t.replace(/\s+/g, ' ');
1315
+ }
1316
+ return t.trim();
1317
+ }
1318
+
1319
+ function mergeChunkResults(results, splitPoints) {
1320
+ // No overlap — just concatenate text, offset timestamps
1321
+ let text = results.map(r => cleanChunkText(r.text)).filter(Boolean).join(' ');
1322
+ // Fix missing spaces after punctuation (API omits leading spaces on some segments)
1323
+ text = text.replace(/([.!?,])([A-Za-z])/g, '$1 $2');
1324
+ let maxLatency = 0;
1325
+ const allSegments = [];
1326
+ const allWords = [];
1327
+
1328
+ const round1 = (n) => Math.round(n * 10) / 10;
1329
+ for (let i = 0; i < results.length; i++) {
1330
+ const r = results[i];
1331
+ const offset = splitPoints[i];
1332
+ if (r.latency > maxLatency) maxLatency = r.latency;
1333
+
1334
+ if (r.segments) {
1335
+ for (const seg of r.segments) {
1336
+ allSegments.push({ ...seg, start: round1(seg.start + offset), end: round1(seg.end + offset) });
1337
+ }
1338
+ }
1339
+ if (r.words) {
1340
+ for (const w of r.words) {
1341
+ allWords.push({ ...w, start: round1(w.start + offset), end: round1(w.end + offset) });
1342
+ }
1343
+ }
1344
+ }
1345
+
1346
+ return {
1347
+ text,
1348
+ latency: maxLatency,
1349
+ segments: allSegments.length ? allSegments : undefined,
1350
+ words: allWords.length ? allWords : undefined,
1351
+ };
1352
+ }
1353
+
1354
+ async function parallelMap(items, fn, concurrency) {
1355
+ const results = new Array(items.length);
1356
+ let next = 0;
1357
+ const worker = async () => { while (next < items.length) { const i = next++; results[i] = await fn(items[i], i); } };
1358
+ await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, () => worker()));
1359
+ return results;
1360
+ }
1361
+
1062
1362
  // ── File mode ────────────────────────────────────────────────────────────────
1063
1363
 
1064
1364
  async function runFile(flags) {
1365
+ const spinner = createStderrSpinner();
1366
+ let fileSize = 0;
1367
+ let transcribeTimer = null;
1368
+ const tempFiles = [];
1369
+
1065
1370
  try {
1066
- if (!flags.file || !fs.existsSync(flags.file)) {
1371
+ const isURL = /^https?:\/\//i.test(flags.file);
1372
+
1373
+ if (isURL) {
1374
+ if (!ytdlpAvailable()) {
1375
+ process.stderr.write(`\n${RED}${BOLD} yt-dlp not found.${RESET}\n\n`);
1376
+ process.stderr.write(` yt-dlp is required to download audio from URLs. Install it:\n\n`);
1377
+ if (process.platform === 'darwin') {
1378
+ process.stderr.write(` ${BOLD}brew install yt-dlp${RESET}\n\n`);
1379
+ } else if (process.platform === 'win32') {
1380
+ process.stderr.write(` ${BOLD}choco install yt-dlp${RESET} or ${BOLD}scoop install yt-dlp${RESET}\n\n`);
1381
+ } else {
1382
+ process.stderr.write(` ${BOLD}sudo apt install yt-dlp${RESET} (Debian/Ubuntu)\n`);
1383
+ process.stderr.write(` ${BOLD}pip install yt-dlp${RESET} (any platform)\n\n`);
1384
+ }
1385
+ return EXIT_DEPENDENCY;
1386
+ }
1387
+ spinner.start('Downloading audio...');
1388
+ try {
1389
+ const downloaded = await downloadWithYtdlp(flags.file, spinner);
1390
+ tempFiles.push(downloaded);
1391
+ flags = { ...flags, file: downloaded };
1392
+ } catch (err) {
1393
+ spinner.stop();
1394
+ process.stderr.write(`Error downloading: ${err.message}\n`);
1395
+ return EXIT_TRANSCRIPTION;
1396
+ }
1397
+ spinner.update('Processing audio...');
1398
+ } else if (!flags.file || !fs.existsSync(flags.file)) {
1067
1399
  process.stderr.write(`Error: file not found: ${flags.file}\n`);
1068
1400
  return EXIT_TRANSCRIPTION;
1401
+ } else {
1402
+ spinner.start('Reading file...');
1403
+ }
1404
+ fileSize = fs.statSync(flags.file).size;
1405
+ const ext = path.extname(flags.file).slice(1).toLowerCase() || 'wav';
1406
+
1407
+ // Check if ffmpeg is available for chunking / compression optimizations
1408
+ const hasFFmpeg = ffmpegAvailable();
1409
+ const duration = hasFFmpeg ? getAudioDuration(flags.file) : 0;
1410
+ const canChunk = hasFFmpeg && !flags.diarize && duration > CHUNK_MIN_SEC;
1411
+
1412
+ if (canChunk) {
1413
+ spinner.stop();
1414
+ return await runFileChunked(flags, { fileSize, duration });
1415
+ }
1416
+
1417
+ // Compress uncompressed formats (wav/flac → ogg) for faster upload
1418
+ let uploadPath = flags.file;
1419
+ let uploadExt = ext;
1420
+ if (hasFFmpeg && COMPRESSIBLE.has(ext)) {
1421
+ spinner.update('Compressing...');
1422
+ const compressed = await compressAudio(flags.file);
1423
+ if (compressed) {
1424
+ const newSize = fs.statSync(compressed).size;
1425
+ if (newSize < fileSize) {
1426
+ tempFiles.push(compressed);
1427
+ uploadPath = compressed;
1428
+ uploadExt = path.extname(compressed).slice(1);
1429
+ spinner.update(`Compressed ${formatFileSize(fileSize)} → ${formatFileSize(newSize)}`);
1430
+ } else {
1431
+ try { fs.unlinkSync(compressed); } catch {}
1432
+ }
1433
+ }
1069
1434
  }
1070
1435
 
1071
- const blob = await fs.openAsBlob(flags.file);
1072
- const ext = path.extname(flags.file).slice(1) || 'wav';
1073
- const mimeTypes = { wav: 'audio/wav', mp3: 'audio/mpeg', m4a: 'audio/mp4', ogg: 'audio/ogg', flac: 'audio/flac', webm: 'audio/webm' };
1074
- const mime = mimeTypes[ext] || 'audio/wav';
1075
- const file = new File([blob], path.basename(flags.file), { type: mime });
1436
+ const blob = await fs.openAsBlob(uploadPath);
1437
+ const mime = MIME_TYPES[uploadExt] || 'application/octet-stream';
1438
+ const file = new File([blob], path.basename(uploadPath), { type: mime });
1439
+
1440
+ spinner.update(`Uploading to API... (${formatFileSize(blob.size)})`);
1076
1441
 
1077
1442
  const ac = new AbortController();
1078
- const abortHandler = () => ac.abort();
1443
+ const abortHandler = () => { spinner.stop('Aborting...'); ac.abort(); };
1079
1444
  process.on('SIGINT', abortHandler);
1080
1445
 
1081
- const result = await callTranscribeAPI(file, { signal: ac.signal, timestamps: flags.timestamps, diarize: flags.diarize });
1446
+ const onProgress = (sent, total) => {
1447
+ if (sent === -1) {
1448
+ const t0 = Date.now();
1449
+ const elapsed = () => { const s = Math.floor((Date.now() - t0) / 1000); return `${Math.floor(s / 60)}:${String(s % 60).padStart(2, '0')}`; };
1450
+ spinner.update(`Transcribing... ${DIM}(${elapsed()})${RESET}`);
1451
+ transcribeTimer = setInterval(() => spinner.update(`Transcribing... ${DIM}(${elapsed()})${RESET}`), 1000);
1452
+ } else {
1453
+ const pct = Math.round((sent / total) * 100);
1454
+ spinner.update(`Uploading ${pct}% (${formatFileSize(sent)} / ${formatFileSize(total)})`);
1455
+ }
1456
+ };
1457
+
1458
+ const result = await callTranscribeAPI(file, { signal: ac.signal, timestamps: flags.timestamps, diarize: flags.diarize, onProgress });
1459
+ if (transcribeTimer) clearInterval(transcribeTimer);
1082
1460
  process.removeListener('SIGINT', abortHandler);
1083
1461
 
1462
+ spinner.stop(`${GREEN}Done${RESET} (${(result.latency / 1000).toFixed(1)}s)`);
1463
+
1084
1464
  if (!result.text) {
1085
1465
  process.stderr.write('No speech detected\n');
1086
1466
  return EXIT_TRANSCRIPTION;
@@ -1110,12 +1490,180 @@ async function runFile(flags) {
1110
1490
 
1111
1491
  return EXIT_OK;
1112
1492
  } catch (err) {
1493
+ if (transcribeTimer) clearInterval(transcribeTimer);
1494
+ spinner.stop();
1495
+
1113
1496
  if (err.name === 'AbortError') {
1114
1497
  process.stderr.write('Aborted\n');
1115
1498
  return EXIT_TRANSCRIPTION;
1116
1499
  }
1117
- process.stderr.write(`Error: ${err.message}\n`);
1500
+
1501
+ const parts = [`Error: ${err.message}`];
1502
+ if (fileSize) parts.push(` File: ${flags.file} (${formatFileSize(fileSize)})`);
1503
+
1504
+ if (err.networkError) {
1505
+ parts.push(' Hint: check your network connection and try again');
1506
+ } else if (err.status === 401) {
1507
+ parts.push(' Hint: invalid API key — run `dikt setup` to reconfigure');
1508
+ } else if (err.status === 413) {
1509
+ parts.push(' Hint: file is too large for the API — try a shorter recording');
1510
+ } else if (err.status === 429) {
1511
+ parts.push(' Hint: rate limited — wait a moment and try again');
1512
+ } else if (err.status >= 500) {
1513
+ parts.push(' Hint: Mistral API server error — try again later');
1514
+ }
1515
+
1516
+ process.stderr.write(parts.join('\n') + '\n');
1517
+ return EXIT_TRANSCRIPTION;
1518
+ } finally {
1519
+ for (const f of tempFiles) { try { fs.unlinkSync(f); } catch {} }
1520
+ }
1521
+ }
1522
+
1523
+ async function runFileChunked(flags, { fileSize, duration }) {
1524
+ const spinner = createStderrSpinner();
1525
+ const tempFiles = [];
1526
+ const t0 = Date.now();
1527
+ let progressTimer = null;
1528
+ let abortHandler = null;
1529
+
1530
+ try {
1531
+ // Find optimal split points at silence boundaries
1532
+ const numTargetChunks = Math.ceil(duration / TARGET_CHUNK_SEC);
1533
+ spinner.start('Analyzing audio for split points...');
1534
+
1535
+ const splitPoints = [0];
1536
+ for (let i = 1; i < numTargetChunks; i++) {
1537
+ spinner.update(`Finding split point ${i}/${numTargetChunks - 1}...`);
1538
+ splitPoints.push(await findSilenceSplitPoint(flags.file, i * TARGET_CHUNK_SEC));
1539
+ }
1540
+ splitPoints.push(duration);
1541
+
1542
+ // Merge tiny trailing chunks (< MIN_CHUNK_SEC) into the previous one
1543
+ for (let i = splitPoints.length - 2; i > 0; i--) {
1544
+ if (splitPoints[i + 1] - splitPoints[i] < MIN_CHUNK_SEC) {
1545
+ splitPoints.splice(i, 1);
1546
+ }
1547
+ }
1548
+
1549
+ const numChunks = splitPoints.length - 1;
1550
+
1551
+ // Split audio and compress each chunk
1552
+ const chunkBase = path.join(os.tmpdir(), `dikt-${process.pid}-${Date.now()}`);
1553
+ const uploadPaths = [];
1554
+
1555
+ for (let i = 0; i < numChunks; i++) {
1556
+ spinner.update(`Preparing chunk ${i + 1}/${numChunks}...`);
1557
+ const start = splitPoints[i];
1558
+ const dur = splitPoints[i + 1] - start;
1559
+ const oggPath = `${chunkBase}-${i}.ogg`;
1560
+ try {
1561
+ await execFileAsync('ffmpeg', ['-ss', String(start), '-t', String(dur), '-i', flags.file, '-c:a', 'libopus', '-b:a', '48k', '-y', '-v', 'quiet', oggPath], { stdio: 'pipe' });
1562
+ if (fs.statSync(oggPath).size > 0) {
1563
+ tempFiles.push(oggPath);
1564
+ uploadPaths.push(oggPath);
1565
+ } else { throw new Error('empty output'); }
1566
+ } catch {
1567
+ try { fs.unlinkSync(oggPath); } catch {}
1568
+ const wavPath = `${chunkBase}-${i}.wav`;
1569
+ await execFileAsync('ffmpeg', ['-ss', String(start), '-t', String(dur), '-i', flags.file, '-y', '-v', 'quiet', wavPath], { stdio: 'pipe' });
1570
+ if (!fs.statSync(wavPath).size) throw new Error(`ffmpeg produced empty chunk ${i}`);
1571
+ tempFiles.push(wavPath);
1572
+ uploadPaths.push(wavPath);
1573
+ }
1574
+ }
1575
+
1576
+ const totalUploadSize = uploadPaths.reduce((sum, p) => sum + fs.statSync(p).size, 0);
1577
+ spinner.update(`Compressed → ${formatFileSize(totalUploadSize)} total`);
1578
+
1579
+ // Abort handling
1580
+ const ac = new AbortController();
1581
+ abortHandler = () => { spinner.stop('Aborting...'); ac.abort(); };
1582
+ process.on('SIGINT', abortHandler);
1583
+
1584
+ // Transcribe chunks in parallel
1585
+ let completed = 0;
1586
+ const elapsed = () => {
1587
+ const s = Math.floor((Date.now() - t0) / 1000);
1588
+ return `${Math.floor(s / 60)}:${String(s % 60).padStart(2, '0')}`;
1589
+ };
1590
+ spinner.update(`Transcribing ${numChunks} chunks... ${DIM}(${elapsed()})${RESET}`);
1591
+ progressTimer = setInterval(() => {
1592
+ spinner.update(`Transcribing... ${completed}/${numChunks} done ${DIM}(${elapsed()})${RESET}`);
1593
+ }, 1000);
1594
+
1595
+ const chunkIndices = Array.from({ length: numChunks }, (_, i) => i);
1596
+ const results = await parallelMap(chunkIndices, async (i) => {
1597
+ const uploadPath = uploadPaths[i];
1598
+ const ext = path.extname(uploadPath).slice(1);
1599
+ const blob = await fs.openAsBlob(uploadPath);
1600
+ const file = new File([blob], `chunk-${i}.${ext}`, { type: MIME_TYPES[ext] || 'audio/wav' });
1601
+ const result = await callTranscribeAPI(file, { signal: ac.signal, timestamps: flags.timestamps });
1602
+ completed++;
1603
+ return result;
1604
+ }, MAX_PARALLEL);
1605
+
1606
+ clearInterval(progressTimer); progressTimer = null;
1607
+ process.removeListener('SIGINT', abortHandler); abortHandler = null;
1608
+
1609
+ // Merge results — no overlap, just concatenate text and offset timestamps
1610
+ const merged = mergeChunkResults(results, splitPoints);
1611
+ const totalLatency = Date.now() - t0;
1612
+ spinner.stop(`${GREEN}Done${RESET} (${(totalLatency / 1000).toFixed(1)}s, ${numChunks} chunks)`);
1613
+
1614
+ if (!merged.text) {
1615
+ process.stderr.write('No speech detected\n');
1616
+ return EXIT_TRANSCRIPTION;
1617
+ }
1618
+
1619
+ const wordCount = merged.text.split(/\s+/).filter(Boolean).length;
1620
+
1621
+ let output;
1622
+ if (flags.json) {
1623
+ const out = buildJsonOutput(
1624
+ { text: merged.text, latency: totalLatency, words: wordCount },
1625
+ { segments: merged.segments, words: merged.words, timestamps: flags.timestamps, diarize: false },
1626
+ );
1627
+ output = JSON.stringify(out, null, flags.output ? 2 : 0) + '\n';
1628
+ } else {
1629
+ output = merged.text + '\n';
1630
+ }
1631
+
1632
+ if (flags.output) {
1633
+ fs.writeFileSync(flags.output, output);
1634
+ process.stderr.write(`Saved to ${flags.output}\n`);
1635
+ } else {
1636
+ process.stdout.write(output);
1637
+ }
1638
+
1639
+ return EXIT_OK;
1640
+ } catch (err) {
1641
+ spinner.stop();
1642
+
1643
+ if (err.name === 'AbortError') {
1644
+ process.stderr.write('Aborted\n');
1645
+ return EXIT_TRANSCRIPTION;
1646
+ }
1647
+
1648
+ const parts = [`Error: ${err.message}`];
1649
+ if (fileSize) parts.push(` File: ${flags.file} (${formatFileSize(fileSize)})`);
1650
+
1651
+ if (err.networkError) {
1652
+ parts.push(' Hint: check your network connection and try again');
1653
+ } else if (err.status === 401) {
1654
+ parts.push(' Hint: invalid API key — run `dikt setup` to reconfigure');
1655
+ } else if (err.status === 429) {
1656
+ parts.push(' Hint: rate limited — wait a moment and try again');
1657
+ } else if (err.status >= 500) {
1658
+ parts.push(' Hint: Mistral API server error — try again later');
1659
+ }
1660
+
1661
+ process.stderr.write(parts.join('\n') + '\n');
1118
1662
  return EXIT_TRANSCRIPTION;
1663
+ } finally {
1664
+ if (progressTimer) clearInterval(progressTimer);
1665
+ if (abortHandler) process.removeListener('SIGINT', abortHandler);
1666
+ for (const f of tempFiles) { try { fs.unlinkSync(f); } catch {} }
1119
1667
  }
1120
1668
  }
1121
1669
 
@@ -1398,7 +1946,7 @@ async function main() {
1398
1946
  language: flagVal(args, '--language', 'e.g. en, de, fr'),
1399
1947
  file: flagVal(args, '--file', 'path to audio file'),
1400
1948
  noNewline: args.includes('--no-newline') || args.includes('-n'),
1401
- timestamps: flagVal(args, '--timestamps', 'segment, word, or segment,word', { valid: ['segment', 'word', 'segment,word'] }),
1949
+ timestamps: flagVal(args, '--timestamps', 'segment or word', { valid: ['segment', 'word'] }),
1402
1950
  diarize: args.includes('--diarize'),
1403
1951
  output: flagVal(args, '--output', 'path') || flagVal(args, '-o', 'path'),
1404
1952
  };
@@ -1468,13 +2016,13 @@ Options:
1468
2016
  --json Record once, output JSON to stdout
1469
2017
  -q, --quiet Record once, print transcript to stdout
1470
2018
  --stream Stream transcription chunks on pauses
1471
- --file <path> Transcribe an audio file (no mic needed)
2019
+ --file <path|url> Transcribe audio file or URL (via yt-dlp)
1472
2020
  -o, --output <path> Write output to file (.json auto-enables JSON)
1473
2021
  --silence <seconds> Silence duration before auto-stop (default: 2.0)
1474
2022
  --pause <seconds> Pause duration to split chunks (default: 1.0)
1475
2023
  --language <code> Language code, e.g. en, de, fr (default: auto)
1476
2024
  -n, --no-newline Join stream chunks without newlines
1477
- --timestamps <granularity> Add timestamps: segment, word, or segment,word
2025
+ --timestamps <granularity> Add timestamps: segment or word
1478
2026
  --diarize Enable speaker identification
1479
2027
  --no-input Fail if config is missing (no wizard)
1480
2028
  --no-color Disable colored output
@@ -1501,6 +2049,7 @@ Examples:
1501
2049
  dikt --file meeting.wav Transcribe an existing audio file
1502
2050
  dikt --file a.wav -o a.json Transcribe to a JSON file
1503
2051
  dikt --file a.wav -o a.txt Transcribe to a text file
2052
+ dikt --file https://youtube.com/watch?v=ID Transcribe from URL
1504
2053
  dikt --stream --silence 0 Stream continuously until Ctrl+C
1505
2054
  dikt --stream -n Stream as continuous flowing text
1506
2055
  dikt -q --json --diarize Transcribe with speaker labels
@@ -1514,13 +2063,13 @@ Environment variables:
1514
2063
 
1515
2064
  Exit codes:
1516
2065
  0 Success
1517
- 1 Missing dependency (sox)
2066
+ 1 Missing dependency (sox/ffmpeg)
1518
2067
  2 Not a terminal
1519
2068
  3 Configuration error
1520
2069
  4 Transcription error
1521
2070
 
1522
2071
  Config: ${CONFIG_DIR}/config.json
1523
- Requires: sox (brew install sox)`);
2072
+ Requires: sox (recording), ffmpeg (--file optimization), yt-dlp (URLs, optional)`);
1524
2073
  process.exit(EXIT_OK);
1525
2074
  }
1526
2075
 
@@ -1542,7 +2091,10 @@ Requires: sox (brew install sox)`);
1542
2091
 
1543
2092
  applyEnvOverrides(config);
1544
2093
  if (flags.language) config.language = flags.language;
1545
- if (!flags.timestamps && config.timestamps) flags.timestamps = config.timestamps;
2094
+ if (!flags.timestamps && config.timestamps) {
2095
+ // Migrate legacy 'segment,word' → 'word' (combined option removed)
2096
+ flags.timestamps = config.timestamps === 'segment,word' ? 'word' : config.timestamps;
2097
+ }
1546
2098
  if (!flags.diarize && config.diarize) flags.diarize = true;
1547
2099
  if (flags.output && flags.output.endsWith('.json')) flags.json = true;
1548
2100
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "dikt",
3
- "version": "1.2.0",
3
+ "version": "1.3.0",
4
4
  "description": "Voice dictation for the terminal.",
5
5
  "type": "module",
6
6
  "bin": {