dikt 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -4
- package/cli.mjs +594 -33
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -23,6 +23,11 @@ sudo apt install sox
|
|
|
23
23
|
sudo pacman -S sox
|
|
24
24
|
```
|
|
25
25
|
|
|
26
|
+
Optional dependencies for `--file` mode:
|
|
27
|
+
|
|
28
|
+
- [ffmpeg](https://ffmpeg.org/) — enables compression, chunked transcription of long files, and broader format support
|
|
29
|
+
- [yt-dlp](https://github.com/yt-dlp/yt-dlp) — enables transcribing audio from URLs (YouTube, podcasts, etc.)
|
|
30
|
+
|
|
26
31
|
## Setup
|
|
27
32
|
|
|
28
33
|
On first run, dikt will prompt you for your Mistral API key and model preferences:
|
|
@@ -90,7 +95,7 @@ dikt --stream --silence 0
|
|
|
90
95
|
|
|
91
96
|
### File mode
|
|
92
97
|
|
|
93
|
-
Transcribe an existing audio file (wav, mp3, m4a, flac, ogg, webm
|
|
98
|
+
Transcribe an existing audio file (wav, mp3, m4a, flac, ogg, webm, aac, wma, and more):
|
|
94
99
|
|
|
95
100
|
```bash
|
|
96
101
|
dikt --file meeting.wav
|
|
@@ -101,6 +106,10 @@ dikt --file meeting.wav -o transcript.txt
|
|
|
101
106
|
|
|
102
107
|
# With JSON output
|
|
103
108
|
dikt --file recording.mp3 --json
|
|
109
|
+
|
|
110
|
+
# Transcribe from a URL (requires yt-dlp)
|
|
111
|
+
dikt --file https://youtube.com/watch?v=VIDEO_ID
|
|
112
|
+
dikt --file https://youtube.com/watch?v=VIDEO_ID -o transcript.txt
|
|
104
113
|
```
|
|
105
114
|
|
|
106
115
|
### Speaker identification & timestamps
|
|
@@ -112,7 +121,7 @@ dikt -q --diarize
|
|
|
112
121
|
# Timestamps
|
|
113
122
|
dikt -q --timestamps segment
|
|
114
123
|
dikt -q --timestamps word
|
|
115
|
-
dikt
|
|
124
|
+
dikt --file lecture.mp3 --timestamps segment
|
|
116
125
|
|
|
117
126
|
# Combined with JSON
|
|
118
127
|
dikt -q --json --diarize
|
|
@@ -122,7 +131,7 @@ dikt -q --json --diarize
|
|
|
122
131
|
|
|
123
132
|
| Flag | Description |
|
|
124
133
|
|---|---|
|
|
125
|
-
| `--file <path>` | Transcribe
|
|
134
|
+
| `--file <path\|url>` | Transcribe audio file or URL (via yt-dlp) |
|
|
126
135
|
| `-o`, `--output <path>` | Write output to file (`.json` auto-enables JSON) |
|
|
127
136
|
| `--stream` | Stream transcription chunks on pauses |
|
|
128
137
|
| `--json` | Output JSON (single-shot or stream) |
|
|
@@ -130,7 +139,7 @@ dikt -q --json --diarize
|
|
|
130
139
|
| `--silence <seconds>` | Silence duration before auto-stop (default: 2.0) |
|
|
131
140
|
| `--pause <seconds>` | Pause duration to split stream chunks (default: 1.0) |
|
|
132
141
|
| `--language <code>` | Language code, e.g. en, de, fr (default: auto) |
|
|
133
|
-
| `--timestamps <granularity>` | Add timestamps: segment
|
|
142
|
+
| `--timestamps <granularity>` | Add timestamps: segment or word |
|
|
134
143
|
| `--diarize` | Enable speaker identification |
|
|
135
144
|
| `-n`, `--no-newline` | Join stream chunks without newlines |
|
|
136
145
|
| `--no-color` | Disable colored output |
|
package/cli.mjs
CHANGED
|
@@ -6,7 +6,10 @@ import fs from 'node:fs';
|
|
|
6
6
|
import path from 'node:path';
|
|
7
7
|
import os from 'node:os';
|
|
8
8
|
import readline from 'node:readline';
|
|
9
|
-
import { spawn, execFileSync } from 'node:child_process';
|
|
9
|
+
import { spawn, execFileSync, execFile as execFileCb } from 'node:child_process';
|
|
10
|
+
import { promisify } from 'node:util';
|
|
11
|
+
const execFileAsync = promisify(execFileCb);
|
|
12
|
+
import https from 'node:https';
|
|
10
13
|
|
|
11
14
|
// ── ANSI helpers ──────────────────────────────────────────────────────────────
|
|
12
15
|
|
|
@@ -37,9 +40,16 @@ if (process.env.NO_COLOR != null || process.env.TERM === 'dumb' || process.argv.
|
|
|
37
40
|
|
|
38
41
|
const moveTo = (row, col = 1) => `${ESC}${row};${col}H`;
|
|
39
42
|
|
|
43
|
+
// Render a byte count as a human-readable size string (B / KB / MB / GB),
// using 1024-based units and one decimal place above the byte range.
function formatFileSize(bytes) {
  const KB = 1024;
  const MB = KB * 1024;
  const GB = MB * 1024;
  if (bytes < KB) return `${bytes} B`;
  if (bytes < MB) return `${(bytes / KB).toFixed(1)} KB`;
  if (bytes < GB) return `${(bytes / MB).toFixed(1)} MB`;
  return `${(bytes / GB).toFixed(1)} GB`;
}
|
|
49
|
+
|
|
40
50
|
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
41
51
|
|
|
42
|
-
const VERSION = '1.
|
|
52
|
+
const VERSION = '1.4.0';
|
|
43
53
|
const CONFIG_BASE = process.env.XDG_CONFIG_HOME || path.join(os.homedir(), '.config');
|
|
44
54
|
const CONFIG_DIR = path.join(CONFIG_BASE, 'dikt');
|
|
45
55
|
const CONFIG_FILE = path.join(CONFIG_DIR, 'config.json');
|
|
@@ -47,6 +57,56 @@ const MAX_HISTORY = 10;
|
|
|
47
57
|
const MIN_RECORDING_MS = 500;
|
|
48
58
|
const COST_PER_MIN = 0.003;
|
|
49
59
|
const SPINNER = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
|
|
60
|
+
const TARGET_CHUNK_SEC = 270; // ~4.5 min target chunk size
|
|
61
|
+
const CHUNK_MIN_SEC = 360; // only chunk files longer than 6 minutes
|
|
62
|
+
const SPLIT_SEARCH_SEC = 30; // search ±30s around target for silence split point
|
|
63
|
+
const MIN_CHUNK_SEC = 30; // merge chunks shorter than this into neighbor
|
|
64
|
+
const MAX_PARALLEL = 4; // max concurrent API requests
|
|
65
|
+
const MIME_TYPES = { wav: 'audio/wav', mp3: 'audio/mpeg', ogg: 'audio/ogg', flac: 'audio/flac', opus: 'audio/ogg', webm: 'audio/webm', m4a: 'audio/mp4', aac: 'audio/aac', wma: 'audio/x-ms-wma', aif: 'audio/aiff', aiff: 'audio/aiff', mp4: 'audio/mp4', oga: 'audio/ogg', amr: 'audio/amr', caf: 'audio/x-caf' };
|
|
66
|
+
const COMPRESSIBLE = new Set(['wav', 'flac', 'aiff', 'aif', 'raw', 'caf']); // lossless formats worth re-encoding
|
|
67
|
+
|
|
68
|
+
// Build a progress spinner that animates on stderr, keeping stdout clean
// for piped transcript output. When stderr is not a TTY, each message is
// printed once on its own line instead of being animated in place.
function createStderrSpinner() {
  const tty = process.stderr.isTTY;
  let tick = 0;
  let timer = null;
  let message = '';

  // Draw the current frame + message on the same line.
  const draw = () => {
    const glyph = SPINNER[tick++ % SPINNER.length];
    process.stderr.write(`\r${CLEAR_LINE}${YELLOW}${glyph}${RESET} ${message}`);
  };

  return {
    start(msg) {
      message = msg;
      if (!tty) {
        process.stderr.write(`${message}\n`);
        return;
      }
      draw();
      timer = setInterval(draw, 80);
    },
    update(msg) {
      message = msg;
      if (!tty) {
        process.stderr.write(`${msg}\n`);
        return;
      }
      // Restart interval — prevents queued callbacks from firing after sync calls
      if (timer) { clearInterval(timer); }
      draw();
      timer = setInterval(draw, 80);
    },
    stop(finalMsg) {
      if (timer) { clearInterval(timer); timer = null; }
      if (tty) {
        process.stderr.write(`\r${CLEAR_LINE}`);
        if (finalMsg) process.stderr.write(`${finalMsg}\n`);
      } else if (finalMsg) {
        process.stderr.write(`${finalMsg}\n`);
      }
    },
  };
}
|
|
50
110
|
|
|
51
111
|
const EXIT_OK = 0;
|
|
52
112
|
const EXIT_DEPENDENCY = 1;
|
|
@@ -93,8 +153,8 @@ function validateConfig(cfg) {
|
|
|
93
153
|
|
|
94
154
|
// ── Setup wizard (form-based) ─────────────────────────────────────────────────
|
|
95
155
|
|
|
96
|
-
const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word'
|
|
97
|
-
const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word'
|
|
156
|
+
// Bidirectional mapping between the stored config value for timestamps
// ('' = disabled, 'segment', 'word') and the label shown in the setup
// wizard ('off' / 'segment' / 'word'). DISPLAY converts stored → label;
// VALUE converts label → stored.
const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word' };
const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word' };
|
|
98
158
|
|
|
99
159
|
async function setupWizard() {
|
|
100
160
|
const existing = loadConfig() || {};
|
|
@@ -105,7 +165,7 @@ async function setupWizard() {
|
|
|
105
165
|
{ key: 'language', label: 'Language', type: 'text', value: '', display: existing.language || 'auto', fallback: existing.language || '' },
|
|
106
166
|
{ key: 'temperature', label: 'Temperature', type: 'text', value: '', display: existing.temperature != null ? String(existing.temperature) : 'default', fallback: existing.temperature != null ? String(existing.temperature) : '' },
|
|
107
167
|
{ key: 'contextBias', label: 'Context bias', type: 'text', value: '', display: existing.contextBias || '', fallback: existing.contextBias || '' },
|
|
108
|
-
{ key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word'
|
|
168
|
+
{ key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word'], idx: ['off', 'segment', 'word'].indexOf(TIMESTAMPS_DISPLAY[existing.timestamps || ''] || 'off') },
|
|
109
169
|
{ key: 'diarize', label: 'Diarize', type: 'select', options: ['off', 'on'], idx: existing.diarize ? 1 : 0 },
|
|
110
170
|
];
|
|
111
171
|
|
|
@@ -398,13 +458,22 @@ function renderKeybar() {
|
|
|
398
458
|
return ` ${DIM}[SPACE]${RESET} Record ${copyKey}${autoCopyKey}${histKey}${retryKey}`.trimEnd();
|
|
399
459
|
}
|
|
400
460
|
|
|
461
|
+
// Format a duration given in seconds as "S.Ss", "Mm SS.Ss", or "Hh MMm SS.Ss".
// Seconds always keep one decimal place; the seconds field is zero-padded to
// width 4 ("05.0") and minutes to width 2 once a larger unit is present.
function formatDuration(seconds) {
  if (seconds < 60) return `${seconds.toFixed(1)}s`;
  const totalMinutes = Math.floor(seconds / 60);
  const secPart = (seconds % 60).toFixed(1).padStart(4, '0');
  if (totalMinutes < 60) return `${totalMinutes}m ${secPart}s`;
  const hours = Math.floor(totalMinutes / 60);
  const minPart = String(totalMinutes % 60).padStart(2, '0');
  return `${hours}h ${minPart}m ${secPart}s`;
}
|
|
470
|
+
|
|
401
471
|
function renderStatus() {
|
|
402
472
|
switch (state.mode) {
|
|
403
473
|
case 'idle':
|
|
404
474
|
return ` ${GREY}● Idle${RESET}`;
|
|
405
475
|
case 'recording': {
|
|
406
|
-
|
|
407
|
-
return ` ${RED}${BOLD}● Recording${RESET} ${RED}${secs}s${RESET}`;
|
|
476
|
+
return ` ${RED}${BOLD}● Recording${RESET} ${RED}${formatDuration(state.duration)}${RESET}`;
|
|
408
477
|
}
|
|
409
478
|
case 'transcribing': {
|
|
410
479
|
const sp = SPINNER[state.spinnerFrame % SPINNER.length];
|
|
@@ -498,7 +567,7 @@ function renderMeta() {
|
|
|
498
567
|
const cost = (state.duration / 60 * COST_PER_MIN).toFixed(4);
|
|
499
568
|
const latencyStr = state.latency ? `${(state.latency / 1000).toFixed(1)}s` : '—';
|
|
500
569
|
const histLabel = state.historyIndex >= 0 ? ` · history ${state.historyIndex + 1}/${state.history.length}` : '';
|
|
501
|
-
return ` ${DIM}${state.wordCount} words · ${state.duration
|
|
570
|
+
return ` ${DIM}${state.wordCount} words · ${formatDuration(state.duration)} · latency ${latencyStr} · $${cost}${histLabel}${RESET}`;
|
|
502
571
|
}
|
|
503
572
|
|
|
504
573
|
function renderHelp() {
|
|
@@ -953,7 +1022,7 @@ function trimSilence(rawData) {
|
|
|
953
1022
|
return Buffer.concat(output);
|
|
954
1023
|
}
|
|
955
1024
|
|
|
956
|
-
async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
|
|
1025
|
+
async function callTranscribeAPI(file, { signal, timestamps, diarize, onProgress } = {}) {
|
|
957
1026
|
const fd = new FormData();
|
|
958
1027
|
fd.append('file', file);
|
|
959
1028
|
fd.append('model', config.model);
|
|
@@ -961,7 +1030,7 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
|
|
|
961
1030
|
if (config.temperature != null) fd.append('temperature', String(config.temperature));
|
|
962
1031
|
if (config.contextBias) fd.append('context_bias', config.contextBias);
|
|
963
1032
|
if (timestamps) {
|
|
964
|
-
|
|
1033
|
+
fd.append('timestamp_granularities[]', timestamps);
|
|
965
1034
|
}
|
|
966
1035
|
if (diarize) {
|
|
967
1036
|
fd.append('diarize', 'true');
|
|
@@ -969,17 +1038,68 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
|
|
|
969
1038
|
if (!timestamps) fd.append('timestamp_granularities[]', 'segment');
|
|
970
1039
|
}
|
|
971
1040
|
|
|
972
|
-
|
|
973
|
-
|
|
1041
|
+
// Use Request to serialize FormData into multipart body,
|
|
1042
|
+
// then send via node:https which has no hardcoded headersTimeout
|
|
1043
|
+
// (Node's built-in fetch/undici has a 300s headersTimeout that
|
|
1044
|
+
// cannot be configured without importing undici as a dependency).
|
|
1045
|
+
const req = new Request('https://api.mistral.ai/v1/audio/transcriptions', {
|
|
974
1046
|
method: 'POST',
|
|
975
1047
|
headers: { Authorization: `Bearer ${config.apiKey}` },
|
|
976
1048
|
body: fd,
|
|
977
|
-
|
|
1049
|
+
});
|
|
1050
|
+
const contentType = req.headers.get('content-type');
|
|
1051
|
+
const body = Buffer.from(await req.arrayBuffer());
|
|
1052
|
+
|
|
1053
|
+
const t0 = Date.now();
|
|
1054
|
+
const { status, raw } = await new Promise((resolve, reject) => {
|
|
1055
|
+
const hreq = https.request('https://api.mistral.ai/v1/audio/transcriptions', {
|
|
1056
|
+
method: 'POST',
|
|
1057
|
+
headers: {
|
|
1058
|
+
'Authorization': `Bearer ${config.apiKey}`,
|
|
1059
|
+
'Content-Type': contentType,
|
|
1060
|
+
'Content-Length': body.length,
|
|
1061
|
+
},
|
|
1062
|
+
}, (res) => {
|
|
1063
|
+
const chunks = [];
|
|
1064
|
+
res.on('data', (c) => chunks.push(c));
|
|
1065
|
+
res.on('end', () => resolve({ status: res.statusCode, raw: Buffer.concat(chunks).toString() }));
|
|
1066
|
+
res.on('error', reject);
|
|
1067
|
+
});
|
|
1068
|
+
|
|
1069
|
+
hreq.on('error', (err) => {
|
|
1070
|
+
const ne = new Error(`Network error: ${err.message}`);
|
|
1071
|
+
ne.networkError = true;
|
|
1072
|
+
reject(ne);
|
|
1073
|
+
});
|
|
1074
|
+
|
|
1075
|
+
const abortSig = signal || AbortSignal.timeout(30_000);
|
|
1076
|
+
if (abortSig.aborted) { hreq.destroy(); reject(new DOMException('The operation was aborted', 'AbortError')); return; }
|
|
1077
|
+
abortSig.addEventListener('abort', () => {
|
|
1078
|
+
hreq.destroy();
|
|
1079
|
+
reject(abortSig.reason instanceof DOMException ? abortSig.reason
|
|
1080
|
+
: new DOMException('The operation was aborted', 'AbortError'));
|
|
1081
|
+
}, { once: true });
|
|
1082
|
+
|
|
1083
|
+
// Write body in chunks to enable upload progress tracking
|
|
1084
|
+
const CHUNK_SIZE = 256 * 1024;
|
|
1085
|
+
let written = 0;
|
|
1086
|
+
const total = body.length;
|
|
1087
|
+
const writeChunks = () => {
|
|
1088
|
+
while (written < total) {
|
|
1089
|
+
const end = Math.min(written + CHUNK_SIZE, total);
|
|
1090
|
+
const ok = hreq.write(body.subarray(written, end));
|
|
1091
|
+
written = end;
|
|
1092
|
+
if (onProgress) onProgress(written, total);
|
|
1093
|
+
if (!ok) { hreq.once('drain', writeChunks); return; }
|
|
1094
|
+
}
|
|
1095
|
+
if (onProgress) onProgress(-1, total); // upload done, server processing
|
|
1096
|
+
hreq.end();
|
|
1097
|
+
};
|
|
1098
|
+
writeChunks();
|
|
978
1099
|
});
|
|
979
1100
|
const latency = Date.now() - t0;
|
|
980
1101
|
|
|
981
|
-
if (
|
|
982
|
-
const raw = await resp.text().catch(() => '');
|
|
1102
|
+
if (status < 200 || status >= 300) {
|
|
983
1103
|
let msg;
|
|
984
1104
|
try {
|
|
985
1105
|
const e = JSON.parse(raw);
|
|
@@ -992,14 +1112,14 @@ async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
|
|
|
992
1112
|
}
|
|
993
1113
|
if (!msg) msg = raw;
|
|
994
1114
|
} catch {
|
|
995
|
-
msg = raw || `HTTP ${
|
|
1115
|
+
msg = raw || `HTTP ${status}`;
|
|
996
1116
|
}
|
|
997
1117
|
const err = new Error(msg);
|
|
998
|
-
err.status =
|
|
1118
|
+
err.status = status;
|
|
999
1119
|
throw err;
|
|
1000
1120
|
}
|
|
1001
1121
|
|
|
1002
|
-
const data =
|
|
1122
|
+
const data = JSON.parse(raw);
|
|
1003
1123
|
const text = (data.text || '').trim();
|
|
1004
1124
|
return { text, latency, segments: data.segments, words: data.words };
|
|
1005
1125
|
}
|
|
@@ -1059,28 +1179,297 @@ function buildJsonOutput(base, { segments, words, timestamps, diarize } = {}) {
|
|
|
1059
1179
|
return out;
|
|
1060
1180
|
}
|
|
1061
1181
|
|
|
1182
|
+
// ── File optimization helpers ────────────────────────────────────────────────
|
|
1183
|
+
|
|
1184
|
+
// Cached ffmpeg/ffprobe availability (undefined = not probed yet).
let _ffmpegAvail;

// True when both `ffmpeg` and `ffprobe` can be executed on this system.
// The probe runs at most once per process; later calls return the cached flag.
function ffmpegAvailable() {
  if (_ffmpegAvail === undefined) {
    try {
      execFileSync('ffmpeg', ['-version'], { stdio: 'pipe' });
      execFileSync('ffprobe', ['-version'], { stdio: 'pipe' });
      _ffmpegAvail = true;
    } catch {
      _ffmpegAvail = false;
    }
  }
  return _ffmpegAvail;
}
|
|
1194
|
+
|
|
1195
|
+
// Cached yt-dlp availability (undefined = not probed yet).
let _ytdlpAvail;

// True when the `yt-dlp` binary can be executed; probed once per process.
function ytdlpAvailable() {
  if (_ytdlpAvail === undefined) {
    try {
      execFileSync('yt-dlp', ['--version'], { stdio: 'pipe' });
      _ytdlpAvail = true;
    } catch {
      _ytdlpAvail = false;
    }
  }
  return _ytdlpAvail;
}
|
|
1202
|
+
|
|
1203
|
+
// Download the audio track of `url` via yt-dlp into the OS temp dir,
// re-encoded to opus at 48K, reporting progress through `spinner.update()`.
// Resolves with the path of the produced file; rejects on non-zero exit,
// SIGINT abort, or when no output file is found afterwards. Partial files
// are cleaned up on failure.
function downloadWithYtdlp(url, spinner) {
  // Unique temp basename; yt-dlp substitutes the real extension for %(ext)s.
  const tmpBase = path.join(os.tmpdir(), `dikt-ytdlp-${process.pid}-${Date.now()}`);
  const outTemplate = `${tmpBase}.%(ext)s`;

  return new Promise((resolve, reject) => {
    // --newline makes yt-dlp emit one progress line per update (parseable below).
    const proc = spawn('yt-dlp', [
      '-x', '--audio-format', 'opus', '--audio-quality', '48K',
      '-o', outTemplate, '--no-playlist', '--newline', url,
    ], { stdio: ['ignore', 'pipe', 'pipe'] });

    // Best-effort removal of any files yt-dlp created under our temp prefix.
    const cleanupPartial = () => {
      const dir = path.dirname(tmpBase);
      const prefix = path.basename(tmpBase);
      try {
        for (const f of fs.readdirSync(dir)) {
          if (f.startsWith(prefix) && f.length > prefix.length) try { fs.unlinkSync(path.join(dir, f)); } catch {}
        }
      } catch {}
    };

    // Ctrl-C kills the child; the 'close' handler then rejects as aborted.
    let aborted = false;
    const onSigint = () => { aborted = true; proc.kill(); };
    process.on('SIGINT', onSigint);

    // lastErr keeps the most recent stderr line for the failure message.
    let lastErr = '';
    // Translate yt-dlp progress lines into spinner messages.
    const parseOutput = (chunk) => {
      const lines = chunk.toString().split('\n');
      for (const line of lines) {
        if (!line.trim()) continue;
        const dl = line.match(/\[download\]\s+([\d.]+)%/);
        if (dl) { spinner.update(`Downloading... ${Math.round(parseFloat(dl[1]))}%`); continue; }
        if (/\[ExtractAudio\]/.test(line)) { spinner.update('Converting audio...'); continue; }
        if (/\[download\]\s+Destination:/.test(line)) { spinner.update('Downloading...'); continue; }
      }
    };
    proc.stdout.on('data', parseOutput);
    proc.stderr.on('data', (chunk) => {
      lastErr = chunk.toString().trim().split('\n').pop();
      parseOutput(chunk);
    });

    proc.on('close', (code) => {
      process.removeListener('SIGINT', onSigint);
      if (aborted) { cleanupPartial(); return reject(new Error('Download aborted')); }
      if (code !== 0) { cleanupPartial(); return reject(new Error(lastErr || `yt-dlp exited with code ${code}`)); }
      // yt-dlp may produce a different extension than requested; find the actual file
      const dir = path.dirname(tmpBase);
      const prefix = path.basename(tmpBase);
      try {
        const match = fs.readdirSync(dir).find(f => f.startsWith(prefix) && f.length > prefix.length);
        if (!match) return reject(new Error('yt-dlp produced no output file'));
        resolve(path.join(dir, match));
      } catch (err) { reject(err); }
    });
  });
}
|
|
1259
|
+
|
|
1260
|
+
// Probe a media file's duration in seconds using ffprobe.
// Returns 0 when ffprobe is unavailable, exits non-zero, or prints
// something that does not parse as a number.
function getAudioDuration(filePath) {
  const args = ['-v', 'quiet', '-show_entries', 'format=duration', '-of', 'csv=p=0', filePath];
  try {
    const printed = execFileSync('ffprobe', args, { stdio: 'pipe', encoding: 'utf8' });
    return parseFloat(printed.trim()) || 0;
  } catch {
    return 0;
  }
}
|
|
1266
|
+
|
|
1267
|
+
// Re-encode `inputPath` to a 48 kbps .ogg file in the OS temp dir, trying
// libopus first and falling back to libvorbis. Resolves with the new path
// when a non-empty output was produced, or null when both codecs failed.
// Failed/empty outputs are deleted best-effort.
async function compressAudio(inputPath) {
  const stem = path.basename(inputPath, path.extname(inputPath));
  const base = path.join(os.tmpdir(), `dikt-${process.pid}-${Date.now()}-${stem}`);
  for (const codec of ['libopus', 'libvorbis']) {
    const outPath = `${base}.ogg`;
    try {
      await execFileAsync('ffmpeg', ['-i', inputPath, '-c:a', codec, '-b:a', '48k', '-y', '-v', 'quiet', outPath], { stdio: 'pipe' });
      if (fs.statSync(outPath).size > 0) return outPath;
      try { fs.unlinkSync(outPath); } catch {}
    } catch {
      try { fs.unlinkSync(outPath); } catch {}
    }
  }
  return null;
}
|
|
1279
|
+
|
|
1280
|
+
// Pick a chunk boundary near `targetSec` that lands inside a stretch of
// silence, so chunked transcription does not cut a word in half. Decodes a
// ±SPLIT_SEARCH_SEC window of 16 kHz 16-bit mono PCM around the target via
// ffmpeg, scans it in 50 ms windows, and returns the center of the longest
// silent run if it lasts at least 500 ms; otherwise falls back to targetSec.
// NOTE(review): relies on peakAmplitude() and SILENCE_THRESHOLD defined
// elsewhere in this file — presumably the same threshold used for live
// recording; verify against those definitions.
async function findSilenceSplitPoint(filePath, targetSec) {
  const startSec = Math.max(0, targetSec - SPLIT_SEARCH_SEC);
  const durSec = SPLIT_SEARCH_SEC * 2;

  try {
    // Extract a small window of raw PCM around the target for silence analysis
    const { stdout: raw } = await execFileAsync('ffmpeg', [
      '-ss', String(startSec), '-t', String(durSec), '-i', filePath,
      '-f', 's16le', '-ar', '16000', '-ac', '1', '-v', 'quiet', '-',
    ], { encoding: 'buffer', maxBuffer: 16000 * 2 * durSec + 4096 });

    // Scan for silence in 50ms windows
    const WINDOW_BYTES = Math.round(16000 * 0.05) * 2; // 50ms at 16kHz 16-bit mono
    // bestOffset/bestLen track the longest silent run seen so far;
    // runStart/runLen track the silent run currently being extended.
    let bestOffset = -1, bestLen = 0;
    let runStart = -1, runLen = 0;

    for (let offset = 0; offset + WINDOW_BYTES <= raw.length; offset += WINDOW_BYTES) {
      const peak = peakAmplitude(raw.subarray(offset, offset + WINDOW_BYTES));
      if (peak < SILENCE_THRESHOLD) {
        if (runStart === -1) runStart = offset;
        runLen++;
      } else {
        if (runLen > bestLen) { bestOffset = runStart; bestLen = runLen; }
        runStart = -1; runLen = 0;
      }
    }
    // Close out a silent run that extends to the end of the window.
    if (runLen > bestLen) { bestOffset = runStart; bestLen = runLen; }

    if (bestLen >= 10) { // at least 500ms of silence (avoids mid-word splits)
      // Convert the byte offset of the run's midpoint back to seconds
      // (16000 samples/s × 2 bytes/sample) relative to the window start.
      const centerBytes = bestOffset + Math.floor(bestLen / 2) * WINDOW_BYTES;
      return startSec + centerBytes / (16000 * 2);
    }
  } catch {}

  return targetSec; // fallback: no silence found, split at target
}
|
|
1316
|
+
|
|
1317
|
+
// Normalize one chunk's transcript text for merging. When the API
// spontaneously returns [PRINT_WORDLEVEL_TIME] markup, strip the marker and
// its <MM:SS.s> timing tags and collapse the leftover whitespace; otherwise
// only trim. Falsy input yields ''.
function cleanChunkText(t) {
  if (!t) return '';
  if (!t.includes('[PRINT_WORDLEVEL_TIME]')) return t.trim();
  const stripped = t
    .replace(/\[PRINT_WORDLEVEL_TIME\]/g, '')
    .replace(/<\/?\d{2}:\d{2}\.\d+>/g, '')
    .replace(/\s+/g, ' ');
  return stripped.trim();
}
|
|
1327
|
+
|
|
1328
|
+
// Merge per-chunk API results of a chunked transcription into one result:
// cleaned text concatenated in order, segment/word timestamps shifted by
// each chunk's start offset (splitPoints[i], rounded to 0.1s), and the
// slowest chunk's latency.
function mergeChunkResults(results, splitPoints) {
  // No overlap — just concatenate text, offset timestamps
  const joined = results.map((r) => cleanChunkText(r.text)).filter(Boolean).join(' ');
  // Fix missing spaces after punctuation (API omits leading spaces on some segments)
  const text = joined.replace(/([.!?,])([A-Za-z])/g, '$1 $2');

  const round1 = (n) => Math.round(n * 10) / 10;
  // Copy a segment/word entry with its timestamps shifted by `offset` seconds.
  const shifted = (entry, offset) => ({
    ...entry,
    start: round1(entry.start + offset),
    end: round1(entry.end + offset),
  });

  let maxLatency = 0;
  const allSegments = [];
  const allWords = [];
  results.forEach((r, i) => {
    const offset = splitPoints[i];
    if (r.latency > maxLatency) maxLatency = r.latency;
    if (r.segments) {
      for (const seg of r.segments) allSegments.push(shifted(seg, offset));
    }
    if (r.words) {
      for (const w of r.words) allWords.push(shifted(w, offset));
    }
  });

  return {
    text,
    latency: maxLatency,
    segments: allSegments.length ? allSegments : undefined,
    words: allWords.length ? allWords : undefined,
  };
}
|
|
1362
|
+
|
|
1363
|
+
// Map an async `fn` over `items` with at most `concurrency` calls in flight.
// Results are returned in input order; any rejection propagates to the caller.
async function parallelMap(items, fn, concurrency) {
  const results = new Array(items.length);
  let cursor = 0;
  // Each worker repeatedly claims the next unprocessed index until none remain.
  const drain = async () => {
    while (cursor < items.length) {
      const idx = cursor;
      cursor += 1;
      results[idx] = await fn(items[idx], idx);
    }
  };
  const workerCount = Math.min(concurrency, items.length);
  await Promise.all(Array.from({ length: workerCount }, () => drain()));
  return results;
}
|
|
1370
|
+
|
|
1062
1371
|
// ── File mode ────────────────────────────────────────────────────────────────
|
|
1063
1372
|
|
|
1064
1373
|
async function runFile(flags) {
|
|
1374
|
+
const spinner = createStderrSpinner();
|
|
1375
|
+
let fileSize = 0;
|
|
1376
|
+
let transcribeTimer = null;
|
|
1377
|
+
const tempFiles = [];
|
|
1378
|
+
|
|
1065
1379
|
try {
|
|
1066
|
-
|
|
1380
|
+
const isURL = /^https?:\/\//i.test(flags.file);
|
|
1381
|
+
|
|
1382
|
+
if (isURL) {
|
|
1383
|
+
if (!ytdlpAvailable()) {
|
|
1384
|
+
process.stderr.write(`\n${RED}${BOLD} yt-dlp not found.${RESET}\n\n`);
|
|
1385
|
+
process.stderr.write(` yt-dlp is required to download audio from URLs. Install it:\n\n`);
|
|
1386
|
+
if (process.platform === 'darwin') {
|
|
1387
|
+
process.stderr.write(` ${BOLD}brew install yt-dlp${RESET}\n\n`);
|
|
1388
|
+
} else if (process.platform === 'win32') {
|
|
1389
|
+
process.stderr.write(` ${BOLD}choco install yt-dlp${RESET} or ${BOLD}scoop install yt-dlp${RESET}\n\n`);
|
|
1390
|
+
} else {
|
|
1391
|
+
process.stderr.write(` ${BOLD}sudo apt install yt-dlp${RESET} (Debian/Ubuntu)\n`);
|
|
1392
|
+
process.stderr.write(` ${BOLD}pip install yt-dlp${RESET} (any platform)\n\n`);
|
|
1393
|
+
}
|
|
1394
|
+
return EXIT_DEPENDENCY;
|
|
1395
|
+
}
|
|
1396
|
+
spinner.start('Downloading audio...');
|
|
1397
|
+
try {
|
|
1398
|
+
const downloaded = await downloadWithYtdlp(flags.file, spinner);
|
|
1399
|
+
tempFiles.push(downloaded);
|
|
1400
|
+
flags = { ...flags, file: downloaded };
|
|
1401
|
+
} catch (err) {
|
|
1402
|
+
spinner.stop();
|
|
1403
|
+
process.stderr.write(`Error downloading: ${err.message}\n`);
|
|
1404
|
+
return EXIT_TRANSCRIPTION;
|
|
1405
|
+
}
|
|
1406
|
+
spinner.update('Processing audio...');
|
|
1407
|
+
} else if (!flags.file || !fs.existsSync(flags.file)) {
|
|
1067
1408
|
process.stderr.write(`Error: file not found: ${flags.file}\n`);
|
|
1068
1409
|
return EXIT_TRANSCRIPTION;
|
|
1410
|
+
} else {
|
|
1411
|
+
spinner.start('Reading file...');
|
|
1069
1412
|
}
|
|
1413
|
+
fileSize = fs.statSync(flags.file).size;
|
|
1414
|
+
const ext = path.extname(flags.file).slice(1).toLowerCase() || 'wav';
|
|
1070
1415
|
|
|
1071
|
-
|
|
1072
|
-
const
|
|
1073
|
-
const
|
|
1074
|
-
const
|
|
1075
|
-
|
|
1416
|
+
// Check if ffmpeg is available for chunking / compression optimizations
|
|
1417
|
+
const hasFFmpeg = ffmpegAvailable();
|
|
1418
|
+
const duration = hasFFmpeg ? getAudioDuration(flags.file) : 0;
|
|
1419
|
+
const canChunk = hasFFmpeg && !flags.diarize && duration > CHUNK_MIN_SEC;
|
|
1420
|
+
|
|
1421
|
+
if (canChunk) {
|
|
1422
|
+
spinner.stop();
|
|
1423
|
+
return await runFileChunked(flags, { fileSize, duration });
|
|
1424
|
+
}
|
|
1425
|
+
|
|
1426
|
+
// Compress uncompressed formats (wav/flac → ogg) for faster upload
|
|
1427
|
+
let uploadPath = flags.file;
|
|
1428
|
+
let uploadExt = ext;
|
|
1429
|
+
if (hasFFmpeg && COMPRESSIBLE.has(ext)) {
|
|
1430
|
+
spinner.update('Compressing...');
|
|
1431
|
+
const compressed = await compressAudio(flags.file);
|
|
1432
|
+
if (compressed) {
|
|
1433
|
+
const newSize = fs.statSync(compressed).size;
|
|
1434
|
+
if (newSize < fileSize) {
|
|
1435
|
+
tempFiles.push(compressed);
|
|
1436
|
+
uploadPath = compressed;
|
|
1437
|
+
uploadExt = path.extname(compressed).slice(1);
|
|
1438
|
+
spinner.update(`Compressed ${formatFileSize(fileSize)} → ${formatFileSize(newSize)}`);
|
|
1439
|
+
} else {
|
|
1440
|
+
try { fs.unlinkSync(compressed); } catch {}
|
|
1441
|
+
}
|
|
1442
|
+
}
|
|
1443
|
+
}
|
|
1444
|
+
|
|
1445
|
+
const blob = await fs.openAsBlob(uploadPath);
|
|
1446
|
+
const mime = MIME_TYPES[uploadExt] || 'application/octet-stream';
|
|
1447
|
+
const file = new File([blob], path.basename(uploadPath), { type: mime });
|
|
1448
|
+
|
|
1449
|
+
spinner.update(`Uploading to API... (${formatFileSize(blob.size)})`);
|
|
1076
1450
|
|
|
1077
1451
|
const ac = new AbortController();
|
|
1078
|
-
const abortHandler = () => ac.abort();
|
|
1452
|
+
const abortHandler = () => { spinner.stop('Aborting...'); ac.abort(); };
|
|
1079
1453
|
process.on('SIGINT', abortHandler);
|
|
1080
1454
|
|
|
1081
|
-
const
|
|
1455
|
+
const onProgress = (sent, total) => {
|
|
1456
|
+
if (sent === -1) {
|
|
1457
|
+
const t0 = Date.now();
|
|
1458
|
+
const elapsed = () => { const s = Math.floor((Date.now() - t0) / 1000); return `${Math.floor(s / 60)}:${String(s % 60).padStart(2, '0')}`; };
|
|
1459
|
+
spinner.update(`Transcribing... ${DIM}(${elapsed()})${RESET}`);
|
|
1460
|
+
transcribeTimer = setInterval(() => spinner.update(`Transcribing... ${DIM}(${elapsed()})${RESET}`), 1000);
|
|
1461
|
+
} else {
|
|
1462
|
+
const pct = Math.round((sent / total) * 100);
|
|
1463
|
+
spinner.update(`Uploading ${pct}% (${formatFileSize(sent)} / ${formatFileSize(total)})`);
|
|
1464
|
+
}
|
|
1465
|
+
};
|
|
1466
|
+
|
|
1467
|
+
const result = await callTranscribeAPI(file, { signal: ac.signal, timestamps: flags.timestamps, diarize: flags.diarize, onProgress });
|
|
1468
|
+
if (transcribeTimer) clearInterval(transcribeTimer);
|
|
1082
1469
|
process.removeListener('SIGINT', abortHandler);
|
|
1083
1470
|
|
|
1471
|
+
spinner.stop(`${GREEN}Done${RESET} (${(result.latency / 1000).toFixed(1)}s)`);
|
|
1472
|
+
|
|
1084
1473
|
if (!result.text) {
|
|
1085
1474
|
process.stderr.write('No speech detected\n');
|
|
1086
1475
|
return EXIT_TRANSCRIPTION;
|
|
@@ -1110,12 +1499,180 @@ async function runFile(flags) {
|
|
|
1110
1499
|
|
|
1111
1500
|
return EXIT_OK;
|
|
1112
1501
|
} catch (err) {
|
|
1502
|
+
if (transcribeTimer) clearInterval(transcribeTimer);
|
|
1503
|
+
spinner.stop();
|
|
1504
|
+
|
|
1113
1505
|
if (err.name === 'AbortError') {
|
|
1114
1506
|
process.stderr.write('Aborted\n');
|
|
1115
1507
|
return EXIT_TRANSCRIPTION;
|
|
1116
1508
|
}
|
|
1117
|
-
|
|
1509
|
+
|
|
1510
|
+
const parts = [`Error: ${err.message}`];
|
|
1511
|
+
if (fileSize) parts.push(` File: ${flags.file} (${formatFileSize(fileSize)})`);
|
|
1512
|
+
|
|
1513
|
+
if (err.networkError) {
|
|
1514
|
+
parts.push(' Hint: check your network connection and try again');
|
|
1515
|
+
} else if (err.status === 401) {
|
|
1516
|
+
parts.push(' Hint: invalid API key — run `dikt setup` to reconfigure');
|
|
1517
|
+
} else if (err.status === 413) {
|
|
1518
|
+
parts.push(' Hint: file is too large for the API — try a shorter recording');
|
|
1519
|
+
} else if (err.status === 429) {
|
|
1520
|
+
parts.push(' Hint: rate limited — wait a moment and try again');
|
|
1521
|
+
} else if (err.status >= 500) {
|
|
1522
|
+
parts.push(' Hint: Mistral API server error — try again later');
|
|
1523
|
+
}
|
|
1524
|
+
|
|
1525
|
+
process.stderr.write(parts.join('\n') + '\n');
|
|
1118
1526
|
return EXIT_TRANSCRIPTION;
|
|
1527
|
+
} finally {
|
|
1528
|
+
for (const f of tempFiles) { try { fs.unlinkSync(f); } catch {} }
|
|
1529
|
+
}
|
|
1530
|
+
}
|
|
1531
|
+
|
|
1532
|
+
async function runFileChunked(flags, { fileSize, duration }) {
|
|
1533
|
+
const spinner = createStderrSpinner();
|
|
1534
|
+
const tempFiles = [];
|
|
1535
|
+
const t0 = Date.now();
|
|
1536
|
+
let progressTimer = null;
|
|
1537
|
+
let abortHandler = null;
|
|
1538
|
+
|
|
1539
|
+
try {
|
|
1540
|
+
// Find optimal split points at silence boundaries
|
|
1541
|
+
const numTargetChunks = Math.ceil(duration / TARGET_CHUNK_SEC);
|
|
1542
|
+
spinner.start('Analyzing audio for split points...');
|
|
1543
|
+
|
|
1544
|
+
const splitPoints = [0];
|
|
1545
|
+
for (let i = 1; i < numTargetChunks; i++) {
|
|
1546
|
+
spinner.update(`Finding split point ${i}/${numTargetChunks - 1}...`);
|
|
1547
|
+
splitPoints.push(await findSilenceSplitPoint(flags.file, i * TARGET_CHUNK_SEC));
|
|
1548
|
+
}
|
|
1549
|
+
splitPoints.push(duration);
|
|
1550
|
+
|
|
1551
|
+
// Merge tiny trailing chunks (< MIN_CHUNK_SEC) into the previous one
|
|
1552
|
+
for (let i = splitPoints.length - 2; i > 0; i--) {
|
|
1553
|
+
if (splitPoints[i + 1] - splitPoints[i] < MIN_CHUNK_SEC) {
|
|
1554
|
+
splitPoints.splice(i, 1);
|
|
1555
|
+
}
|
|
1556
|
+
}
|
|
1557
|
+
|
|
1558
|
+
const numChunks = splitPoints.length - 1;
|
|
1559
|
+
|
|
1560
|
+
// Split audio and compress each chunk
|
|
1561
|
+
const chunkBase = path.join(os.tmpdir(), `dikt-${process.pid}-${Date.now()}`);
|
|
1562
|
+
const uploadPaths = [];
|
|
1563
|
+
|
|
1564
|
+
for (let i = 0; i < numChunks; i++) {
|
|
1565
|
+
spinner.update(`Preparing chunk ${i + 1}/${numChunks}...`);
|
|
1566
|
+
const start = splitPoints[i];
|
|
1567
|
+
const dur = splitPoints[i + 1] - start;
|
|
1568
|
+
const oggPath = `${chunkBase}-${i}.ogg`;
|
|
1569
|
+
try {
|
|
1570
|
+
await execFileAsync('ffmpeg', ['-ss', String(start), '-t', String(dur), '-i', flags.file, '-c:a', 'libopus', '-b:a', '48k', '-y', '-v', 'quiet', oggPath], { stdio: 'pipe' });
|
|
1571
|
+
if (fs.statSync(oggPath).size > 0) {
|
|
1572
|
+
tempFiles.push(oggPath);
|
|
1573
|
+
uploadPaths.push(oggPath);
|
|
1574
|
+
} else { throw new Error('empty output'); }
|
|
1575
|
+
} catch {
|
|
1576
|
+
try { fs.unlinkSync(oggPath); } catch {}
|
|
1577
|
+
const wavPath = `${chunkBase}-${i}.wav`;
|
|
1578
|
+
await execFileAsync('ffmpeg', ['-ss', String(start), '-t', String(dur), '-i', flags.file, '-y', '-v', 'quiet', wavPath], { stdio: 'pipe' });
|
|
1579
|
+
if (!fs.statSync(wavPath).size) throw new Error(`ffmpeg produced empty chunk ${i}`);
|
|
1580
|
+
tempFiles.push(wavPath);
|
|
1581
|
+
uploadPaths.push(wavPath);
|
|
1582
|
+
}
|
|
1583
|
+
}
|
|
1584
|
+
|
|
1585
|
+
const totalUploadSize = uploadPaths.reduce((sum, p) => sum + fs.statSync(p).size, 0);
|
|
1586
|
+
spinner.update(`Compressed → ${formatFileSize(totalUploadSize)} total`);
|
|
1587
|
+
|
|
1588
|
+
// Abort handling
|
|
1589
|
+
const ac = new AbortController();
|
|
1590
|
+
abortHandler = () => { spinner.stop('Aborting...'); ac.abort(); };
|
|
1591
|
+
process.on('SIGINT', abortHandler);
|
|
1592
|
+
|
|
1593
|
+
// Transcribe chunks in parallel
|
|
1594
|
+
let completed = 0;
|
|
1595
|
+
const elapsed = () => {
|
|
1596
|
+
const s = Math.floor((Date.now() - t0) / 1000);
|
|
1597
|
+
return `${Math.floor(s / 60)}:${String(s % 60).padStart(2, '0')}`;
|
|
1598
|
+
};
|
|
1599
|
+
spinner.update(`Transcribing ${numChunks} chunks... ${DIM}(${elapsed()})${RESET}`);
|
|
1600
|
+
progressTimer = setInterval(() => {
|
|
1601
|
+
spinner.update(`Transcribing... ${completed}/${numChunks} done ${DIM}(${elapsed()})${RESET}`);
|
|
1602
|
+
}, 1000);
|
|
1603
|
+
|
|
1604
|
+
const chunkIndices = Array.from({ length: numChunks }, (_, i) => i);
|
|
1605
|
+
const results = await parallelMap(chunkIndices, async (i) => {
|
|
1606
|
+
const uploadPath = uploadPaths[i];
|
|
1607
|
+
const ext = path.extname(uploadPath).slice(1);
|
|
1608
|
+
const blob = await fs.openAsBlob(uploadPath);
|
|
1609
|
+
const file = new File([blob], `chunk-${i}.${ext}`, { type: MIME_TYPES[ext] || 'audio/wav' });
|
|
1610
|
+
const result = await callTranscribeAPI(file, { signal: ac.signal, timestamps: flags.timestamps });
|
|
1611
|
+
completed++;
|
|
1612
|
+
return result;
|
|
1613
|
+
}, MAX_PARALLEL);
|
|
1614
|
+
|
|
1615
|
+
clearInterval(progressTimer); progressTimer = null;
|
|
1616
|
+
process.removeListener('SIGINT', abortHandler); abortHandler = null;
|
|
1617
|
+
|
|
1618
|
+
// Merge results — no overlap, just concatenate text and offset timestamps
|
|
1619
|
+
const merged = mergeChunkResults(results, splitPoints);
|
|
1620
|
+
const totalLatency = Date.now() - t0;
|
|
1621
|
+
spinner.stop(`${GREEN}Done${RESET} (${(totalLatency / 1000).toFixed(1)}s, ${numChunks} chunks)`);
|
|
1622
|
+
|
|
1623
|
+
if (!merged.text) {
|
|
1624
|
+
process.stderr.write('No speech detected\n');
|
|
1625
|
+
return EXIT_TRANSCRIPTION;
|
|
1626
|
+
}
|
|
1627
|
+
|
|
1628
|
+
const wordCount = merged.text.split(/\s+/).filter(Boolean).length;
|
|
1629
|
+
|
|
1630
|
+
let output;
|
|
1631
|
+
if (flags.json) {
|
|
1632
|
+
const out = buildJsonOutput(
|
|
1633
|
+
{ text: merged.text, latency: totalLatency, words: wordCount },
|
|
1634
|
+
{ segments: merged.segments, words: merged.words, timestamps: flags.timestamps, diarize: false },
|
|
1635
|
+
);
|
|
1636
|
+
output = JSON.stringify(out, null, flags.output ? 2 : 0) + '\n';
|
|
1637
|
+
} else {
|
|
1638
|
+
output = merged.text + '\n';
|
|
1639
|
+
}
|
|
1640
|
+
|
|
1641
|
+
if (flags.output) {
|
|
1642
|
+
fs.writeFileSync(flags.output, output);
|
|
1643
|
+
process.stderr.write(`Saved to ${flags.output}\n`);
|
|
1644
|
+
} else {
|
|
1645
|
+
process.stdout.write(output);
|
|
1646
|
+
}
|
|
1647
|
+
|
|
1648
|
+
return EXIT_OK;
|
|
1649
|
+
} catch (err) {
|
|
1650
|
+
spinner.stop();
|
|
1651
|
+
|
|
1652
|
+
if (err.name === 'AbortError') {
|
|
1653
|
+
process.stderr.write('Aborted\n');
|
|
1654
|
+
return EXIT_TRANSCRIPTION;
|
|
1655
|
+
}
|
|
1656
|
+
|
|
1657
|
+
const parts = [`Error: ${err.message}`];
|
|
1658
|
+
if (fileSize) parts.push(` File: ${flags.file} (${formatFileSize(fileSize)})`);
|
|
1659
|
+
|
|
1660
|
+
if (err.networkError) {
|
|
1661
|
+
parts.push(' Hint: check your network connection and try again');
|
|
1662
|
+
} else if (err.status === 401) {
|
|
1663
|
+
parts.push(' Hint: invalid API key — run `dikt setup` to reconfigure');
|
|
1664
|
+
} else if (err.status === 429) {
|
|
1665
|
+
parts.push(' Hint: rate limited — wait a moment and try again');
|
|
1666
|
+
} else if (err.status >= 500) {
|
|
1667
|
+
parts.push(' Hint: Mistral API server error — try again later');
|
|
1668
|
+
}
|
|
1669
|
+
|
|
1670
|
+
process.stderr.write(parts.join('\n') + '\n');
|
|
1671
|
+
return EXIT_TRANSCRIPTION;
|
|
1672
|
+
} finally {
|
|
1673
|
+
if (progressTimer) clearInterval(progressTimer);
|
|
1674
|
+
if (abortHandler) process.removeListener('SIGINT', abortHandler);
|
|
1675
|
+
for (const f of tempFiles) { try { fs.unlinkSync(f); } catch {} }
|
|
1119
1676
|
}
|
|
1120
1677
|
}
|
|
1121
1678
|
|
|
@@ -1398,7 +1955,7 @@ async function main() {
|
|
|
1398
1955
|
language: flagVal(args, '--language', 'e.g. en, de, fr'),
|
|
1399
1956
|
file: flagVal(args, '--file', 'path to audio file'),
|
|
1400
1957
|
noNewline: args.includes('--no-newline') || args.includes('-n'),
|
|
1401
|
-
timestamps: flagVal(args, '--timestamps', 'segment
|
|
1958
|
+
timestamps: flagVal(args, '--timestamps', 'segment or word', { valid: ['segment', 'word'] }),
|
|
1402
1959
|
diarize: args.includes('--diarize'),
|
|
1403
1960
|
output: flagVal(args, '--output', 'path') || flagVal(args, '-o', 'path'),
|
|
1404
1961
|
};
|
|
@@ -1468,13 +2025,13 @@ Options:
|
|
|
1468
2025
|
--json Record once, output JSON to stdout
|
|
1469
2026
|
-q, --quiet Record once, print transcript to stdout
|
|
1470
2027
|
--stream Stream transcription chunks on pauses
|
|
1471
|
-
--file <path>
|
|
2028
|
+
--file <path|url> Transcribe audio file or URL (via yt-dlp)
|
|
1472
2029
|
-o, --output <path> Write output to file (.json auto-enables JSON)
|
|
1473
2030
|
--silence <seconds> Silence duration before auto-stop (default: 2.0)
|
|
1474
2031
|
--pause <seconds> Pause duration to split chunks (default: 1.0)
|
|
1475
2032
|
--language <code> Language code, e.g. en, de, fr (default: auto)
|
|
1476
2033
|
-n, --no-newline Join stream chunks without newlines
|
|
1477
|
-
--timestamps <granularity> Add timestamps: segment
|
|
2034
|
+
--timestamps <granularity> Add timestamps: segment or word
|
|
1478
2035
|
--diarize Enable speaker identification
|
|
1479
2036
|
--no-input Fail if config is missing (no wizard)
|
|
1480
2037
|
--no-color Disable colored output
|
|
@@ -1501,6 +2058,7 @@ Examples:
|
|
|
1501
2058
|
dikt --file meeting.wav Transcribe an existing audio file
|
|
1502
2059
|
dikt --file a.wav -o a.json Transcribe to a JSON file
|
|
1503
2060
|
dikt --file a.wav -o a.txt Transcribe to a text file
|
|
2061
|
+
dikt --file https://youtube.com/watch?v=ID Transcribe from URL
|
|
1504
2062
|
dikt --stream --silence 0 Stream continuously until Ctrl+C
|
|
1505
2063
|
dikt --stream -n Stream as continuous flowing text
|
|
1506
2064
|
dikt -q --json --diarize Transcribe with speaker labels
|
|
@@ -1514,13 +2072,13 @@ Environment variables:
|
|
|
1514
2072
|
|
|
1515
2073
|
Exit codes:
|
|
1516
2074
|
0 Success
|
|
1517
|
-
1 Missing dependency (sox)
|
|
2075
|
+
1 Missing dependency (sox/ffmpeg)
|
|
1518
2076
|
2 Not a terminal
|
|
1519
2077
|
3 Configuration error
|
|
1520
2078
|
4 Transcription error
|
|
1521
2079
|
|
|
1522
2080
|
Config: ${CONFIG_DIR}/config.json
|
|
1523
|
-
Requires: sox (
|
|
2081
|
+
Requires: sox (recording), ffmpeg (--file optimization), yt-dlp (URLs, optional)`);
|
|
1524
2082
|
process.exit(EXIT_OK);
|
|
1525
2083
|
}
|
|
1526
2084
|
|
|
@@ -1542,7 +2100,10 @@ Requires: sox (brew install sox)`);
|
|
|
1542
2100
|
|
|
1543
2101
|
applyEnvOverrides(config);
|
|
1544
2102
|
if (flags.language) config.language = flags.language;
|
|
1545
|
-
if (!flags.timestamps && config.timestamps)
|
|
2103
|
+
if (!flags.timestamps && config.timestamps) {
|
|
2104
|
+
// Migrate legacy 'segment,word' → 'word' (combined option removed)
|
|
2105
|
+
flags.timestamps = config.timestamps === 'segment,word' ? 'word' : config.timestamps;
|
|
2106
|
+
}
|
|
1546
2107
|
if (!flags.diarize && config.diarize) flags.diarize = true;
|
|
1547
2108
|
if (flags.output && flags.output.endsWith('.json')) flags.json = true;
|
|
1548
2109
|
|