dikt 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.mjs +661 -151
- package/package.json +4 -2
package/cli.mjs
CHANGED
|
@@ -17,6 +17,9 @@ let DIM = `${ESC}2m`;
|
|
|
17
17
|
let RED = `${ESC}31m`;
|
|
18
18
|
let GREEN = `${ESC}32m`;
|
|
19
19
|
let YELLOW = `${ESC}33m`;
|
|
20
|
+
let BLUE = `${ESC}34m`;
|
|
21
|
+
let MAGENTA = `${ESC}35m`;
|
|
22
|
+
let CYAN = `${ESC}36m`;
|
|
20
23
|
let GREY = `${ESC}90m`;
|
|
21
24
|
let WHITE = `${ESC}37m`;
|
|
22
25
|
let RED_BG = `${ESC}41m`;
|
|
@@ -29,14 +32,14 @@ const ALT_SCREEN_ON = `${ESC}?1049h`;
|
|
|
29
32
|
const ALT_SCREEN_OFF = `${ESC}?1049l`;
|
|
30
33
|
|
|
31
34
|
if (process.env.NO_COLOR != null || process.env.TERM === 'dumb' || process.argv.includes('--no-color')) {
|
|
32
|
-
RESET = BOLD = DIM = RED = GREEN = YELLOW = GREY = WHITE = RED_BG = '';
|
|
35
|
+
RESET = BOLD = DIM = RED = GREEN = YELLOW = BLUE = MAGENTA = CYAN = GREY = WHITE = RED_BG = '';
|
|
33
36
|
}
|
|
34
37
|
|
|
35
38
|
const moveTo = (row, col = 1) => `${ESC}${row};${col}H`;
|
|
36
39
|
|
|
37
40
|
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
38
41
|
|
|
39
|
-
const VERSION = '1.0.2';
|
|
42
|
+
const VERSION = '1.1.0';
|
|
40
43
|
const CONFIG_BASE = process.env.XDG_CONFIG_HOME || path.join(os.homedir(), '.config');
|
|
41
44
|
const CONFIG_DIR = path.join(CONFIG_BASE, 'dikt');
|
|
42
45
|
const CONFIG_FILE = path.join(CONFIG_DIR, 'config.json');
|
|
@@ -88,51 +91,179 @@ function validateConfig(cfg) {
|
|
|
88
91
|
return { valid: errors.length === 0, errors };
|
|
89
92
|
}
|
|
90
93
|
|
|
91
|
-
// ──
|
|
94
|
+
// ── Setup wizard (form-based) ─────────────────────────────────────────────────
|
|
95
|
+
|
|
96
|
+
const TIMESTAMPS_DISPLAY = { '': 'off', 'segment': 'segment', 'word': 'word', 'segment,word': 'both' };
|
|
97
|
+
const TIMESTAMPS_VALUE = { 'off': '', 'segment': 'segment', 'word': 'word', 'both': 'segment,word' };
|
|
98
|
+
|
|
99
|
+
async function setupWizard() {
|
|
100
|
+
const existing = loadConfig() || {};
|
|
101
|
+
|
|
102
|
+
const fields = [
|
|
103
|
+
{ key: 'apiKey', label: 'API key', type: 'secret', value: '', display: existing.apiKey ? '••••' + existing.apiKey.slice(-4) : '', fallback: existing.apiKey || '' },
|
|
104
|
+
{ key: 'model', label: 'Model', type: 'text', value: '', display: existing.model || 'voxtral-mini-latest', fallback: existing.model || 'voxtral-mini-latest' },
|
|
105
|
+
{ key: 'language', label: 'Language', type: 'text', value: '', display: existing.language || 'auto', fallback: existing.language || '' },
|
|
106
|
+
{ key: 'temperature', label: 'Temperature', type: 'text', value: '', display: existing.temperature != null ? String(existing.temperature) : 'default', fallback: existing.temperature != null ? String(existing.temperature) : '' },
|
|
107
|
+
{ key: 'contextBias', label: 'Context bias', type: 'text', value: '', display: existing.contextBias || '', fallback: existing.contextBias || '' },
|
|
108
|
+
{ key: 'timestamps', label: 'Timestamps', type: 'select', options: ['off', 'segment', 'word', 'both'], idx: ['off', 'segment', 'word', 'both'].indexOf(TIMESTAMPS_DISPLAY[existing.timestamps || ''] || 'off') },
|
|
109
|
+
{ key: 'diarize', label: 'Diarize', type: 'select', options: ['off', 'on'], idx: existing.diarize ? 1 : 0 },
|
|
110
|
+
];
|
|
111
|
+
|
|
112
|
+
const LABEL_W = 15; // right-align labels to this width
|
|
113
|
+
let active = 0;
|
|
114
|
+
let editing = false; // true when typing into a text/secret field
|
|
115
|
+
let inputBuf = '';
|
|
116
|
+
|
|
117
|
+
function renderForm() {
|
|
118
|
+
let out = `\x1b[H\x1b[2J`; // move home + clear screen
|
|
119
|
+
out += `\n${BOLD} dikt — setup${RESET}\n`;
|
|
120
|
+
|
|
121
|
+
// Contextual hint
|
|
122
|
+
const f = fields[active];
|
|
123
|
+
if (f.type === 'select') {
|
|
124
|
+
out += ` ${DIM}Tab/arrows to change, Enter to confirm${RESET}\n`;
|
|
125
|
+
} else if (editing) {
|
|
126
|
+
out += ` ${DIM}Type to ${f.type === 'secret' ? 'enter' : 'change'}, Enter to confirm${RESET}\n`;
|
|
127
|
+
} else {
|
|
128
|
+
out += ` ${DIM}Enter to keep default, or start typing to change${RESET}\n`;
|
|
129
|
+
}
|
|
130
|
+
out += '\n';
|
|
131
|
+
|
|
132
|
+
for (let i = 0; i < fields.length; i++) {
|
|
133
|
+
const fi = fields[i];
|
|
134
|
+
const label = fi.label.padStart(LABEL_W);
|
|
135
|
+
const isActive = i === active;
|
|
136
|
+
const marker = isActive ? `${GREEN}>${RESET}` : ' ';
|
|
137
|
+
|
|
138
|
+
if (fi.type === 'select') {
|
|
139
|
+
const parts = fi.options.map((opt, j) => {
|
|
140
|
+
if (isActive) {
|
|
141
|
+
return j === fi.idx ? `${BOLD}${GREEN}${opt}${RESET}` : `${DIM}${opt}${RESET}`;
|
|
142
|
+
}
|
|
143
|
+
return j === fi.idx ? opt : `${DIM}${opt}${RESET}`;
|
|
144
|
+
});
|
|
145
|
+
out += `${marker} ${isActive ? BOLD : DIM}${label}${RESET} ${parts.join(' ')}\n`;
|
|
146
|
+
} else {
|
|
147
|
+
let valueStr;
|
|
148
|
+
if (isActive && editing) {
|
|
149
|
+
valueStr = fi.type === 'secret'
|
|
150
|
+
? `${GREEN}${'•'.repeat(inputBuf.length)}${RESET}█`
|
|
151
|
+
: `${GREEN}${inputBuf}${RESET}█`;
|
|
152
|
+
} else if (isActive && !editing) {
|
|
153
|
+
valueStr = `${DIM}${fi.display}${RESET}`;
|
|
154
|
+
} else {
|
|
155
|
+
// Show confirmed value or default
|
|
156
|
+
const show = fi.value || fi.display;
|
|
157
|
+
valueStr = fi.value
|
|
158
|
+
? (fi.type === 'secret' ? '••••' + fi.value.slice(-4) : fi.value)
|
|
159
|
+
: `${DIM}${show}${RESET}`;
|
|
160
|
+
}
|
|
161
|
+
out += `${marker} ${isActive ? BOLD : DIM}${label}${RESET} ${valueStr}\n`;
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
process.stderr.write(out);
|
|
166
|
+
}
|
|
92
167
|
|
|
93
|
-
function readSecret(prompt) {
|
|
94
168
|
return new Promise((resolve) => {
|
|
95
|
-
process.stderr.write(prompt);
|
|
96
169
|
const { stdin } = process;
|
|
97
170
|
stdin.setRawMode(true);
|
|
98
171
|
stdin.resume();
|
|
99
172
|
stdin.setEncoding('utf8');
|
|
100
173
|
|
|
101
|
-
|
|
174
|
+
renderForm();
|
|
102
175
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
176
|
+
function advance() {
|
|
177
|
+
const f = fields[active];
|
|
178
|
+
// Commit text/secret field value
|
|
179
|
+
if (f.type !== 'select') {
|
|
180
|
+
if (inputBuf.trim()) {
|
|
181
|
+
f.value = inputBuf.trim();
|
|
182
|
+
} else {
|
|
183
|
+
f.value = f.fallback;
|
|
184
|
+
}
|
|
185
|
+
// Validate API key
|
|
186
|
+
if (f.key === 'apiKey' && !f.value) {
|
|
187
|
+
editing = false;
|
|
188
|
+
inputBuf = '';
|
|
189
|
+
renderForm();
|
|
190
|
+
process.stderr.write(`\n ${RED}API key is required.${RESET}\n`);
|
|
191
|
+
return; // stay on this field
|
|
192
|
+
}
|
|
193
|
+
editing = false;
|
|
194
|
+
inputBuf = '';
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
active++;
|
|
198
|
+
if (active >= fields.length) {
|
|
199
|
+
// Save and exit
|
|
200
|
+
stdin.removeListener('data', onData);
|
|
201
|
+
stdin.setRawMode(false);
|
|
202
|
+
stdin.pause();
|
|
203
|
+
|
|
204
|
+
const ts = fields.find(f => f.key === 'timestamps');
|
|
205
|
+
const di = fields.find(f => f.key === 'diarize');
|
|
206
|
+
const tsValue = TIMESTAMPS_VALUE[ts.options[ts.idx]];
|
|
207
|
+
const diValue = di.options[di.idx] === 'on';
|
|
208
|
+
|
|
209
|
+
const lang = fields.find(f => f.key === 'language').value;
|
|
210
|
+
const tempVal = fields.find(f => f.key === 'temperature').value;
|
|
211
|
+
|
|
212
|
+
const cfg = {
|
|
213
|
+
apiKey: fields.find(f => f.key === 'apiKey').value,
|
|
214
|
+
model: fields.find(f => f.key === 'model').value,
|
|
215
|
+
language: lang === 'auto' ? '' : lang,
|
|
216
|
+
temperature: tempVal && tempVal !== 'default' ? parseFloat(tempVal) : null,
|
|
217
|
+
contextBias: fields.find(f => f.key === 'contextBias').value,
|
|
218
|
+
autoCopy: existing.autoCopy || false,
|
|
219
|
+
timestamps: tsValue,
|
|
220
|
+
diarize: diValue,
|
|
221
|
+
};
|
|
222
|
+
saveConfig(cfg);
|
|
223
|
+
process.stderr.write(`\n ${GREEN}✓${RESET} Saved to ${DIM}${CONFIG_FILE}${RESET}\n\n`);
|
|
224
|
+
resolve(cfg);
|
|
225
|
+
return;
|
|
226
|
+
}
|
|
227
|
+
renderForm();
|
|
228
|
+
}
|
|
108
229
|
|
|
109
230
|
const onData = (ch) => {
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
231
|
+
const f = fields[active];
|
|
232
|
+
|
|
233
|
+
// Ctrl+C — exit
|
|
234
|
+
if (ch === '\u0003') {
|
|
235
|
+
stdin.removeListener('data', onData);
|
|
236
|
+
stdin.setRawMode(false);
|
|
237
|
+
stdin.pause();
|
|
238
|
+
process.stderr.write('\n');
|
|
239
|
+
process.exit(EXIT_CONFIG);
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
if (f.type === 'select') {
|
|
243
|
+
if (ch === '\t' || ch === '\x1b[C' || ch === '\x1b[B') { // Tab, Right, Down
|
|
244
|
+
f.idx = (f.idx + 1) % f.options.length;
|
|
245
|
+
renderForm();
|
|
246
|
+
} else if (ch === '\x1b[D' || ch === '\x1b[A') { // Left, Up
|
|
247
|
+
f.idx = (f.idx - 1 + f.options.length) % f.options.length;
|
|
248
|
+
renderForm();
|
|
249
|
+
} else if (ch === '\n' || ch === '\r') {
|
|
250
|
+
advance();
|
|
251
|
+
}
|
|
252
|
+
} else {
|
|
253
|
+
// text / secret field
|
|
254
|
+
if (ch === '\n' || ch === '\r') {
|
|
255
|
+
advance();
|
|
256
|
+
} else if (ch === '\u007F' || ch === '\b') { // Backspace
|
|
257
|
+
if (inputBuf.length > 0) {
|
|
258
|
+
inputBuf = inputBuf.slice(0, -1);
|
|
259
|
+
if (!inputBuf) editing = false;
|
|
260
|
+
renderForm();
|
|
134
261
|
}
|
|
135
|
-
|
|
262
|
+
} else if (ch.charCodeAt(0) >= 32 && !ch.startsWith('\x1b')) {
|
|
263
|
+
if (!editing) editing = true;
|
|
264
|
+
inputBuf += ch;
|
|
265
|
+
renderForm();
|
|
266
|
+
}
|
|
136
267
|
}
|
|
137
268
|
};
|
|
138
269
|
|
|
@@ -140,41 +271,6 @@ function readSecret(prompt) {
|
|
|
140
271
|
});
|
|
141
272
|
}
|
|
142
273
|
|
|
143
|
-
// ── Setup wizard ──────────────────────────────────────────────────────────────
|
|
144
|
-
|
|
145
|
-
async function setupWizard() {
|
|
146
|
-
const existing = loadConfig() || {};
|
|
147
|
-
|
|
148
|
-
process.stderr.write(`\n${BOLD} dikt — setup${RESET}\n`);
|
|
149
|
-
process.stderr.write(` ${DIM}Press Enter to keep the default shown in brackets.${RESET}\n\n`);
|
|
150
|
-
|
|
151
|
-
const apiKey = (await readSecret(` Mistral API key [${existing.apiKey ? '••••' + existing.apiKey.slice(-4) : ''}]: `)).trim()
|
|
152
|
-
|| existing.apiKey || '';
|
|
153
|
-
if (!apiKey) {
|
|
154
|
-
process.stderr.write(`\n ${RED}API key is required.${RESET}\n\n`);
|
|
155
|
-
process.exit(EXIT_CONFIG);
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
const rl = readline.createInterface({ input: process.stdin, output: process.stderr });
|
|
159
|
-
const ask = (q) => new Promise((res) => rl.question(q, res));
|
|
160
|
-
|
|
161
|
-
const model = (await ask(` Model [${existing.model || 'voxtral-mini-latest'}]: `)).trim()
|
|
162
|
-
|| existing.model || 'voxtral-mini-latest';
|
|
163
|
-
const language = (await ask(` Language [${existing.language || 'auto'}]: `)).trim()
|
|
164
|
-
|| existing.language || '';
|
|
165
|
-
const tempStr = (await ask(` Temperature [${existing.temperature ?? 'default'}]: `)).trim();
|
|
166
|
-
const temperature = tempStr ? parseFloat(tempStr) : (existing.temperature ?? null);
|
|
167
|
-
const contextBias = (await ask(` Context bias [${existing.contextBias || ''}]: `)).trim()
|
|
168
|
-
|| existing.contextBias || '';
|
|
169
|
-
|
|
170
|
-
rl.close();
|
|
171
|
-
|
|
172
|
-
const cfg = { apiKey, model, language: language === 'auto' ? '' : language, temperature, contextBias, autoCopy: existing.autoCopy || false };
|
|
173
|
-
saveConfig(cfg);
|
|
174
|
-
process.stderr.write(`\n ${GREEN}✓${RESET} Saved to ${DIM}${CONFIG_FILE}${RESET}\n\n`);
|
|
175
|
-
return cfg;
|
|
176
|
-
}
|
|
177
|
-
|
|
178
274
|
// ── Prerequisites ─────────────────────────────────────────────────────────────
|
|
179
275
|
|
|
180
276
|
function checkSox() {
|
|
@@ -236,8 +332,13 @@ function getTermWidth() {
|
|
|
236
332
|
function render() {
|
|
237
333
|
const w = getTermWidth();
|
|
238
334
|
const header = ` dikt`;
|
|
239
|
-
const
|
|
240
|
-
|
|
335
|
+
const tags = [];
|
|
336
|
+
if (config.diarize) tags.push('diarize');
|
|
337
|
+
if (config.timestamps) tags.push('timestamps');
|
|
338
|
+
const tagStr = tags.length ? ` ${DIM}${tags.join(' · ')}${RESET}` : '';
|
|
339
|
+
const tagPlain = tags.length ? ` ${tags.join(' · ')}` : '';
|
|
340
|
+
const right = `[s]etup [?] [q]uit `;
|
|
341
|
+
const pad = Math.max(0, w - header.length - tagPlain.length - right.length);
|
|
241
342
|
|
|
242
343
|
let out = moveTo(1);
|
|
243
344
|
|
|
@@ -251,7 +352,7 @@ function render() {
|
|
|
251
352
|
out += CLEAR_LINE + '\n';
|
|
252
353
|
out += renderHelp();
|
|
253
354
|
} else {
|
|
254
|
-
out += CLEAR_LINE + BOLD + header + ' '.repeat(pad) + DIM + right + RESET + '\n';
|
|
355
|
+
out += CLEAR_LINE + BOLD + header + RESET + tagStr + ' '.repeat(pad) + DIM + right + RESET + '\n';
|
|
255
356
|
out += CLEAR_LINE + ` ${'─'.repeat(Math.max(0, w - 2))}` + '\n';
|
|
256
357
|
out += CLEAR_LINE + '\n';
|
|
257
358
|
out += CLEAR_LINE + renderKeybar() + '\n';
|
|
@@ -270,7 +371,9 @@ function render() {
|
|
|
270
371
|
const rows = process.stdout.rows || 24;
|
|
271
372
|
const availableRows = rows - 9; // header(2) + blank + keybar + blank + status + blank + meta + cleardown
|
|
272
373
|
if (availableRows > 0 && lines.length > availableRows) {
|
|
273
|
-
|
|
374
|
+
const hidden = lines.length - availableRows + 1; // +1 to make room for the hint
|
|
375
|
+
lines = lines.slice(lines.length - availableRows + 1);
|
|
376
|
+
lines.unshift(` ${DIM}↑ ${hidden} more line${hidden === 1 ? '' : 's'} above${RESET}`);
|
|
274
377
|
}
|
|
275
378
|
for (const line of lines) {
|
|
276
379
|
out += CLEAR_LINE + line + '\n';
|
|
@@ -326,6 +429,46 @@ function wrapTranscript(termWidth) {
|
|
|
326
429
|
if (!text) return [];
|
|
327
430
|
const indent = ' ';
|
|
328
431
|
const maxLen = termWidth - indent.length - 1; // leave 1 col margin
|
|
432
|
+
|
|
433
|
+
// Diarized transcript: each line is already formatted with speaker labels + ANSI colors.
|
|
434
|
+
// Handle each speaker line independently — no quotes, just indent and wrap.
|
|
435
|
+
if (config.diarize && text.includes('\n')) {
|
|
436
|
+
const result = [];
|
|
437
|
+
for (const speakerLine of text.split('\n')) {
|
|
438
|
+
if (!speakerLine) continue;
|
|
439
|
+
// ANSI codes mess up length calculation — strip them for measuring
|
|
440
|
+
const plain = speakerLine.replace(/\x1b\[[0-9;]*m/g, '');
|
|
441
|
+
if (plain.length <= maxLen || maxLen < 10) {
|
|
442
|
+
result.push(`${indent}${speakerLine}`);
|
|
443
|
+
} else {
|
|
444
|
+
// Wrap long speaker lines: first line keeps the label, continuation lines get extra indent
|
|
445
|
+
const labelMatch = plain.match(/^([A-Z]\s{2})/);
|
|
446
|
+
const contIndent = labelMatch ? ' '.repeat(labelMatch[1].length) : '';
|
|
447
|
+
const words = speakerLine.split(/(\s+)/);
|
|
448
|
+
let cur = '';
|
|
449
|
+
let curPlain = '';
|
|
450
|
+
let first = true;
|
|
451
|
+
for (const word of words) {
|
|
452
|
+
const wordPlain = word.replace(/\x1b\[[0-9;]*m/g, '');
|
|
453
|
+
if (curPlain.length + wordPlain.length > maxLen && curPlain.length > 0) {
|
|
454
|
+
result.push(`${indent}${cur}`);
|
|
455
|
+
cur = first ? contIndent : '';
|
|
456
|
+
curPlain = first ? contIndent : '';
|
|
457
|
+
first = false;
|
|
458
|
+
const trimmed = word.replace(/^\s+/, '');
|
|
459
|
+
cur += trimmed;
|
|
460
|
+
curPlain += trimmed.replace(/\x1b\[[0-9;]*m/g, '');
|
|
461
|
+
} else {
|
|
462
|
+
cur += word;
|
|
463
|
+
curPlain += wordPlain;
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
if (cur) result.push(`${indent}${first ? '' : contIndent}${cur}`);
|
|
467
|
+
}
|
|
468
|
+
}
|
|
469
|
+
return result;
|
|
470
|
+
}
|
|
471
|
+
|
|
329
472
|
if (maxLen < 10) return [`${indent}${text}`];
|
|
330
473
|
|
|
331
474
|
const words = text.split(/(\s+)/);
|
|
@@ -564,60 +707,40 @@ async function transcribe(wavPath) {
|
|
|
564
707
|
try {
|
|
565
708
|
const blob = await fs.openAsBlob(wavPath, { type: 'audio/wav' });
|
|
566
709
|
const file = new File([blob], 'recording.wav', { type: 'audio/wav' });
|
|
567
|
-
const fd = new FormData();
|
|
568
|
-
fd.append('file', file);
|
|
569
|
-
fd.append('model', config.model);
|
|
570
|
-
if (config.language) fd.append('language', config.language);
|
|
571
|
-
if (config.temperature != null) fd.append('temperature', String(config.temperature));
|
|
572
|
-
if (config.contextBias) fd.append('context_bias', config.contextBias);
|
|
573
710
|
|
|
574
711
|
const t0 = Date.now();
|
|
575
|
-
const
|
|
576
|
-
method: 'POST',
|
|
577
|
-
headers: { Authorization: `Bearer ${config.apiKey}` },
|
|
578
|
-
body: fd,
|
|
712
|
+
const result = await callTranscribeAPI(file, {
|
|
579
713
|
signal: AbortSignal.timeout(30_000),
|
|
714
|
+
timestamps: config.timestamps || '',
|
|
715
|
+
diarize: config.diarize || false,
|
|
580
716
|
});
|
|
581
717
|
state.latency = Date.now() - t0;
|
|
582
718
|
|
|
583
|
-
|
|
584
|
-
const raw = await resp.text().catch(() => '');
|
|
585
|
-
let msg;
|
|
586
|
-
try {
|
|
587
|
-
const e = JSON.parse(raw);
|
|
588
|
-
msg = e.message;
|
|
589
|
-
if (!msg && Array.isArray(e.detail)) {
|
|
590
|
-
msg = e.detail.map(d => [d.loc?.join('.'), d.msg].filter(Boolean).join(': ')).join('; ');
|
|
591
|
-
} else if (!msg && e.detail) {
|
|
592
|
-
msg = typeof e.detail === 'string' ? e.detail : JSON.stringify(e.detail);
|
|
593
|
-
}
|
|
594
|
-
if (!msg) msg = raw;
|
|
595
|
-
} catch {
|
|
596
|
-
msg = raw || `HTTP ${resp.status}`;
|
|
597
|
-
}
|
|
598
|
-
if (resp.status === 401) msg += ' — press [s] to reconfigure';
|
|
599
|
-
throw new Error(msg);
|
|
600
|
-
}
|
|
601
|
-
|
|
602
|
-
const data = await resp.json();
|
|
603
|
-
const text = (data.text || '').trim();
|
|
719
|
+
const text = result.text;
|
|
604
720
|
|
|
605
721
|
if (!text) {
|
|
606
722
|
state.mode = 'error';
|
|
607
723
|
state.error = 'No speech detected';
|
|
608
724
|
} else {
|
|
609
|
-
|
|
725
|
+
// Format with speaker labels if diarization is active
|
|
726
|
+
if (config.diarize && result.segments) {
|
|
727
|
+
state.transcript = formatDiarizedText(result.segments, { color: true });
|
|
728
|
+
} else {
|
|
729
|
+
state.transcript = text;
|
|
730
|
+
}
|
|
610
731
|
state.wordCount = text.split(/\s+/).filter(Boolean).length;
|
|
611
732
|
state.mode = 'ready';
|
|
612
733
|
|
|
613
734
|
// Push to history
|
|
614
|
-
state.history.unshift({ transcript:
|
|
735
|
+
state.history.unshift({ transcript: state.transcript, wordCount: state.wordCount, duration: state.duration, latency: state.latency });
|
|
615
736
|
if (state.history.length > MAX_HISTORY) state.history.pop();
|
|
616
737
|
state.historyIndex = -1;
|
|
617
738
|
}
|
|
618
739
|
} catch (err) {
|
|
619
740
|
state.mode = 'error';
|
|
620
|
-
|
|
741
|
+
let msg = err.name === 'TimeoutError' ? 'Transcription timed out' : err.message;
|
|
742
|
+
if (err.status === 401) msg += ' — press [s] to reconfigure';
|
|
743
|
+
state.error = msg;
|
|
621
744
|
} finally {
|
|
622
745
|
clearInterval(state.spinnerInterval);
|
|
623
746
|
cleanupRecFile();
|
|
@@ -746,29 +869,265 @@ async function runSetup() {
|
|
|
746
869
|
renderAll();
|
|
747
870
|
}
|
|
748
871
|
|
|
872
|
+
// ── Audio helpers ─────────────────────────────────────────────────────────────
|
|
873
|
+
|
|
874
|
+
const SILENCE_THRESHOLD = Math.round(32768 * 0.01); // 1% of max 16-bit amplitude
|
|
875
|
+
|
|
876
|
+
function createWavHeader(dataSize) {
|
|
877
|
+
const buf = Buffer.alloc(44);
|
|
878
|
+
buf.write('RIFF', 0);
|
|
879
|
+
buf.writeUInt32LE(36 + dataSize, 4);
|
|
880
|
+
buf.write('WAVE', 8);
|
|
881
|
+
buf.write('fmt ', 12);
|
|
882
|
+
buf.writeUInt32LE(16, 16);
|
|
883
|
+
buf.writeUInt16LE(1, 20); // PCM
|
|
884
|
+
buf.writeUInt16LE(1, 22); // mono
|
|
885
|
+
buf.writeUInt32LE(16000, 24); // sample rate
|
|
886
|
+
buf.writeUInt32LE(32000, 28); // byte rate (16000 * 1 * 2)
|
|
887
|
+
buf.writeUInt16LE(2, 32); // block align
|
|
888
|
+
buf.writeUInt16LE(16, 34); // bits per sample
|
|
889
|
+
buf.write('data', 36);
|
|
890
|
+
buf.writeUInt32LE(dataSize, 40);
|
|
891
|
+
return buf;
|
|
892
|
+
}
|
|
893
|
+
|
|
894
|
+
function peakAmplitude(chunk) {
|
|
895
|
+
let peak = 0;
|
|
896
|
+
for (let i = 0; i < chunk.length - 1; i += 2) {
|
|
897
|
+
const abs = Math.abs(chunk.readInt16LE(i));
|
|
898
|
+
if (abs > peak) peak = abs;
|
|
899
|
+
}
|
|
900
|
+
return peak;
|
|
901
|
+
}
|
|
902
|
+
|
|
903
|
+
function trimSilence(rawData) {
|
|
904
|
+
const SAMPLE_RATE = 16000;
|
|
905
|
+
const BYTES_PER_SAMPLE = 2;
|
|
906
|
+
const WINDOW_SAMPLES = Math.round(SAMPLE_RATE * 0.05); // 50ms windows
|
|
907
|
+
const WINDOW_BYTES = WINDOW_SAMPLES * BYTES_PER_SAMPLE;
|
|
908
|
+
const MAX_SILENCE_WINDOWS = Math.round(1.0 / 0.05); // 1 second = 20 windows
|
|
909
|
+
const PAD_WINDOWS = Math.round(0.1 / 0.05); // 100ms padding = 2 windows
|
|
910
|
+
|
|
911
|
+
const windows = [];
|
|
912
|
+
for (let offset = 0; offset + WINDOW_BYTES <= rawData.length; offset += WINDOW_BYTES) {
|
|
913
|
+
windows.push(rawData.subarray(offset, offset + WINDOW_BYTES));
|
|
914
|
+
}
|
|
915
|
+
// Include any trailing partial window
|
|
916
|
+
const remainder = rawData.length % WINDOW_BYTES;
|
|
917
|
+
if (remainder > 0) {
|
|
918
|
+
windows.push(rawData.subarray(rawData.length - remainder));
|
|
919
|
+
}
|
|
920
|
+
|
|
921
|
+
const output = [];
|
|
922
|
+
let silentCount = 0;
|
|
923
|
+
|
|
924
|
+
for (const win of windows) {
|
|
925
|
+
const peak = peakAmplitude(win);
|
|
926
|
+
if (peak < SILENCE_THRESHOLD) {
|
|
927
|
+
silentCount++;
|
|
928
|
+
if (silentCount <= MAX_SILENCE_WINDOWS) {
|
|
929
|
+
output.push(win);
|
|
930
|
+
} else if (silentCount === MAX_SILENCE_WINDOWS + 1) {
|
|
931
|
+
// Replace excess silence with padding
|
|
932
|
+
const padBytes = PAD_WINDOWS * WINDOW_BYTES;
|
|
933
|
+
output.push(Buffer.alloc(padBytes)); // zeros = silence
|
|
934
|
+
}
|
|
935
|
+
// else: skip (already added padding)
|
|
936
|
+
} else {
|
|
937
|
+
silentCount = 0;
|
|
938
|
+
output.push(win);
|
|
939
|
+
}
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
return Buffer.concat(output);
|
|
943
|
+
}
|
|
944
|
+
|
|
945
|
+
async function callTranscribeAPI(file, { signal, timestamps, diarize } = {}) {
|
|
946
|
+
const fd = new FormData();
|
|
947
|
+
fd.append('file', file);
|
|
948
|
+
fd.append('model', config.model);
|
|
949
|
+
if (config.language) fd.append('language', config.language);
|
|
950
|
+
if (config.temperature != null) fd.append('temperature', String(config.temperature));
|
|
951
|
+
if (config.contextBias) fd.append('context_bias', config.contextBias);
|
|
952
|
+
if (timestamps) {
|
|
953
|
+
for (const g of timestamps.split(',')) fd.append('timestamp_granularities[]', g.trim());
|
|
954
|
+
}
|
|
955
|
+
if (diarize) {
|
|
956
|
+
fd.append('diarize', 'true');
|
|
957
|
+
// API requires segment timestamps when diarize is enabled
|
|
958
|
+
if (!timestamps) fd.append('timestamp_granularities[]', 'segment');
|
|
959
|
+
}
|
|
960
|
+
|
|
961
|
+
const t0 = Date.now();
|
|
962
|
+
const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
|
|
963
|
+
method: 'POST',
|
|
964
|
+
headers: { Authorization: `Bearer ${config.apiKey}` },
|
|
965
|
+
body: fd,
|
|
966
|
+
signal: signal || AbortSignal.timeout(30_000),
|
|
967
|
+
});
|
|
968
|
+
const latency = Date.now() - t0;
|
|
969
|
+
|
|
970
|
+
if (!resp.ok) {
|
|
971
|
+
const raw = await resp.text().catch(() => '');
|
|
972
|
+
let msg;
|
|
973
|
+
try {
|
|
974
|
+
const e = JSON.parse(raw);
|
|
975
|
+
msg = e.message;
|
|
976
|
+
if (typeof msg === 'object' && msg !== null) msg = JSON.stringify(msg);
|
|
977
|
+
if (!msg && Array.isArray(e.detail)) {
|
|
978
|
+
msg = e.detail.map(d => [d.loc?.join('.'), d.msg].filter(Boolean).join(': ')).join('; ');
|
|
979
|
+
} else if (!msg && e.detail) {
|
|
980
|
+
msg = typeof e.detail === 'string' ? e.detail : JSON.stringify(e.detail);
|
|
981
|
+
}
|
|
982
|
+
if (!msg) msg = raw;
|
|
983
|
+
} catch {
|
|
984
|
+
msg = raw || `HTTP ${resp.status}`;
|
|
985
|
+
}
|
|
986
|
+
const err = new Error(msg);
|
|
987
|
+
err.status = resp.status;
|
|
988
|
+
throw err;
|
|
989
|
+
}
|
|
990
|
+
|
|
991
|
+
const data = await resp.json();
|
|
992
|
+
const text = (data.text || '').trim();
|
|
993
|
+
return { text, latency, segments: data.segments, words: data.words };
|
|
994
|
+
}
|
|
995
|
+
|
|
996
|
+
async function transcribeBuffer(rawChunks, { signal, timestamps, diarize } = {}) {
|
|
997
|
+
const rawData = Buffer.concat(rawChunks);
|
|
998
|
+
const trimmed = trimSilence(rawData);
|
|
999
|
+
const wavData = Buffer.concat([createWavHeader(trimmed.length), trimmed]);
|
|
1000
|
+
const blob = new Blob([wavData], { type: 'audio/wav' });
|
|
1001
|
+
const file = new File([blob], 'recording.wav', { type: 'audio/wav' });
|
|
1002
|
+
return callTranscribeAPI(file, { signal, timestamps, diarize });
|
|
1003
|
+
}
|
|
1004
|
+
|
|
1005
|
+
// ── Output formatting helpers ─────────────────────────────────────────────────
|
|
1006
|
+
|
|
1007
|
+
const SPEAKER_COLORS = [GREEN, YELLOW, CYAN, MAGENTA, BLUE, RED];
|
|
1008
|
+
|
|
1009
|
+
function formatDiarizedText(segments, { color = false } = {}) {
|
|
1010
|
+
if (!segments || !segments.length) return '';
|
|
1011
|
+
|
|
1012
|
+
// Map speaker IDs to short letters (A, B, C, ...)
|
|
1013
|
+
const speakerMap = new Map();
|
|
1014
|
+
for (const s of segments) {
|
|
1015
|
+
if (s.speaker_id != null && !speakerMap.has(s.speaker_id)) {
|
|
1016
|
+
speakerMap.set(s.speaker_id, speakerMap.size);
|
|
1017
|
+
}
|
|
1018
|
+
}
|
|
1019
|
+
|
|
1020
|
+
// Merge consecutive segments from the same speaker
|
|
1021
|
+
const merged = [];
|
|
1022
|
+
for (const s of segments) {
|
|
1023
|
+
const text = (s.text || '').trim();
|
|
1024
|
+
if (!text) continue;
|
|
1025
|
+
const last = merged[merged.length - 1];
|
|
1026
|
+
if (last && last.speaker_id === s.speaker_id) {
|
|
1027
|
+
last.text += ' ' + text;
|
|
1028
|
+
} else {
|
|
1029
|
+
merged.push({ speaker_id: s.speaker_id, text });
|
|
1030
|
+
}
|
|
1031
|
+
}
|
|
1032
|
+
|
|
1033
|
+
return merged.map(s => {
|
|
1034
|
+
const idx = speakerMap.get(s.speaker_id) ?? 0;
|
|
1035
|
+
const letter = String.fromCharCode(65 + idx); // A, B, C, ...
|
|
1036
|
+
if (color) {
|
|
1037
|
+
const c = SPEAKER_COLORS[idx % SPEAKER_COLORS.length];
|
|
1038
|
+
return `${c}${BOLD}${letter}${RESET} ${s.text}`;
|
|
1039
|
+
}
|
|
1040
|
+
return `${letter} ${s.text}`;
|
|
1041
|
+
}).join('\n');
|
|
1042
|
+
}
|
|
1043
|
+
|
|
1044
|
+
function buildJsonOutput(base, { segments, words, timestamps, diarize } = {}) {
|
|
1045
|
+
const out = { ...base, timestamp: new Date().toISOString() };
|
|
1046
|
+
if ((timestamps || diarize) && segments) out.segments = segments;
|
|
1047
|
+
if (timestamps && words) out.words = words;
|
|
1048
|
+
return out;
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
// ── File mode ────────────────────────────────────────────────────────────────
|
|
1052
|
+
|
|
1053
|
+
async function runFile(flags) {
|
|
1054
|
+
try {
|
|
1055
|
+
if (!flags.file || !fs.existsSync(flags.file)) {
|
|
1056
|
+
process.stderr.write(`Error: file not found: ${flags.file}\n`);
|
|
1057
|
+
return EXIT_TRANSCRIPTION;
|
|
1058
|
+
}
|
|
1059
|
+
|
|
1060
|
+
const blob = await fs.openAsBlob(flags.file);
|
|
1061
|
+
const ext = path.extname(flags.file).slice(1) || 'wav';
|
|
1062
|
+
const mimeTypes = { wav: 'audio/wav', mp3: 'audio/mpeg', m4a: 'audio/mp4', ogg: 'audio/ogg', flac: 'audio/flac', webm: 'audio/webm' };
|
|
1063
|
+
const mime = mimeTypes[ext] || 'audio/wav';
|
|
1064
|
+
const file = new File([blob], path.basename(flags.file), { type: mime });
|
|
1065
|
+
|
|
1066
|
+
const result = await callTranscribeAPI(file, { timestamps: flags.timestamps, diarize: flags.diarize });
|
|
1067
|
+
|
|
1068
|
+
if (!result.text) {
|
|
1069
|
+
process.stderr.write('No speech detected\n');
|
|
1070
|
+
return EXIT_TRANSCRIPTION;
|
|
1071
|
+
}
|
|
1072
|
+
|
|
1073
|
+
const wordCount = result.text.split(/\s+/).filter(Boolean).length;
|
|
1074
|
+
|
|
1075
|
+
if (flags.json) {
|
|
1076
|
+
const out = buildJsonOutput(
|
|
1077
|
+
{ text: result.text, latency: result.latency, words: wordCount },
|
|
1078
|
+
{ segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
|
|
1079
|
+
);
|
|
1080
|
+
process.stdout.write(JSON.stringify(out) + '\n');
|
|
1081
|
+
} else if (flags.diarize && result.segments) {
|
|
1082
|
+
process.stdout.write(formatDiarizedText(result.segments) + '\n');
|
|
1083
|
+
} else {
|
|
1084
|
+
process.stdout.write(result.text + '\n');
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
return EXIT_OK;
|
|
1088
|
+
} catch (err) {
|
|
1089
|
+
process.stderr.write(`Error: ${err.message}\n`);
|
|
1090
|
+
return EXIT_TRANSCRIPTION;
|
|
1091
|
+
}
|
|
1092
|
+
}
|
|
1093
|
+
|
|
749
1094
|
// ── Single-shot mode ──────────────────────────────────────────────────────────
|
|
750
1095
|
|
|
751
1096
|
async function runOnce(flags) {
|
|
752
|
-
const recFile = path.join(os.tmpdir(), `dikt-${Date.now()}.wav`);
|
|
753
|
-
|
|
754
1097
|
try {
|
|
755
|
-
// Record
|
|
1098
|
+
// Record raw PCM to stdout — silence detection handled in Node.js
|
|
756
1099
|
const recProc = spawn('rec', [
|
|
757
|
-
'-q', '-r', '16000', '-c', '1', '-b', '16',
|
|
758
|
-
recFile,
|
|
759
|
-
'silence', '1', '0.1', '1%', '1', '2.0', '1%',
|
|
1100
|
+
'-q', '-r', '16000', '-c', '1', '-b', '16', '-t', 'raw', '-',
|
|
760
1101
|
], {
|
|
761
|
-
stdio: ['ignore', '
|
|
1102
|
+
stdio: ['ignore', 'pipe', 'pipe'],
|
|
762
1103
|
});
|
|
763
1104
|
|
|
764
1105
|
recProc.stderr.on('data', () => {});
|
|
765
1106
|
|
|
766
|
-
// Ctrl+C stops recording gracefully
|
|
767
1107
|
const sigHandler = () => recProc.kill('SIGTERM');
|
|
768
1108
|
process.on('SIGINT', sigHandler);
|
|
769
1109
|
|
|
1110
|
+
const chunks = [];
|
|
1111
|
+
let heardSound = false;
|
|
1112
|
+
let lastSoundTime = Date.now();
|
|
770
1113
|
const recStart = Date.now();
|
|
1114
|
+
|
|
1115
|
+
recProc.stdout.on('data', (chunk) => {
|
|
1116
|
+
chunks.push(chunk);
|
|
1117
|
+
if (peakAmplitude(chunk) > SILENCE_THRESHOLD) {
|
|
1118
|
+
heardSound = true;
|
|
1119
|
+
lastSoundTime = Date.now();
|
|
1120
|
+
}
|
|
1121
|
+
});
|
|
1122
|
+
|
|
1123
|
+
const silenceTimer = setInterval(() => {
|
|
1124
|
+
if (flags.silence > 0 && heardSound && Date.now() - lastSoundTime > flags.silence * 1000) {
|
|
1125
|
+
recProc.kill('SIGTERM');
|
|
1126
|
+
}
|
|
1127
|
+
}, 100);
|
|
1128
|
+
|
|
771
1129
|
await new Promise((resolve) => recProc.on('close', resolve));
|
|
1130
|
+
clearInterval(silenceTimer);
|
|
772
1131
|
process.removeListener('SIGINT', sigHandler);
|
|
773
1132
|
const duration = (Date.now() - recStart) / 1000;
|
|
774
1133
|
|
|
@@ -782,45 +1141,26 @@ async function runOnce(flags) {
|
|
|
782
1141
|
const abortHandler = () => ac.abort();
|
|
783
1142
|
process.on('SIGINT', abortHandler);
|
|
784
1143
|
|
|
785
|
-
const
|
|
786
|
-
const file = new File([blob], 'recording.wav', { type: 'audio/wav' });
|
|
787
|
-
const fd = new FormData();
|
|
788
|
-
fd.append('file', file);
|
|
789
|
-
fd.append('model', config.model);
|
|
790
|
-
if (config.language) fd.append('language', config.language);
|
|
791
|
-
if (config.temperature != null) fd.append('temperature', String(config.temperature));
|
|
792
|
-
if (config.contextBias) fd.append('context_bias', config.contextBias);
|
|
793
|
-
|
|
794
|
-
const t0 = Date.now();
|
|
795
|
-
const resp = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
|
|
796
|
-
method: 'POST',
|
|
797
|
-
headers: { Authorization: `Bearer ${config.apiKey}` },
|
|
798
|
-
body: fd,
|
|
799
|
-
signal: ac.signal,
|
|
800
|
-
});
|
|
801
|
-
const latency = Date.now() - t0;
|
|
1144
|
+
const result = await transcribeBuffer(chunks, { signal: ac.signal, timestamps: flags.timestamps, diarize: flags.diarize });
|
|
802
1145
|
process.removeListener('SIGINT', abortHandler);
|
|
803
1146
|
|
|
804
|
-
if (!resp.ok) {
|
|
805
|
-
const raw = await resp.text().catch(() => '');
|
|
806
|
-
process.stderr.write(`Error: ${raw || `HTTP ${resp.status}`}\n`);
|
|
807
|
-
return EXIT_TRANSCRIPTION;
|
|
808
|
-
}
|
|
809
|
-
|
|
810
|
-
const data = await resp.json();
|
|
811
|
-
const text = (data.text || '').trim();
|
|
812
|
-
|
|
813
|
-
if (!text) {
|
|
1147
|
+
if (!result.text) {
|
|
814
1148
|
process.stderr.write('No speech detected\n');
|
|
815
1149
|
return EXIT_TRANSCRIPTION;
|
|
816
1150
|
}
|
|
817
1151
|
|
|
818
|
-
const wordCount = text.split(/\s+/).filter(Boolean).length;
|
|
1152
|
+
const wordCount = result.text.split(/\s+/).filter(Boolean).length;
|
|
819
1153
|
|
|
820
1154
|
if (flags.json) {
|
|
821
|
-
|
|
1155
|
+
const out = buildJsonOutput(
|
|
1156
|
+
{ text: result.text, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
|
|
1157
|
+
{ segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
|
|
1158
|
+
);
|
|
1159
|
+
process.stdout.write(JSON.stringify(out) + '\n');
|
|
1160
|
+
} else if (flags.diarize && result.segments) {
|
|
1161
|
+
process.stdout.write(formatDiarizedText(result.segments) + '\n');
|
|
822
1162
|
} else {
|
|
823
|
-
process.stdout.write(text + '\n');
|
|
1163
|
+
process.stdout.write(result.text + '\n');
|
|
824
1164
|
}
|
|
825
1165
|
|
|
826
1166
|
return EXIT_OK;
|
|
@@ -831,8 +1171,123 @@ async function runOnce(flags) {
|
|
|
831
1171
|
process.stderr.write(`Error: ${err.message}\n`);
|
|
832
1172
|
}
|
|
833
1173
|
return EXIT_TRANSCRIPTION;
|
|
834
|
-
}
|
|
835
|
-
|
|
1174
|
+
}
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
// ── Stream mode ──────────────────────────────────────────────────────────────
|
|
1178
|
+
|
|
1179
|
+
async function runStream(flags) {
|
|
1180
|
+
try {
|
|
1181
|
+
const recProc = spawn('rec', [
|
|
1182
|
+
'-q', '-r', '16000', '-c', '1', '-b', '16', '-t', 'raw', '-',
|
|
1183
|
+
], {
|
|
1184
|
+
stdio: ['ignore', 'pipe', 'pipe'],
|
|
1185
|
+
});
|
|
1186
|
+
|
|
1187
|
+
recProc.stderr.on('data', () => {});
|
|
1188
|
+
|
|
1189
|
+
let killed = false;
|
|
1190
|
+
const killRec = () => { if (!killed) { killed = true; recProc.kill('SIGTERM'); process.stderr.write('\n'); } };
|
|
1191
|
+
process.on('SIGINT', killRec);
|
|
1192
|
+
|
|
1193
|
+
let chunks = []; // current chunk buffer (resets per pause)
|
|
1194
|
+
let chunkHasAudio = false; // current chunk has sound (resets per pause)
|
|
1195
|
+
let heardSound = false; // ever heard sound (never resets)
|
|
1196
|
+
let lastSoundTime = Date.now();
|
|
1197
|
+
let chunkStart = Date.now();
|
|
1198
|
+
let chunkIndex = 0;
|
|
1199
|
+
const pending = [];
|
|
1200
|
+
|
|
1201
|
+
recProc.stdout.on('data', (chunk) => {
|
|
1202
|
+
chunks.push(chunk);
|
|
1203
|
+
if (peakAmplitude(chunk) > SILENCE_THRESHOLD) {
|
|
1204
|
+
chunkHasAudio = true;
|
|
1205
|
+
heardSound = true;
|
|
1206
|
+
lastSoundTime = Date.now();
|
|
1207
|
+
}
|
|
1208
|
+
});
|
|
1209
|
+
|
|
1210
|
+
const checkTimer = setInterval(() => {
|
|
1211
|
+
const silenceMs = Date.now() - lastSoundTime;
|
|
1212
|
+
|
|
1213
|
+
// Pause: send current chunk for transcription, keep recording
|
|
1214
|
+
if (chunkHasAudio && silenceMs > flags.pause * 1000 && chunks.length > 0) {
|
|
1215
|
+
const batch = chunks;
|
|
1216
|
+
const duration = (Date.now() - chunkStart) / 1000;
|
|
1217
|
+
const idx = chunkIndex++;
|
|
1218
|
+
chunks = [];
|
|
1219
|
+
chunkHasAudio = false;
|
|
1220
|
+
chunkStart = Date.now();
|
|
1221
|
+
|
|
1222
|
+
const p = transcribeBuffer(batch, { timestamps: flags.timestamps, diarize: flags.diarize })
|
|
1223
|
+
.then((result) => {
|
|
1224
|
+
if (!result.text) return;
|
|
1225
|
+
const wordCount = result.text.split(/\s+/).filter(Boolean).length;
|
|
1226
|
+
if (flags.json) {
|
|
1227
|
+
const out = buildJsonOutput(
|
|
1228
|
+
{ text: result.text, chunk: idx, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
|
|
1229
|
+
{ segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
|
|
1230
|
+
);
|
|
1231
|
+
process.stdout.write(JSON.stringify(out) + '\n');
|
|
1232
|
+
} else if (flags.diarize && result.segments) {
|
|
1233
|
+
const sep = flags.noNewline ? ' ' : '\n';
|
|
1234
|
+
process.stdout.write(formatDiarizedText(result.segments) + sep);
|
|
1235
|
+
} else {
|
|
1236
|
+
process.stdout.write(result.text + (flags.noNewline ? ' ' : '\n'));
|
|
1237
|
+
}
|
|
1238
|
+
})
|
|
1239
|
+
.catch((err) => {
|
|
1240
|
+
process.stderr.write(`Chunk ${idx} error: ${err.message}\n`);
|
|
1241
|
+
});
|
|
1242
|
+
pending.push(p);
|
|
1243
|
+
}
|
|
1244
|
+
|
|
1245
|
+
// Stop: full silence threshold reached
|
|
1246
|
+
if (flags.silence > 0 && heardSound && silenceMs > flags.silence * 1000) {
|
|
1247
|
+
killRec();
|
|
1248
|
+
}
|
|
1249
|
+
}, 100);
|
|
1250
|
+
|
|
1251
|
+
await new Promise((resolve) => recProc.on('close', resolve));
|
|
1252
|
+
clearInterval(checkTimer);
|
|
1253
|
+
process.removeListener('SIGINT', killRec);
|
|
1254
|
+
|
|
1255
|
+
// Send any remaining audio that hasn't been sent yet
|
|
1256
|
+
if (chunks.length > 0 && chunkHasAudio) {
|
|
1257
|
+
const duration = (Date.now() - chunkStart) / 1000;
|
|
1258
|
+
const idx = chunkIndex++;
|
|
1259
|
+
try {
|
|
1260
|
+
const result = await transcribeBuffer(chunks, { timestamps: flags.timestamps, diarize: flags.diarize });
|
|
1261
|
+
if (result.text) {
|
|
1262
|
+
const wordCount = result.text.split(/\s+/).filter(Boolean).length;
|
|
1263
|
+
if (flags.json) {
|
|
1264
|
+
const out = buildJsonOutput(
|
|
1265
|
+
{ text: result.text, chunk: idx, duration: parseFloat(duration.toFixed(1)), latency: result.latency, words: wordCount },
|
|
1266
|
+
{ segments: result.segments, words: result.words, timestamps: flags.timestamps, diarize: flags.diarize },
|
|
1267
|
+
);
|
|
1268
|
+
process.stdout.write(JSON.stringify(out) + '\n');
|
|
1269
|
+
} else if (flags.diarize && result.segments) {
|
|
1270
|
+
const sep = flags.noNewline ? ' ' : '\n';
|
|
1271
|
+
process.stdout.write(formatDiarizedText(result.segments) + sep);
|
|
1272
|
+
} else {
|
|
1273
|
+
process.stdout.write(result.text + (flags.noNewline ? ' ' : '\n'));
|
|
1274
|
+
}
|
|
1275
|
+
}
|
|
1276
|
+
} catch (err) {
|
|
1277
|
+
process.stderr.write(`Chunk ${idx} error: ${err.message}\n`);
|
|
1278
|
+
}
|
|
1279
|
+
}
|
|
1280
|
+
|
|
1281
|
+
// Wait for any in-flight transcriptions to finish
|
|
1282
|
+
await Promise.allSettled(pending);
|
|
1283
|
+
|
|
1284
|
+
// Final newline for --no-newline so shell prompt starts on a new line
|
|
1285
|
+
if (flags.noNewline && !flags.json) process.stdout.write('\n');
|
|
1286
|
+
|
|
1287
|
+
return EXIT_OK;
|
|
1288
|
+
} catch (err) {
|
|
1289
|
+
process.stderr.write(`Error: ${err.message}\n`);
|
|
1290
|
+
return EXIT_TRANSCRIPTION;
|
|
836
1291
|
}
|
|
837
1292
|
}
|
|
838
1293
|
|
|
@@ -863,6 +1318,14 @@ async function main() {
|
|
|
863
1318
|
quiet: args.includes('--quiet') || args.includes('-q'),
|
|
864
1319
|
noInput: args.includes('--no-input'),
|
|
865
1320
|
setup: args.includes('--setup') || args[0] === 'setup',
|
|
1321
|
+
stream: args.includes('--stream'),
|
|
1322
|
+
silence: args.includes('--silence') ? (Number.isFinite(parseFloat(args[args.indexOf('--silence') + 1])) ? parseFloat(args[args.indexOf('--silence') + 1]) : 2.0) : 2.0,
|
|
1323
|
+
pause: args.includes('--pause') ? parseFloat(args[args.indexOf('--pause') + 1]) || 1.0 : 1.0,
|
|
1324
|
+
language: args.includes('--language') ? args[args.indexOf('--language') + 1] || '' : '',
|
|
1325
|
+
file: args.includes('--file') ? args[args.indexOf('--file') + 1] || '' : '',
|
|
1326
|
+
noNewline: args.includes('--no-newline') || args.includes('-n'),
|
|
1327
|
+
timestamps: args.includes('--timestamps') ? args[args.indexOf('--timestamps') + 1] || '' : '',
|
|
1328
|
+
diarize: args.includes('--diarize'),
|
|
866
1329
|
};
|
|
867
1330
|
|
|
868
1331
|
if (args.includes('--version')) {
|
|
@@ -903,6 +1366,14 @@ Options:
|
|
|
903
1366
|
--update Update to latest version
|
|
904
1367
|
--json Record once, output JSON to stdout
|
|
905
1368
|
-q, --quiet Record once, print transcript to stdout
|
|
1369
|
+
--stream Stream transcription chunks on pauses
|
|
1370
|
+
--file <path> Transcribe an audio file (no mic needed)
|
|
1371
|
+
--silence <seconds> Silence duration before auto-stop (default: 2.0)
|
|
1372
|
+
--pause <seconds> Pause duration to split chunks (default: 1.0)
|
|
1373
|
+
--language <code> Language code, e.g. en, de, fr (default: auto)
|
|
1374
|
+
-n, --no-newline Join stream chunks without newlines
|
|
1375
|
+
--timestamps <granularity> Add timestamps: segment, word, or segment,word
|
|
1376
|
+
--diarize Enable speaker identification
|
|
906
1377
|
--no-input Fail if config is missing (no wizard)
|
|
907
1378
|
--no-color Disable colored output
|
|
908
1379
|
--version Show version
|
|
@@ -920,8 +1391,15 @@ Examples:
|
|
|
920
1391
|
dikt setup Reconfigure API key and model
|
|
921
1392
|
dikt -q Record once, print transcript to stdout
|
|
922
1393
|
dikt --json Record once, output JSON to stdout
|
|
1394
|
+
dikt -q --silence 5 Wait longer before auto-stopping
|
|
1395
|
+
dikt --stream Stream chunks as you speak
|
|
1396
|
+
dikt --stream --json Stream chunks as JSON Lines
|
|
923
1397
|
dikt -q | claude Dictate a prompt to Claude Code
|
|
924
1398
|
dikt update Update to the latest version
|
|
1399
|
+
dikt --file meeting.wav Transcribe an existing audio file
|
|
1400
|
+
dikt --stream --silence 0 Stream continuously until Ctrl+C
|
|
1401
|
+
dikt --stream -n Stream as continuous flowing text
|
|
1402
|
+
dikt -q --json --diarize Transcribe with speaker labels
|
|
925
1403
|
|
|
926
1404
|
Environment variables:
|
|
927
1405
|
DIKT_API_KEY Override API key from config
|
|
@@ -942,8 +1420,6 @@ Requires: sox (brew install sox)`);
|
|
|
942
1420
|
process.exit(EXIT_OK);
|
|
943
1421
|
}
|
|
944
1422
|
|
|
945
|
-
checkSox();
|
|
946
|
-
|
|
947
1423
|
// Load or setup config
|
|
948
1424
|
if (flags.setup) {
|
|
949
1425
|
checkTTY();
|
|
@@ -961,6 +1437,9 @@ Requires: sox (brew install sox)`);
|
|
|
961
1437
|
}
|
|
962
1438
|
|
|
963
1439
|
applyEnvOverrides(config);
|
|
1440
|
+
if (flags.language) config.language = flags.language;
|
|
1441
|
+
if (!flags.timestamps && config.timestamps) flags.timestamps = config.timestamps;
|
|
1442
|
+
if (!flags.diarize && config.diarize) flags.diarize = true;
|
|
964
1443
|
|
|
965
1444
|
const validation = validateConfig(config);
|
|
966
1445
|
if (!validation.valid) {
|
|
@@ -970,6 +1449,33 @@ Requires: sox (brew install sox)`);
|
|
|
970
1449
|
process.exit(EXIT_CONFIG);
|
|
971
1450
|
}
|
|
972
1451
|
|
|
1452
|
+
// Validate incompatible flag combinations
|
|
1453
|
+
const lang = config.language;
|
|
1454
|
+
if (lang && flags.timestamps) {
|
|
1455
|
+
process.stderr.write('Error: --timestamps and --language cannot be used together\n');
|
|
1456
|
+
process.exit(EXIT_CONFIG);
|
|
1457
|
+
}
|
|
1458
|
+
if (lang && flags.diarize) {
|
|
1459
|
+
process.stderr.write('Error: --diarize and --language cannot be used together\n');
|
|
1460
|
+
process.exit(EXIT_CONFIG);
|
|
1461
|
+
}
|
|
1462
|
+
if (flags.diarize && flags.stream) {
|
|
1463
|
+
process.stderr.write('Error: --diarize is not compatible with --stream, use -q --diarize instead\n');
|
|
1464
|
+
process.exit(EXIT_CONFIG);
|
|
1465
|
+
}
|
|
1466
|
+
|
|
1467
|
+
// File mode: transcribe an existing audio file (no sox needed)
|
|
1468
|
+
if (flags.file) {
|
|
1469
|
+
process.exit(await runFile(flags));
|
|
1470
|
+
}
|
|
1471
|
+
|
|
1472
|
+
checkSox();
|
|
1473
|
+
|
|
1474
|
+
// Stream mode: chunked transcription on pauses
|
|
1475
|
+
if (flags.stream) {
|
|
1476
|
+
process.exit(await runStream(flags));
|
|
1477
|
+
}
|
|
1478
|
+
|
|
973
1479
|
// Single-shot mode: record once, output, exit
|
|
974
1480
|
if (flags.json || flags.quiet) {
|
|
975
1481
|
process.exit(await runOnce(flags));
|
|
@@ -978,6 +1484,10 @@ Requires: sox (brew install sox)`);
|
|
|
978
1484
|
// Interactive TUI mode
|
|
979
1485
|
checkTTY();
|
|
980
1486
|
|
|
1487
|
+
// Clear any setup wizard output before entering alt screen, so it doesn't
|
|
1488
|
+
// leak back when the alt screen exits.
|
|
1489
|
+
process.stdout.write(CLEAR_SCREEN);
|
|
1490
|
+
|
|
981
1491
|
// Enter raw TUI mode (alternate screen buffer prevents scrollback corruption)
|
|
982
1492
|
process.stdout.write(ALT_SCREEN_ON + HIDE_CURSOR + CLEAR_SCREEN);
|
|
983
1493
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "dikt",
|
|
3
|
-
"version": "1.0.2",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Voice dictation for the terminal.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -17,7 +17,9 @@
|
|
|
17
17
|
"cli",
|
|
18
18
|
"terminal",
|
|
19
19
|
"whisper",
|
|
20
|
-
"mistral"
|
|
20
|
+
"mistral",
|
|
21
|
+
"diarization",
|
|
22
|
+
"voxtral"
|
|
21
23
|
],
|
|
22
24
|
"author": "johxyz",
|
|
23
25
|
"repository": {
|