listener-ai 2.7.0 → 2.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.DEFAULT_CODEX_TRANSCRIPTION_MODEL = exports.DEFAULT_CODEX_MODEL = exports.DEFAULT_GEMINI_FLASH_MODEL = exports.DEFAULT_GEMINI_MODEL = exports.AI_PROVIDERS = void 0;
3
+ exports.CODEX_TRANSCRIPTION_NON_DIARIZE_MODEL = exports.DEFAULT_CODEX_TRANSCRIPTION_MODEL = exports.DEFAULT_CODEX_MODEL = exports.DEFAULT_GEMINI_FLASH_MODEL = exports.DEFAULT_GEMINI_MODEL = exports.AI_PROVIDERS = void 0;
4
4
  exports.isAiProvider = isAiProvider;
5
5
  exports.normalizeAiProvider = normalizeAiProvider;
6
6
  exports.toPiAiProvider = toPiAiProvider;
@@ -8,7 +8,19 @@ exports.AI_PROVIDERS = ['gemini', 'codex'];
8
8
  exports.DEFAULT_GEMINI_MODEL = 'gemini-2.5-pro';
9
9
  exports.DEFAULT_GEMINI_FLASH_MODEL = 'gemini-2.5-flash';
10
10
  exports.DEFAULT_CODEX_MODEL = 'gpt-5.5';
11
- exports.DEFAULT_CODEX_TRANSCRIPTION_MODEL = 'gpt-4o-transcribe';
11
+ // gpt-4o-transcribe-diarize ships native speaker diarization at the same
12
+ // per-minute price ($0.006/min) as the non-diarize model. Trade-offs vs
13
+ // gpt-4o-transcribe (see docs/model-pricing.md):
14
+ // - doesn't accept the `prompt` parameter, so user glossaries
15
+ // (`knownWords`) are silently dropped on this path
16
+ // - we still segment audio into 5-min chunks for parallel-upload speed,
17
+ // so "Speaker 0" in chunk 1 is not guaranteed to be the same physical
18
+ // person as "Speaker 0" in chunk 2
19
+ exports.DEFAULT_CODEX_TRANSCRIPTION_MODEL = 'gpt-4o-transcribe-diarize';
20
+ // Pre-diarize model id. Useful for users who want the older prompt-driven
21
+ // behavior (vocabulary hints via `knownWords`) at the cost of speaker
22
+ // labels. Switch via `listener config set codexTranscriptionModel gpt-4o-transcribe`.
23
+ exports.CODEX_TRANSCRIPTION_NON_DIARIZE_MODEL = 'gpt-4o-transcribe';
12
24
  function isAiProvider(value) {
13
25
  return exports.AI_PROVIDERS.includes(value);
14
26
  }
@@ -6,6 +6,16 @@
6
6
  // Codex transcription flow needs only a multipart POST, so a thin direct
7
7
  // fetch is simpler than wedging audio into pi-ai's chat model.
8
8
  //
9
+ // Two output shapes, branched on model id:
10
+ // - `gpt-4o-transcribe-diarize` (default) returns `diarized_json` with
11
+ // speaker-labeled segments. We re-label "Speaker 0/1/..." onto the
12
+ // same `참가자N` convention the Gemini path uses so downstream code
13
+ // (summarization, transcript.md, Notion) doesn't have to care which
14
+ // transcription engine produced the text. This model rejects `prompt`,
15
+ // so user-supplied glossaries (`knownWords`) are dropped on this path.
16
+ // - `gpt-4o-transcribe` (and `whisper-1`) return `{text}` and accept
17
+ // `prompt` for vocabulary biasing, but produce no speaker labels.
18
+ //
9
19
  // Format support: OpenAI accepts mp3, mp4, mpeg, mpga, m4a, wav, webm. Inputs
10
20
  // outside that set are remuxed upstream in geminiService.ts via ffmpeg before
11
21
  // reaching this helper.
@@ -44,11 +54,14 @@ var __importStar = (this && this.__importStar) || (function () {
44
54
  })();
45
55
  Object.defineProperty(exports, "__esModule", { value: true });
46
56
  exports.OPENAI_TRANSCRIPTION_EXTENSIONS = void 0;
57
+ exports.isDiarizeModel = isDiarizeModel;
47
58
  exports.transcribeCodexAudio = transcribeCodexAudio;
59
+ exports.formatDiarizedSegments = formatDiarizedSegments;
48
60
  const fs = __importStar(require("fs"));
49
61
  const path = __importStar(require("path"));
50
62
  const audioFormats_1 = require("./audioFormats");
51
63
  const OPENAI_API_BASE_URL = 'https://api.openai.com/v1';
64
+ const DIARIZE_MODEL_ID = 'gpt-4o-transcribe-diarize';
52
65
  exports.OPENAI_TRANSCRIPTION_EXTENSIONS = new Set([
53
66
  '.mp3',
54
67
  '.mp4',
@@ -58,20 +71,41 @@ exports.OPENAI_TRANSCRIPTION_EXTENSIONS = new Set([
58
71
  '.wav',
59
72
  '.webm',
60
73
  ]);
74
+ function isDiarizeModel(model) {
75
+ return model.trim() === DIARIZE_MODEL_ID;
76
+ }
61
77
  async function transcribeCodexAudio(params) {
62
78
  const audioData = fs.readFileSync(params.audioFilePath);
63
79
  const ext = path.extname(params.audioFilePath);
80
+ const model = params.model.trim();
81
+ const diarize = isDiarizeModel(model);
64
82
  const form = new FormData();
65
- form.append('model', params.model.trim());
66
- if (params.prompt?.trim()) {
83
+ form.append('model', model);
84
+ if (params.language) {
85
+ form.append('language', params.language);
86
+ }
87
+ if (diarize) {
88
+ // Required for the diarize model. `chunking_strategy=auto` lets OpenAI
89
+ // split long audio internally while keeping speaker identity coherent
90
+ // across chunks -- so one request could take a whole 50-minute meeting
91
+ // (subject to the 25MB file-size limit upstream). Callers still pre-segment long audio, so speaker ids are only coherent within each uploaded segment.
92
+ form.append('response_format', 'diarized_json');
93
+ form.append('chunking_strategy', 'auto');
94
+ }
95
+ else if (params.prompt?.trim()) {
67
96
  form.append('prompt', params.prompt.trim());
68
97
  }
69
98
  form.append('file', new Blob([audioData], { type: (0, audioFormats_1.mimeTypeForExtension)(ext) }), path.basename(params.audioFilePath));
99
+ const sizeMB = (audioData.byteLength / (1024 * 1024)).toFixed(2);
100
+ const startedAt = Date.now();
101
+ console.log(`[codex-transcribe] -> ${path.basename(params.audioFilePath)} ${sizeMB}MB model=${model}${diarize ? ' diarize=true' : params.prompt ? ` prompt=${params.prompt.length}chars` : ''}${params.language ? ` lang=${params.language}` : ''}`);
70
102
  const response = await fetch(`${OPENAI_API_BASE_URL}/audio/transcriptions`, {
71
103
  method: 'POST',
72
104
  headers: { Authorization: `Bearer ${await params.getToken()}` },
73
105
  body: form,
74
106
  });
107
+ const elapsed = Date.now() - startedAt;
108
+ console.log(`[codex-transcribe] <- ${elapsed}ms status=${response.status} ${response.statusText}`);
75
109
  if (!response.ok) {
76
110
  // Truncate the error body so a verbose upstream response doesn't leak
77
111
  // headers/debug payload into logs and IPC error strings.
@@ -79,9 +113,56 @@ async function transcribeCodexAudio(params) {
79
113
  const trimmed = body.length > 500 ? `${body.slice(0, 500)}...` : body;
80
114
  throw new Error(`OpenAI transcription failed (${response.status} ${response.statusText})${trimmed ? `: ${trimmed}` : ''}`);
81
115
  }
116
+ if (diarize) {
117
+ const payload = (await response.json());
118
+ return formatDiarizedSegments(payload.segments);
119
+ }
82
120
  const payload = (await response.json());
83
121
  if (typeof payload.text !== 'string' || payload.text.trim().length === 0) {
84
122
  throw new Error('OpenAI transcription response missing text');
85
123
  }
86
124
  return payload.text;
87
125
  }
126
+ // Re-label OpenAI's raw speaker ids ("Speaker 0", "Speaker 1", or the names
127
+ // supplied via `known_speaker_names[]` if used) onto our `참가자N` convention,
128
+ // matching the format Gemini emits when prompted for speaker labels. Empty
129
+ // segments are dropped; consecutive segments from the same speaker are merged
130
+ // onto a single line so downstream consumers don't see one speaker split into
131
+ // 30+ "참가자1: ..." stubs.
132
+ function formatDiarizedSegments(segments) {
133
+ if (!segments || segments.length === 0) {
134
+ throw new Error('OpenAI diarized transcription returned no segments');
135
+ }
136
+ const speakerIdx = new Map();
137
+ let nextIdx = 1;
138
+ const lines = [];
139
+ let activeLabel;
140
+ let activeBuffer = '';
141
+ for (const seg of segments) {
142
+ const text = (seg.text ?? '').trim();
143
+ if (!text)
144
+ continue;
145
+ const rawSpeaker = seg.speaker ?? 'unknown';
146
+ let idx = speakerIdx.get(rawSpeaker);
147
+ if (idx === undefined) {
148
+ idx = nextIdx++;
149
+ speakerIdx.set(rawSpeaker, idx);
150
+ }
151
+ const label = `참가자${idx}`;
152
+ if (label === activeLabel) {
153
+ activeBuffer += ` ${text}`;
154
+ }
155
+ else {
156
+ if (activeLabel !== undefined)
157
+ lines.push(`${activeLabel}: ${activeBuffer}`);
158
+ activeLabel = label;
159
+ activeBuffer = text;
160
+ }
161
+ }
162
+ if (activeLabel !== undefined)
163
+ lines.push(`${activeLabel}: ${activeBuffer}`);
164
+ if (lines.length === 0) {
165
+ throw new Error('OpenAI diarized transcription had segments but no usable text');
166
+ }
167
+ return lines.join('\n\n');
168
+ }
@@ -80,6 +80,31 @@ class ConfigService {
80
80
  }
81
81
  this.configPath = path.join(userDataPath, 'config.json');
82
82
  this.loadConfig();
83
+ this.migrateLegacyDefaults();
84
+ }
85
+ // One-shot upgrade hook for keys that older versions auto-persisted from
86
+ // their then-current default. The settings modal in those versions wrote
87
+ // back the full payload on save -- including fields the user never
88
+ // touched -- so the next default change can't reach existing installs.
89
+ // Today's case: `codexTranscriptionModel: 'gpt-4o-transcribe'` was the
90
+ // legacy default before gpt-4o-transcribe-diarize shipped; clearing it
91
+ // here lets `getCodexTranscriptionModel()` return the current default
92
+ // (diarize) without forcing every user to manually unset it.
93
+ //
94
+ // The marker semantics are "we've considered migrating this user" --
95
+ // it lands on EVERY install on first launch, not just the ones we
96
+ // actually had to migrate. That way if a user later opts back into
97
+ // `gpt-4o-transcribe` deliberately (e.g. for glossary support), the
98
+ // next ConfigService construction sees the marker and skips the
99
+ // migration entirely instead of clobbering their explicit choice.
100
+ migrateLegacyDefaults() {
101
+ if (this.config.codexTranscriptionMigratedToDiarize)
102
+ return;
103
+ if (this.config.codexTranscriptionModel === 'gpt-4o-transcribe') {
104
+ this.setKey('codexTranscriptionModel', undefined);
105
+ }
106
+ this.setKey('codexTranscriptionMigratedToDiarize', true);
107
+ this.saveConfig();
83
108
  }
84
109
  loadConfig() {
85
110
  try {
@@ -197,7 +197,14 @@ class GeminiService {
197
197
  const modelId = this.provider === 'codex' ? this.codexModel : this.proModel;
198
198
  const apiKey = this.provider === 'codex' ? await this.getCodexToken() : this.requireGeminiApiKey();
199
199
  const model = await (0, piAiClient_1.getModel)(this.provider, modelId);
200
+ // Force formal Korean register (합니다체). Codex (GPT-5.x) defaults to
201
+ // mixed/해요체 in Korean output; Gemini tends to 합니다체 already but the
202
+ // explicit constraint keeps both providers consistent. Applied as a system
203
+ // prompt so it overrides whatever tone the user's customSummaryPrompt
204
+ // implies for summary/keyPoints/actionItems bodies.
205
+ const koreanToneSystem = '모든 한국어 출력은 격식체(합니다/입니다 어미)로 작성하세요. 반말이나 해요체를 쓰지 마세요. summary, keyPoints, actionItems 본문 모두 동일하게 적용합니다.';
200
206
  const context = {
207
+ systemPrompt: koreanToneSystem,
201
208
  messages: [
202
209
  {
203
210
  role: 'user',
@@ -210,6 +217,12 @@ class GeminiService {
210
217
  apiKey,
211
218
  temperature: 0.2,
212
219
  maxTokens: 32768,
220
+ // Codex-only knobs; pi-ai's google provider ignores unknown keys.
221
+ // pi-ai omits `reasoning.effort` by default (server default ~medium); we
222
+ // force xhigh for deepest analysis -- gpt-5.5's thinkingLevelMap maps
223
+ // xhigh -> "max". Verbosity stays at pi-ai's "low" default (terse output
224
+ // is fine; reasoning depth is what was missing).
225
+ reasoningEffort: 'xhigh',
213
226
  });
214
227
  return (0, piAiClient_1.extractFinalText)(response);
215
228
  }
@@ -361,15 +374,35 @@ class GeminiService {
361
374
  }
362
375
  }
363
376
  // Split audio file into segments
364
- async splitAudioIntoSegments(audioFilePath, segmentDuration = 300) {
377
+ async splitAudioIntoSegments(audioFilePath, segmentDuration = 300,
378
+ // re-encode segments instead of `-c copy`. ffmpeg's segment muxer can
379
+ // only cut at keyframes when copying, and webm-opus has near-zero
380
+ // keyframes by default -- so `-c copy -segment_time 300` silently
381
+ // produces 30+ minute segments that blow past gpt-4o-transcribe's
382
+ // 1400-second per-request limit. Caller passes `reencode: true` for
383
+ // the Codex transcription path; Gemini's API is tolerant of long
384
+ // inputs and stays on the faster `-c copy` path.
385
+ reencode = false) {
365
386
  const outputDir = path.dirname(audioFilePath);
366
387
  const baseName = path.basename(audioFilePath, path.extname(audioFilePath));
367
388
  const ext = path.extname(audioFilePath);
368
- const segmentPath = path.join(outputDir, `${baseName}_segment_%03d${ext}`);
389
+ // When re-encoding to opus we MUST force a container that supports
390
+ // opus -- ffmpeg picks the muxer from the output extension, so leaving
391
+ // an imported `.mp3`/`.m4a`/`.wav` source as `.mp3` makes ffmpeg pick
392
+ // the MP3 muxer and reject the opus stream. `.webm` is in OpenAI's
393
+ // supported transcription extensions, so the segments still upload.
394
+ const segmentExt = reencode ? '.webm' : ext;
395
+ const segmentPath = path.join(outputDir, `${baseName}_segment_%03d${segmentExt}`);
369
396
  // Get the bundled FFmpeg path
370
397
  const ffmpegPath = await this.getFFmpegPath();
371
398
  try {
372
- // Split audio into segments
399
+ const codecArgs = reencode ? ['-c:a', 'libopus', '-b:a', '48k'] : ['-c', 'copy'];
400
+ // Split audio into segments. `-reset_timestamps 1` makes each segment
401
+ // start at PTS 0 and gives it its own container duration. Without it,
402
+ // webm output keeps the source file's total duration in the header --
403
+ // and OpenAI rejects the request based on the header value even when
404
+ // the actual encoded audio is short (`audio duration N seconds is
405
+ // longer than 1400` errors on small last-segment files).
373
406
  await execFileAsync(ffmpegPath, [
374
407
  '-i',
375
408
  audioFilePath,
@@ -377,14 +410,17 @@ class GeminiService {
377
410
  'segment',
378
411
  '-segment_time',
379
412
  String(segmentDuration),
380
- '-c',
381
- 'copy',
413
+ '-reset_timestamps',
414
+ '1',
415
+ ...codecArgs,
382
416
  segmentPath,
383
417
  ]);
384
- // Find all created segment files
418
+ // Find all created segment files. Match on the EXTENSION WE TOLD
419
+ // FFMPEG TO WRITE -- when re-encoding, that's `.webm` regardless of
420
+ // the source's original extension.
385
421
  const segmentFiles = fs
386
422
  .readdirSync(outputDir)
387
- .filter((file) => file.startsWith(`${baseName}_segment_`) && file.endsWith(ext))
423
+ .filter((file) => file.startsWith(`${baseName}_segment_`) && file.endsWith(segmentExt))
388
424
  .map((file) => path.join(outputDir, file))
389
425
  .sort();
390
426
  console.error(`Split audio into ${segmentFiles.length} segments`);
@@ -438,6 +474,13 @@ class GeminiService {
438
474
  let fullTranscript = '';
439
475
  const stats = fs.statSync(audioFilePath);
440
476
  const fileSizeInMB = stats.size / (1024 * 1024);
477
+ // Segment intentionally for parallelism: even when the API would
478
+ // accept the whole file (Gemini long-context, gpt-4o-transcribe-diarize
479
+ // via chunking_strategy=auto), N parallel 5-min requests finish much
480
+ // faster than one big sequential pass. Trade-off for the diarize
481
+ // model: speaker IDs are mapped fresh per segment ("Speaker 0" in
482
+ // segment 1 may not be the same physical person as "Speaker 0" in
483
+ // segment 2). See docs/model-pricing.md.
441
484
  const shouldSegment = duration > 300 || (this.provider === 'codex' && fileSizeInMB > 24);
442
485
  const segmentDuration = this.provider === 'codex' && duration > 0 && fileSizeInMB > 20
443
486
  ? Math.max(30, Math.min(300, Math.floor((20 / fileSizeInMB) * duration)))
@@ -562,7 +605,14 @@ Return as JSON:
562
605
  getToken: () => this.getCodexToken(),
563
606
  audioFilePath,
564
607
  model: this.codexTranscriptionModel,
608
+ // `prompt` is dropped inside transcribeCodexAudio when the
609
+ // diarize model is active. Keep passing it -- the helper picks
610
+ // the right shape per model.
565
611
  prompt: transcriptPrompt,
612
+ // Intentionally NOT passing `language: 'ko'`. Whisper-derived
613
+ // transcription auto-detects from the first ~30s, which handles
614
+ // bilingual/code-switched meetings (Korean primary, English
615
+ // acronyms/quotes) better than forcing a single language.
566
616
  });
567
617
  }
568
618
  const ai = this.gemini();
@@ -745,8 +795,11 @@ Return as JSON:
745
795
  // Get segmented transcript (renamed from transcribeAudioSegmented)
746
796
  async getSegmentedTranscript(audioFilePath, duration, progressCallback, customPrompt, segmentDuration = 300) {
747
797
  try {
748
- // Split audio into 5-minute segments
749
- const segmentFiles = await this.splitAudioIntoSegments(audioFilePath, segmentDuration);
798
+ // Split audio into 5-minute segments. Codex transcription requires
799
+ // accurate cut times (gpt-4o-transcribe rejects >1400s/segment), so
800
+ // force re-encode there; Gemini's API tolerates long inputs and we
801
+ // keep the cheaper `-c copy` path for it.
802
+ const segmentFiles = await this.splitAudioIntoSegments(audioFilePath, segmentDuration, this.provider === 'codex');
750
803
  if (progressCallback) {
751
804
  progressCallback(20, `Processing ${segmentFiles.length} segments...`);
752
805
  }
@@ -23,11 +23,69 @@ async function getModel(provider, modelId) {
23
23
  // path for non-literal ids ("Custom Models" in pi-ai's README).
24
24
  return m.getModel(piId, modelId);
25
25
  }
26
+ function summarizeContextSize(context) {
27
+ let chars = 0;
28
+ let toolCalls = 0;
29
+ let toolResults = 0;
30
+ for (const msg of context.messages) {
31
+ if (msg.role === 'user') {
32
+ chars +=
33
+ typeof msg.content === 'string'
34
+ ? msg.content.length
35
+ : msg.content.reduce((n, b) => n + (b.type === 'text' ? b.text.length : 0), 0);
36
+ }
37
+ else if (msg.role === 'assistant') {
38
+ for (const b of msg.content) {
39
+ if (b.type === 'text')
40
+ chars += b.text.length;
41
+ else if (b.type === 'toolCall')
42
+ toolCalls++;
43
+ }
44
+ }
45
+ else if (msg.role === 'toolResult') {
46
+ toolResults++;
47
+ for (const b of msg.content)
48
+ if (b.type === 'text')
49
+ chars += b.text.length;
50
+ }
51
+ }
52
+ const systemChars = context.systemPrompt?.length ?? 0;
53
+ return `messages=${context.messages.length} chars=${chars + systemChars} (system=${systemChars}) toolCalls=${toolCalls} toolResults=${toolResults} tools=${context.tools?.length ?? 0}`;
54
+ }
55
+ // Strip options the target provider doesn't accept. OpenAI Codex routes
56
+ // through GPT-5.x reasoning models which reject sampling parameters
57
+ // (`Unsupported parameter: temperature`). pi-ai forwards options verbatim,
58
+ // so the adjustment has to happen at our boundary -- doing it here keeps
59
+ // callsites free of provider conditionals.
60
+ function adjustOptionsForModel(model, options) {
61
+ if (!options)
62
+ return undefined;
63
+ const isCodex = model.api === 'openai-codex-responses' || model.provider === 'openai-codex';
64
+ if (isCodex) {
65
+ const { temperature: _t, ...rest } = options;
66
+ return { ...rest };
67
+ }
68
+ return { ...options };
69
+ }
26
70
  async function complete(model, context, options) {
27
71
  const m = await loadPiAi();
28
- // pi-ai's ProviderStreamOptions is `StreamOptions & Record<string, unknown>`;
29
- // spread to satisfy the index-signature constraint.
30
- return await m.complete(model, context, options ? { ...options } : undefined);
72
+ const tag = `[pi-ai ${model.provider}/${model.id}]`;
73
+ const startedAt = Date.now();
74
+ console.log(`${tag} -> ${summarizeContextSize(context)}`);
75
+ const adjustedOptions = adjustOptionsForModel(model, options);
76
+ const response = await m.complete(model, context, adjustedOptions);
77
+ const elapsed = Date.now() - startedAt;
78
+ const stop = response.stopReason ?? 'unknown';
79
+ const textChars = extractFinalText(response).length;
80
+ console.log(`${tag} <- ${elapsed}ms stop=${stop} textChars=${textChars} usage=in:${response.usage?.input ?? '?'}/out:${response.usage?.output ?? '?'}${response.errorMessage ? ` errorMessage=${response.errorMessage.slice(0, 300)}` : ''}`);
81
+ // pi-ai surfaces upstream failures via stopReason='error' rather than
82
+ // throwing. Without this, geminiService.generateSummary returns "" and
83
+ // agentService.run returns "(no answer)" with no breadcrumb. Promote the
84
+ // diagnostic into a thrown error so it reaches the renderer / CLI surface.
85
+ if (response.stopReason === 'error') {
86
+ throw new Error(`Pi-ai ${model.provider}/${model.id} failed: ${response.errorMessage ?? 'no errorMessage'}`);
87
+ }
88
+ return response;
31
89
  }
32
90
  async function getTypeBox() {
33
91
  const m = await loadPiAi();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "listener-ai",
3
- "version": "2.7.0",
3
+ "version": "2.7.2",
4
4
  "description": "A lightweight desktop application for recording and transcribing meetings with AI-powered notes.",
5
5
  "main": "dist/main.js",
6
6
  "bin": {