npm - listener-ai - Versions diffs - 2.7.0 → 2.7.2 - Mend

listener-ai 2.7.0 → 2.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/dist/aiProvider.js +14 -2
package/dist/codexTranscription.js +83 -2
package/dist/configService.js +25 -0
package/dist/geminiService.js +62 -9
package/dist/piAiClient.js +61 -3
package/package.json +1 -1

package/dist/aiProvider.js CHANGED Viewed

@@ -1,6 +1,6 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.DEFAULT_CODEX_TRANSCRIPTION_MODEL = exports.DEFAULT_CODEX_MODEL = exports.DEFAULT_GEMINI_FLASH_MODEL = exports.DEFAULT_GEMINI_MODEL = exports.AI_PROVIDERS = void 0;
+exports.CODEX_TRANSCRIPTION_NON_DIARIZE_MODEL = exports.DEFAULT_CODEX_TRANSCRIPTION_MODEL = exports.DEFAULT_CODEX_MODEL = exports.DEFAULT_GEMINI_FLASH_MODEL = exports.DEFAULT_GEMINI_MODEL = exports.AI_PROVIDERS = void 0;
 exports.isAiProvider = isAiProvider;
 exports.normalizeAiProvider = normalizeAiProvider;
 exports.toPiAiProvider = toPiAiProvider;
@@ -8,7 +8,19 @@ exports.AI_PROVIDERS = ['gemini', 'codex'];
 exports.DEFAULT_GEMINI_MODEL = 'gemini-2.5-pro';
 exports.DEFAULT_GEMINI_FLASH_MODEL = 'gemini-2.5-flash';
 exports.DEFAULT_CODEX_MODEL = 'gpt-5.5';
-exports.DEFAULT_CODEX_TRANSCRIPTION_MODEL = 'gpt-4o-transcribe';
+// gpt-4o-transcribe-diarize ships native speaker diarization at the same
+// per-minute price ($0.006/min) as the non-diarize model. Trade-offs vs
+// gpt-4o-transcribe (see docs/model-pricing.md):
+//   - doesn't accept the `prompt` parameter, so user glossaries
+//     (`knownWords`) are silently dropped on this path
+//   - we still segment audio into 5-min chunks for parallel-upload speed,
+//     so "Speaker 0" in chunk 1 is not guaranteed to be the same physical
+//     person as "Speaker 0" in chunk 2
+exports.DEFAULT_CODEX_TRANSCRIPTION_MODEL = 'gpt-4o-transcribe-diarize';
+// Pre-diarize model id. Useful for users who want the older prompt-driven
+// behavior (vocabulary hints via `knownWords`) at the cost of speaker
+// labels. Switch via `listener config set codexTranscriptionModel gpt-4o-transcribe`.
+exports.CODEX_TRANSCRIPTION_NON_DIARIZE_MODEL = 'gpt-4o-transcribe';
 function isAiProvider(value) {
     return exports.AI_PROVIDERS.includes(value);
 }

package/dist/codexTranscription.js CHANGED Viewed

@@ -6,6 +6,16 @@
 // Codex transcription flow needs only a multipart POST, so a thin direct
 // fetch is simpler than wedging audio into pi-ai's chat model.
 //
+// Two output shapes, branched on model id:
+//   - `gpt-4o-transcribe-diarize` (default) returns `diarized_json` with
+//     speaker-labeled segments. We re-label "Speaker 0/1/..." onto the
+//     same `참가자N` convention the Gemini path uses so downstream code
+//     (summarization, transcript.md, Notion) doesn't have to care which
+//     transcription engine produced the text. This model rejects `prompt`,
+//     so user-supplied glossaries (`knownWords`) are dropped on this path.
+//   - `gpt-4o-transcribe` (and `whisper-1`) return `{text}` and accept
+//     `prompt` for vocabulary biasing, but produce no speaker labels.
+//
 // Format support: OpenAI accepts mp3, mp4, mpeg, mpga, m4a, wav, webm. Inputs
 // outside that set are remuxed upstream in geminiService.ts via ffmpeg before
 // reaching this helper.
@@ -44,11 +54,14 @@ var __importStar = (this && this.__importStar) || (function () {
 })();
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.OPENAI_TRANSCRIPTION_EXTENSIONS = void 0;
+exports.isDiarizeModel = isDiarizeModel;
 exports.transcribeCodexAudio = transcribeCodexAudio;
+exports.formatDiarizedSegments = formatDiarizedSegments;
 const fs = __importStar(require("fs"));
 const path = __importStar(require("path"));
 const audioFormats_1 = require("./audioFormats");
 const OPENAI_API_BASE_URL = 'https://api.openai.com/v1';
+const DIARIZE_MODEL_ID = 'gpt-4o-transcribe-diarize';
 exports.OPENAI_TRANSCRIPTION_EXTENSIONS = new Set([
     '.mp3',
     '.mp4',
@@ -58,20 +71,41 @@ exports.OPENAI_TRANSCRIPTION_EXTENSIONS = new Set([
     '.wav',
     '.webm',
 ]);
+function isDiarizeModel(model) {
+    return model.trim() === DIARIZE_MODEL_ID;
+}
 async function transcribeCodexAudio(params) {
     const audioData = fs.readFileSync(params.audioFilePath);
     const ext = path.extname(params.audioFilePath);
+    const model = params.model.trim();
+    const diarize = isDiarizeModel(model);
     const form = new FormData();
-    form.append('model', params.model.trim());
-    if (params.prompt?.trim()) {
+    form.append('model', model);
+    if (params.language) {
+        form.append('language', params.language);
+    }
+    if (diarize) {
+        // Required for the diarize model. `chunking_strategy=auto` lets OpenAI
+        // split long audio internally while keeping speaker identity coherent
+        // within that request -- so it can take a whole 50-minute meeting (subject
+        // to the 25MB file-size limit). Coherence does not carry across requests.
+        form.append('response_format', 'diarized_json');
+        form.append('chunking_strategy', 'auto');
+    }
+    else if (params.prompt?.trim()) {
         form.append('prompt', params.prompt.trim());
     }
     form.append('file', new Blob([audioData], { type: (0, audioFormats_1.mimeTypeForExtension)(ext) }), path.basename(params.audioFilePath));
+    const sizeMB = (audioData.byteLength / (1024 * 1024)).toFixed(2);
+    const startedAt = Date.now();
+    console.log(`[codex-transcribe] -> ${path.basename(params.audioFilePath)} ${sizeMB}MB model=${model}${diarize ? ' diarize=true' : params.prompt ? ` prompt=${params.prompt.length}chars` : ''}${params.language ? ` lang=${params.language}` : ''}`);
     const response = await fetch(`${OPENAI_API_BASE_URL}/audio/transcriptions`, {
         method: 'POST',
         headers: { Authorization: `Bearer ${await params.getToken()}` },
         body: form,
     });
+    const elapsed = Date.now() - startedAt;
+    console.log(`[codex-transcribe] <- ${elapsed}ms status=${response.status} ${response.statusText}`);
     if (!response.ok) {
         // Truncate the error body so a verbose upstream response doesn't leak
         // headers/debug payload into logs and IPC error strings.
@@ -79,9 +113,56 @@ async function transcribeCodexAudio(params) {
         const trimmed = body.length > 500 ? `${body.slice(0, 500)}...` : body;
         throw new Error(`OpenAI transcription failed (${response.status} ${response.statusText})${trimmed ? `: ${trimmed}` : ''}`);
     }
+    if (diarize) {
+        const payload = (await response.json());
+        return formatDiarizedSegments(payload.segments);
+    }
     const payload = (await response.json());
     if (typeof payload.text !== 'string' || payload.text.trim().length === 0) {
         throw new Error('OpenAI transcription response missing text');
     }
     return payload.text;
 }
+// Re-label OpenAI's raw speaker ids ("Speaker 0", "Speaker 1", or the names
+// supplied via `known_speaker_names[]` if used) onto our `참가자N` convention,
+// matching the format Gemini emits when prompted for speaker labels. Empty
+// segments are dropped; consecutive segments from the same speaker are merged
+// onto a single line so downstream consumers don't see one speaker split into
+// 30+ "참가자1: ..." stubs.
+function formatDiarizedSegments(segments) {
+    if (!segments || segments.length === 0) {
+        throw new Error('OpenAI diarized transcription returned no segments');
+    }
+    const speakerIdx = new Map();
+    let nextIdx = 1;
+    const lines = [];
+    let activeLabel;
+    let activeBuffer = '';
+    for (const seg of segments) {
+        const text = (seg.text ?? '').trim();
+        if (!text)
+            continue;
+        const rawSpeaker = seg.speaker ?? 'unknown';
+        let idx = speakerIdx.get(rawSpeaker);
+        if (idx === undefined) {
+            idx = nextIdx++;
+            speakerIdx.set(rawSpeaker, idx);
+        }
+        const label = `참가자${idx}`;
+        if (label === activeLabel) {
+            activeBuffer += ` ${text}`;
+        }
+        else {
+            if (activeLabel !== undefined)
+                lines.push(`${activeLabel}: ${activeBuffer}`);
+            activeLabel = label;
+            activeBuffer = text;
+        }
+    }
+    if (activeLabel !== undefined)
+        lines.push(`${activeLabel}: ${activeBuffer}`);
+    if (lines.length === 0) {
+        throw new Error('OpenAI diarized transcription had segments but no usable text');
+    }
+    return lines.join('\n\n');
+}

package/dist/configService.js CHANGED Viewed

@@ -80,6 +80,31 @@ class ConfigService {
         }
         this.configPath = path.join(userDataPath, 'config.json');
         this.loadConfig();
+        this.migrateLegacyDefaults();
+    }
+    // One-shot upgrade hook for keys that older versions auto-persisted from
+    // their then-current default. The settings modal in those versions wrote
+    // back the full payload on save -- including fields the user never
+    // touched -- so the next default change can't reach existing installs.
+    // Today's case: `codexTranscriptionModel: 'gpt-4o-transcribe'` was the
+    // legacy default before gpt-4o-transcribe-diarize shipped; clearing it
+    // here lets `getCodexTranscriptionModel()` return the current default
+    // (diarize) without forcing every user to manually unset it.
+    //
+    // The marker semantics are "we've considered migrating this user" --
+    // it lands on EVERY install on first launch, not just the ones we
+    // actually had to migrate. That way if a user later opts back into
+    // `gpt-4o-transcribe` deliberately (e.g. for glossary support), the
+    // next ConfigService construction sees the marker and skips the
+    // migration entirely instead of clobbering their explicit choice.
+    migrateLegacyDefaults() {
+        if (this.config.codexTranscriptionMigratedToDiarize)
+            return;
+        if (this.config.codexTranscriptionModel === 'gpt-4o-transcribe') {
+            this.setKey('codexTranscriptionModel', undefined);
+        }
+        this.setKey('codexTranscriptionMigratedToDiarize', true);
+        this.saveConfig();
     }
     loadConfig() {
         try {

package/dist/geminiService.js CHANGED Viewed

@@ -197,7 +197,14 @@ class GeminiService {
         const modelId = this.provider === 'codex' ? this.codexModel : this.proModel;
         const apiKey = this.provider === 'codex' ? await this.getCodexToken() : this.requireGeminiApiKey();
         const model = await (0, piAiClient_1.getModel)(this.provider, modelId);
+        // Force formal Korean register (합니다체). Codex (GPT-5.x) defaults to
+        // mixed/해요체 in Korean output; Gemini tends to 합니다체 already but the
+        // explicit constraint keeps both providers consistent. Applied as a system
+        // prompt so it overrides whatever tone the user's customSummaryPrompt
+        // implies for summary/keyPoints/actionItems bodies.
+        const koreanToneSystem = '모든 한국어 출력은 격식체(합니다/입니다 어미)로 작성하세요. 반말이나 해요체를 쓰지 마세요. summary, keyPoints, actionItems 본문 모두 동일하게 적용합니다.';
         const context = {
+            systemPrompt: koreanToneSystem,
             messages: [
                 {
                     role: 'user',
@@ -210,6 +217,12 @@ class GeminiService {
             apiKey,
             temperature: 0.2,
             maxTokens: 32768,
+            // Codex-only knobs; pi-ai's google provider ignores unknown keys.
+            // pi-ai omits `reasoning.effort` by default (server default ~medium); we
+            // force xhigh for deepest analysis -- gpt-5.5's thinkingLevelMap maps
+            // xhigh -> "max". Verbosity stays at pi-ai's "low" default (terse output
+            // is fine; reasoning depth is what was missing).
+            reasoningEffort: 'xhigh',
         });
         return (0, piAiClient_1.extractFinalText)(response);
     }
@@ -361,15 +374,35 @@ class GeminiService {
         }
     }
     // Split audio file into segments
-    async splitAudioIntoSegments(audioFilePath, segmentDuration = 300) {
+    async splitAudioIntoSegments(audioFilePath, segmentDuration = 300,
+    // `reencode`: re-encode segments instead of using `-c copy`. ffmpeg's
+    // segment muxer can only cut at keyframes when copying, and webm-opus has
+    // near-zero keyframes by default -- so `-c copy -segment_time 300` silently
+    // produces 30+ minute segments that blow past gpt-4o-transcribe's
+    // 1400-second per-request limit. The caller passes `true` as the third
+    // argument on the Codex transcription path; Gemini's API is tolerant of
+    // long inputs and stays on the faster `-c copy` path.
+    reencode = false) {
         const outputDir = path.dirname(audioFilePath);
         const baseName = path.basename(audioFilePath, path.extname(audioFilePath));
         const ext = path.extname(audioFilePath);
-        const segmentPath = path.join(outputDir, `${baseName}_segment_%03d${ext}`);
+        // When re-encoding to opus we MUST force a container that supports
+        // opus -- ffmpeg picks the muxer from the output extension, so leaving
+        // an imported `.mp3`/`.m4a`/`.wav` source as `.mp3` makes ffmpeg pick
+        // the MP3 muxer and reject the opus stream. `.webm` is in OpenAI's
+        // supported transcription extensions, so the segments still upload.
+        const segmentExt = reencode ? '.webm' : ext;
+        const segmentPath = path.join(outputDir, `${baseName}_segment_%03d${segmentExt}`);
         // Get the bundled FFmpeg path
         const ffmpegPath = await this.getFFmpegPath();
         try {
-            // Split audio into segments
+            const codecArgs = reencode ? ['-c:a', 'libopus', '-b:a', '48k'] : ['-c', 'copy'];
+            // Split audio into segments. `-reset_timestamps 1` makes each segment
+            // start at PTS 0 and gives it its own container duration. Without it,
+            // webm output keeps the source file's total duration in the header --
+            // and OpenAI rejects the request based on the header value even when
+            // the actual encoded audio is short (`audio duration N seconds is
+            // longer than 1400` errors on small last-segment files).
             await execFileAsync(ffmpegPath, [
                 '-i',
                 audioFilePath,
@@ -377,14 +410,17 @@ class GeminiService {
                 'segment',
                 '-segment_time',
                 String(segmentDuration),
-                '-c',
-                'copy',
+                '-reset_timestamps',
+                '1',
+                ...codecArgs,
                 segmentPath,
             ]);
-            // Find all created segment files
+            // Find all created segment files. Match on the EXTENSION WE TOLD
+            // FFMPEG TO WRITE -- when re-encoding, that's `.webm` regardless of
+            // the source's original extension.
             const segmentFiles = fs
                 .readdirSync(outputDir)
-                .filter((file) => file.startsWith(`${baseName}_segment_`) && file.endsWith(ext))
+                .filter((file) => file.startsWith(`${baseName}_segment_`) && file.endsWith(segmentExt))
                 .map((file) => path.join(outputDir, file))
                 .sort();
             console.error(`Split audio into ${segmentFiles.length} segments`);
@@ -438,6 +474,13 @@ class GeminiService {
             let fullTranscript = '';
             const stats = fs.statSync(audioFilePath);
             const fileSizeInMB = stats.size / (1024 * 1024);
+            // Segment intentionally for parallelism: even when the API would
+            // accept the whole file (Gemini long-context, gpt-4o-transcribe-diarize
+            // via chunking_strategy=auto), N parallel 5-min requests finish much
+            // faster than one big sequential pass. Trade-off for the diarize
+            // model: speaker IDs are mapped fresh per segment ("Speaker 0" in
+            // segment 1 may not be the same physical person as "Speaker 0" in
+            // segment 2). See docs/model-pricing.md.
             const shouldSegment = duration > 300 || (this.provider === 'codex' && fileSizeInMB > 24);
             const segmentDuration = this.provider === 'codex' && duration > 0 && fileSizeInMB > 20
                 ? Math.max(30, Math.min(300, Math.floor((20 / fileSizeInMB) * duration)))
@@ -562,7 +605,14 @@ Return as JSON:
                     getToken: () => this.getCodexToken(),
                     audioFilePath,
                     model: this.codexTranscriptionModel,
+                    // `prompt` is dropped inside transcribeCodexAudio when the
+                    // diarize model is active. Keep passing it -- the helper picks
+                    // the right shape per model.
                     prompt: transcriptPrompt,
+                    // Intentionally NOT passing `language: 'ko'`. Whisper-derived
+                    // transcription auto-detects from the first ~30s, which handles
+                    // bilingual/code-switched meetings (Korean primary, English
+                    // acronyms/quotes) better than forcing a single language.
                 });
             }
             const ai = this.gemini();
@@ -745,8 +795,11 @@ Return as JSON:
     // Get segmented transcript (renamed from transcribeAudioSegmented)
     async getSegmentedTranscript(audioFilePath, duration, progressCallback, customPrompt, segmentDuration = 300) {
         try {
-            // Split audio into 5-minute segments
-            const segmentFiles = await this.splitAudioIntoSegments(audioFilePath, segmentDuration);
+            // Split audio into 5-minute segments. Codex transcription requires
+            // accurate cut times (gpt-4o-transcribe rejects >1400s/segment), so
+            // force re-encode there; Gemini's API tolerates long inputs and we
+            // keep the cheaper `-c copy` path for it.
+            const segmentFiles = await this.splitAudioIntoSegments(audioFilePath, segmentDuration, this.provider === 'codex');
             if (progressCallback) {
                 progressCallback(20, `Processing ${segmentFiles.length} segments...`);
             }

package/dist/piAiClient.js CHANGED Viewed

@@ -23,11 +23,69 @@ async function getModel(provider, modelId) {
     // path for non-literal ids ("Custom Models" in pi-ai's README).
     return m.getModel(piId, modelId);
 }
+function summarizeContextSize(context) {
+    let chars = 0;
+    let toolCalls = 0;
+    let toolResults = 0;
+    for (const msg of context.messages) {
+        if (msg.role === 'user') {
+            chars +=
+                typeof msg.content === 'string'
+                    ? msg.content.length
+                    : msg.content.reduce((n, b) => n + (b.type === 'text' ? b.text.length : 0), 0);
+        }
+        else if (msg.role === 'assistant') {
+            for (const b of msg.content) {
+                if (b.type === 'text')
+                    chars += b.text.length;
+                else if (b.type === 'toolCall')
+                    toolCalls++;
+            }
+        }
+        else if (msg.role === 'toolResult') {
+            toolResults++;
+            for (const b of msg.content)
+                if (b.type === 'text')
+                    chars += b.text.length;
+        }
+    }
+    const systemChars = context.systemPrompt?.length ?? 0;
+    return `messages=${context.messages.length} chars=${chars + systemChars} (system=${systemChars}) toolCalls=${toolCalls} toolResults=${toolResults} tools=${context.tools?.length ?? 0}`;
+}
+// Strip options the target provider doesn't accept. OpenAI Codex routes
+// through GPT-5.x reasoning models which reject sampling parameters
+// (`Unsupported parameter: temperature`). pi-ai forwards options verbatim,
+// so the adjustment has to happen at our boundary -- doing it here keeps
+// callsites free of provider conditionals.
+function adjustOptionsForModel(model, options) {
+    if (!options)
+        return undefined;
+    const isCodex = model.api === 'openai-codex-responses' || model.provider === 'openai-codex';
+    if (isCodex) {
+        const { temperature: _t, ...rest } = options;
+        return { ...rest };
+    }
+    return { ...options };
+}
 async function complete(model, context, options) {
     const m = await loadPiAi();
-    // pi-ai's ProviderStreamOptions is `StreamOptions & Record<string, unknown>`;
-    // spread to satisfy the index-signature constraint.
-    return await m.complete(model, context, options ? { ...options } : undefined);
+    const tag = `[pi-ai ${model.provider}/${model.id}]`;
+    const startedAt = Date.now();
+    console.log(`${tag} -> ${summarizeContextSize(context)}`);
+    const adjustedOptions = adjustOptionsForModel(model, options);
+    const response = await m.complete(model, context, adjustedOptions);
+    const elapsed = Date.now() - startedAt;
+    const stop = response.stopReason ?? 'unknown';
+    const textChars = extractFinalText(response).length;
+    console.log(`${tag} <- ${elapsed}ms stop=${stop} textChars=${textChars} usage=in:${response.usage?.input ?? '?'}/out:${response.usage?.output ?? '?'}${response.errorMessage ? ` errorMessage=${response.errorMessage.slice(0, 300)}` : ''}`);
+    // pi-ai surfaces upstream failures via stopReason='error' rather than
+    // throwing. Without this, geminiService.generateSummary returns "" and
+    // agentService.run returns "(no answer)" with no breadcrumb. Promote the
+    // diagnostic into a thrown error so it reaches the renderer / CLI surface.
+    if (response.stopReason === 'error') {
+        throw new Error(`Pi-ai ${model.provider}/${model.id} failed: ${response.errorMessage ?? 'no errorMessage'}`);
+    }
+    return response;
 }
 async function getTypeBox() {
     const m = await loadPiAi();

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "listener-ai",
-  "version": "2.7.0",
+  "version": "2.7.2",
   "description": "A lightweight desktop application for recording and transcribing meetings with AI-powered notes.",
   "main": "dist/main.js",
   "bin": {