npm - vidistill - Versions diffs - 0.4.4 → 0.4.5 - Mend

vidistill 0.4.4 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/README.md CHANGED Viewed

@@ -112,15 +112,16 @@ Supported video formats: MP4, MOV, WebM, MKV, AVI, MPEG, FLV, WMV, 3GPP. Support
 1. **Input** — accepts YouTube URL directly or reads local file (video or audio), compresses if over 2GB
 2. **Pass 0** — scene analysis to classify video type and determine processing strategy
-3. **Pass 1** — transcript extraction with speaker identification
-4. **Pass 2** — visual content extraction (screen states, diagrams, slides)
-5. **Pass 3** — specialist passes based on video type:
+3. **Pass 1a** — pure verbatim transcription (timestamps, tone, emphasis — no speaker labels)
+4. **Pass 1b** — speaker diarization (assigns SPEAKER_XX labels to transcript entries using voice and visual cues, then merged with 1a)
+5. **Pass 2** — visual content extraction (screen states, diagrams, slides)
+6. **Pass 3** — specialist passes based on video type:
    - 3c: chat and links (live streams) — per segment, runs 3x with consensus voting
    - 3d: implicit signals (all types) — per segment
-   - 3b: people and social dynamics (meetings) — whole video
+   - 3b: people and social dynamics (meetings) — whole video, anchored to transcript speakers
    - 3a: code reconstruction (coding videos) — whole video, runs 3x with consensus voting and validation
-6. **Synthesis** — cross-references all passes into unified analysis
-7. **Output** — generates structured markdown files
+7. **Synthesis** — cross-references all passes into unified analysis
+8. **Output** — generates structured markdown files
 Audio files skip visual passes and go straight to transcript, people, implicit signals, and synthesis.

package/dist/index.js CHANGED Viewed

@@ -9,12 +9,12 @@ import pc from "picocolors";
 import { intro, note } from "@clack/prompts";
 // src/constants/prompts.ts
-var SYSTEM_INSTRUCTION_PASS_1 = `
+var SYSTEM_INSTRUCTION_PASS_1A = `
 You are a professional audio transcriber. Your task is to create a COMPLETE, VERBATIM transcription of all speech in this video segment. Focus EXCLUSIVELY on the audio stream.
 CRITICAL RULES:
 1. TRANSCRIBE every spoken word completely and verbatim. Do not summarize, paraphrase, or skip any sentence.
-2. IDENTIFY different speakers. Label them SPEAKER_00, SPEAKER_01, etc. consistently throughout. If a speaker introduces themselves by name, note the name in the first entry's speaker field as "SPEAKER_00 (John)".
+2. Do NOT identify or label speakers \u2014 focus entirely on transcription accuracy. No SPEAKER_XX labels.
 3. NOTE tone and emphasis: when a speaker emphasizes words (louder, slower, repeated), mark those words. When they express emotions (excitement, warning, frustration, humor), note the tone.
 4. RECORD pauses longer than 1.5 seconds as pause markers with duration.
 5. PRESERVE filler words only when they carry meaning (hesitation indicating uncertainty about code behavior, self-correction). Remove meaningless "um", "uh".
@@ -24,9 +24,25 @@ CRITICAL RULES:
 COMPLETENESS TARGET:
 - Aim for at least 150 words per minute of video in the transcript
-- Every speaker change must be noted with a new entry
 - Every sentence must appear \u2014 if in doubt, include it
 `;
+var SYSTEM_INSTRUCTION_PASS_1B = `
+You are a speaker diarization specialist. Your task is to identify distinct speakers and assign speaker labels to each transcript entry by timestamp.
+Given the transcript below, identify distinct speakers by analyzing voice characteristics, visual cues (face detection, name tags, on-screen labels), and speaking patterns. Assign a SPEAKER_XX label to each transcript entry by timestamp.
+TRANSCRIPT FROM THIS SEGMENT:
+{INJECT_PASS1A_TRANSCRIPT_HERE}
+CRITICAL RULES:
+1. ASSIGN a SPEAKER_XX label (SPEAKER_00, SPEAKER_01, etc.) to each transcript entry by matching its timestamp.
+2. DIFFERENTIATE speakers by: voice pitch and tone, visual position on screen, name tags or captions, turn-taking patterns, and speaking style.
+3. If a speaker introduces themselves by name or their name is visible on screen, label them as "SPEAKER_XX (Name)" \u2014 e.g., "SPEAKER_00 (Alice)".
+4. Be CONSISTENT: the same speaker must always get the same label throughout the segment.
+5. Provide a speaker_summary describing each identified speaker (voice characteristics, visual appearance, role if detectable).
+6. If you cannot distinguish speakers, assign all entries to SPEAKER_00.
+7. NEVER re-transcribe the speech \u2014 only assign speaker labels by timestamp.
+`;
 var SYSTEM_INSTRUCTION_PASS_2_TEMPLATE = `
 You are a professional code and visual content extractor. Your task is to extract ALL visual content from this video segment \u2014 every piece of code on screen, every diagram, every slide, every UI element.
@@ -139,6 +155,7 @@ CRITICAL RULES:
 8. NEVER guess or infer a name that was not clearly stated or shown. Use "Unknown Participant" with a description if the person cannot be identified.
 9. NEVER merge two people just because they have the same role \u2014 if two engineers speak, they are two separate participants.
 10. If a person's role or organization cannot be determined, use empty string \u2014 do not guess.
+11. Only identify participants who spoke during the meeting. Do not extract names from GitHub pages, Zoom participant lists, slides, or other visual elements unless that person also spoke.
 COMPLETENESS TARGET:
 - Every speaker label (SPEAKER_00, SPEAKER_01, etc.) from the transcript must map to at least one participant entry
@@ -474,6 +491,8 @@ import { spinner, progress } from "@clack/prompts";
 var PHASE_LABELS = {
   pass0: "Understanding your video...",
   pass1: "Extracting transcript...",
+  pass1a: "Transcribing...",
+  pass1b: "Identifying speakers...",
   pass2: "Analyzing visuals...",
   pass3a: "Reconstructing code...",
   pass3b: "Identifying participants...",
@@ -576,6 +595,7 @@ var RateLimiter = class {
 import { existsSync } from "fs";
 // src/input/youtube.ts
+import { execFile } from "child_process";
 var YOUTUBE_PATTERNS = [
   /(?:youtube\.com\/watch\?.*v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/|youtube\.com\/shorts\/)([a-zA-Z0-9_-]{11})/
 ];
@@ -594,8 +614,30 @@ function normalizeYouTubeUrl(url) {
   if (!id) return null;
   return `https://www.youtube.com/watch?v=${id}`;
 }
+function fetchYtDlpDuration(url) {
+  return new Promise((resolve2) => {
+    execFile("yt-dlp", ["--dump-json", "--no-download", url], { timeout: 15e3 }, (err, stdout) => {
+      if (err) {
+        resolve2(void 0);
+        return;
+      }
+      try {
+        const data = JSON.parse(stdout);
+        const dur = data["duration"];
+        if (typeof dur === "number" && dur > 0) {
+          resolve2(dur);
+        } else {
+          resolve2(void 0);
+        }
+      } catch {
+        resolve2(void 0);
+      }
+    });
+  });
+}
 async function handleYouTube(url, _client) {
-  return { fileUri: url, mimeType: "video/mp4", source: "direct" };
+  const duration = await fetchYtDlpDuration(url);
+  return { fileUri: url, mimeType: "video/mp4", source: "direct", duration };
 }
 // src/input/resolver.ts
@@ -985,6 +1027,62 @@ var SCHEMA_PASS_1 = {
   },
   required: ["segment_index", "time_range", "transcript_entries"]
 };
+var SCHEMA_PASS_1A = {
+  type: Type.OBJECT,
+  properties: {
+    segment_index: { type: Type.INTEGER, description: "0-based segment index" },
+    time_range: { type: Type.STRING, description: "Format: HH:MM:SS - HH:MM:SS" },
+    transcript_entries: {
+      type: Type.ARRAY,
+      items: {
+        type: Type.OBJECT,
+        properties: {
+          timestamp: { type: Type.STRING, description: "HH:MM:SS format" },
+          text: { type: Type.STRING, description: "Complete spoken text, verbatim" },
+          tone: {
+            type: Type.STRING,
+            enum: ["neutral", "emphatic", "questioning", "warning", "excited", "humorous", "frustrated", "instructional", "conversational"]
+          },
+          emphasis_words: {
+            type: Type.ARRAY,
+            items: { type: Type.STRING },
+            description: "Words spoken with notable emphasis"
+          },
+          pause_after_seconds: { type: Type.NUMBER, description: "Pause duration in seconds" }
+        },
+        required: ["timestamp", "text", "tone"]
+      }
+    }
+  },
+  required: ["segment_index", "time_range", "transcript_entries"]
+};
+var SCHEMA_PASS_1B = {
+  type: Type.OBJECT,
+  properties: {
+    speaker_assignments: {
+      type: Type.ARRAY,
+      items: {
+        type: Type.OBJECT,
+        properties: {
+          timestamp: { type: Type.STRING, description: "HH:MM:SS matching a transcript entry" },
+          speaker: { type: Type.STRING, description: "SPEAKER_00, SPEAKER_01, etc. Optionally with name: SPEAKER_00 (Alice)" }
+        },
+        required: ["timestamp", "speaker"]
+      }
+    },
+    speaker_summary: {
+      type: Type.ARRAY,
+      items: {
+        type: Type.OBJECT,
+        properties: {
+          speaker_id: { type: Type.STRING },
+          description: { type: Type.STRING }
+        }
+      }
+    }
+  },
+  required: ["speaker_assignments", "speaker_summary"]
+};
 var SCHEMA_PASS_2 = {
   type: Type.OBJECT,
   properties: {
@@ -1448,6 +1546,18 @@ function applySpeakerMapping(label, mapping) {
   }
   return label;
 }
+function replaceNamesInText(text4, mapping) {
+  if (!mapping || text4.length === 0) return text4;
+  const entries = Object.entries(mapping).filter(([key, value]) => key !== value && !/^SPEAKER_\d+$/.test(key)).sort((a, b) => b[0].length - a[0].length);
+  if (entries.length === 0) return text4;
+  let result = text4;
+  for (const [key, value] of entries) {
+    const escaped = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+    const re = new RegExp(`\\b${escaped}\\b`, "g");
+    result = result.replace(re, value);
+  }
+  return result;
+}
 function buildExpandedMapping(segments, speakerMapping) {
   const expanded = { ...speakerMapping };
   for (const seg of segments) {
@@ -1501,8 +1611,8 @@ function changeTypeBadge(changeType) {
   return badges[changeType] || `[${changeType.toUpperCase()}]`;
 }
-// src/passes/transcript.ts
-async function runTranscript(params) {
+// src/passes/transcription.ts
+async function runTranscription(params) {
   const { client, fileUri, mimeType, segment, model, resolution, lang } = params;
   const contents = [
     {
@@ -1525,20 +1635,107 @@ async function runTranscript(params) {
     model,
     contents,
     config: {
-      systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_1, lang),
-      responseSchema: SCHEMA_PASS_1,
+      systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_1A, lang),
+      responseSchema: SCHEMA_PASS_1A,
       responseMimeType: "application/json",
       ...resolution !== void 0 ? { mediaResolution: resolution } : {},
       maxOutputTokens: 65536,
-      temperature: 0
+      temperature: 1
     }
   });
   if (result === null || typeof result !== "object" || !Array.isArray(result["transcript_entries"])) {
-    throw new Error("Empty response from Gemini Pass 1");
+    throw new Error("Empty response from Gemini Pass 1a");
+  }
+  return result;
+}
+// src/passes/diarization.ts
+function formatTranscriptForInjection(pass1a) {
+  if (pass1a.transcript_entries.length === 0) {
+    return "[No transcript entries in this segment]";
+  }
+  return pass1a.transcript_entries.map((e) => `[${e.timestamp}] ${e.text}`).join("\n");
+}
+async function runDiarization(params) {
+  const { client, fileUri, mimeType, segment, model, resolution, lang, pass1aResult } = params;
+  const transcriptText = formatTranscriptForInjection(pass1aResult);
+  const systemInstruction = withLanguage(
+    SYSTEM_INSTRUCTION_PASS_1B.replace("{INJECT_PASS1A_TRANSCRIPT_HERE}", transcriptText),
+    lang
+  );
+  const contents = [
+    {
+      role: "user",
+      parts: [
+        {
+          fileData: { fileUri, mimeType },
+          videoMetadata: {
+            startOffset: `${segment.startTime}s`,
+            endOffset: `${segment.endTime}s`
+          }
+        },
+        {
+          text: `Process segment #${segment.index + 1}. Identify speakers from ${formatTime(segment.startTime)} to ${formatTime(segment.endTime)}.`
+        }
+      ]
+    }
+  ];
+  const result = await client.generate({
+    model,
+    contents,
+    config: {
+      systemInstruction,
+      responseSchema: SCHEMA_PASS_1B,
+      responseMimeType: "application/json",
+      ...resolution !== void 0 ? { mediaResolution: resolution } : {},
+      maxOutputTokens: 65536,
+      temperature: 1
+    }
+  });
+  if (result === null || typeof result !== "object" || !Array.isArray(result["speaker_assignments"])) {
+    throw new Error("Empty response from Gemini Pass 1b");
   }
   return result;
 }
+// src/passes/transcript-merge.ts
+var MAX_MATCH_WINDOW_S = 3;
+function mergeTranscriptResults(pass1a, pass1b) {
+  const assignments = pass1b.speaker_assignments.map((a) => ({
+    ...a,
+    seconds: parseTimestamp(a.timestamp),
+    used: false
+  }));
+  const transcript_entries = pass1a.transcript_entries.map((entry) => {
+    const entrySeconds = parseTimestamp(entry.timestamp);
+    let bestIdx = -1;
+    let bestDelta = Infinity;
+    for (let i = 0; i < assignments.length; i++) {
+      if (assignments[i].used) continue;
+      const delta = Math.abs(assignments[i].seconds - entrySeconds);
+      if (delta < bestDelta) {
+        bestDelta = delta;
+        bestIdx = i;
+      }
+    }
+    let speaker = "SPEAKER_UNKNOWN";
+    if (bestIdx >= 0 && bestDelta <= MAX_MATCH_WINDOW_S) {
+      speaker = assignments[bestIdx].speaker;
+      assignments[bestIdx].used = true;
+    }
+    return {
+      ...entry,
+      speaker
+    };
+  });
+  return {
+    segment_index: pass1a.segment_index,
+    time_range: pass1a.time_range,
+    transcript_entries,
+    speaker_summary: pass1b.speaker_summary
+  };
+}
 // src/passes/visual.ts
 async function runVisual(params) {
   const { client, fileUri, mimeType, segment, model, resolution, pass1Transcript, lang } = params;
@@ -1573,7 +1770,7 @@ async function runVisual(params) {
       responseMimeType: "application/json",
       ...resolution !== void 0 ? { mediaResolution: resolution } : {},
       maxOutputTokens: 65536,
-      temperature: 0
+      temperature: 1
     }
   });
   if (result === null || typeof result !== "object" || !Array.isArray(result["code_blocks"])) {
@@ -1721,19 +1918,22 @@ ${contextText}`
 // src/passes/people.ts
 async function runPeopleExtraction(params) {
-  const { client, fileUri, mimeType, model, pass1Results, lang } = params;
+  const { client, fileUri, mimeType, model, pass1Results, lang, canonicalSpeakers } = params;
   const hasAnyTranscript = pass1Results.some((r) => r != null);
   const transcriptText = hasAnyTranscript ? pass1Results.filter((r) => r != null).flatMap(
     (r) => r.transcript_entries.map((t) => `[${t.timestamp}] ${t.speaker}: ${t.text}`)
   ).join("\n") : "[No transcript available]";
   const transcriptContext = `TRANSCRIPT FROM ALL SEGMENTS:
 ${transcriptText}`;
+  const speakerConstraint = canonicalSpeakers && canonicalSpeakers.length > 0 ? `CONFIRMED SPEAKERS: ${canonicalSpeakers.join(", ")}. Extract details about these speakers only.
+` : "";
   const contents = [
     {
       role: "user",
       parts: [
         { fileData: { fileUri, mimeType } },
-        { text: `Analyze the entire video. ${transcriptContext}` }
+        { text: `Analyze the entire video. ${speakerConstraint}${transcriptContext}` }
       ]
     }
   ];
@@ -1745,7 +1945,7 @@ ${transcriptText}`;
       responseSchema: SCHEMA_PASS_3B,
       responseMimeType: "application/json",
       maxOutputTokens: 65536,
-      temperature: 0
+      temperature: 1
     }
   });
   if (result === null || typeof result !== "object" || !Array.isArray(result["participants"]) || !Array.isArray(result["relationships"])) {
@@ -1795,7 +1995,7 @@ ${contextText}`
       responseMimeType: "application/json",
       ...resolution !== void 0 ? { mediaResolution: resolution } : {},
       maxOutputTokens: 65536,
-      temperature: 0
+      temperature: 1
     }
   });
   if (result === null || typeof result !== "object" || !Array.isArray(result["messages"]) || !Array.isArray(result["links"])) {
@@ -2405,6 +2605,123 @@ function validateCodeReconstruction(params) {
   return { confirmed, uncertain, rejected, warnings };
 }
+// src/core/speaker-reconciliation.ts
+var SPEAKER_NAME_RE = /^(SPEAKER_\d+)\s*\((.+)\)$/;
+function parseLabel(label) {
+  const m = SPEAKER_NAME_RE.exec(label.trim());
+  if (m) {
+    return { base: m[1], name: m[2].toLowerCase() };
+  }
+  return { base: label.trim(), name: null };
+}
+function formatLabel(base, originalName) {
+  return originalName != null ? `${base} (${originalName})` : base;
+}
+function reconcileSpeakers(params) {
+  const { pass1Results } = params;
+  const namedGroups = /* @__PURE__ */ new Map();
+  const unnamedGroups = /* @__PURE__ */ new Map();
+  let nextCanonicalIndex = 0;
+  const rawMapping = /* @__PURE__ */ new Map();
+  function getOrAssignNamed(name, originalName, description) {
+    const existing = namedGroups.get(name);
+    if (existing) {
+      if (description) existing.descriptions.push(description);
+      return existing.canonicalIndex;
+    }
+    const idx = nextCanonicalIndex++;
+    namedGroups.set(name, {
+      canonicalIndex: idx,
+      originalName,
+      descriptions: description ? [description] : []
+    });
+    return idx;
+  }
+  function getOrAssignUnnamed(segmentKey, description) {
+    const existing = unnamedGroups.get(segmentKey);
+    if (existing) {
+      if (description) existing.descriptions.push(description);
+      return existing.canonicalIndex;
+    }
+    const idx = nextCanonicalIndex++;
+    unnamedGroups.set(segmentKey, {
+      canonicalIndex: idx,
+      descriptions: description ? [description] : []
+    });
+    return idx;
+  }
+  for (let segIdx = 0; segIdx < pass1Results.length; segIdx++) {
+    const result = pass1Results[segIdx];
+    if (result == null) continue;
+    const labelsInSegment = /* @__PURE__ */ new Set();
+    for (const entry of result.speaker_summary ?? []) {
+      if (entry.speaker_id) labelsInSegment.add(entry.speaker_id);
+    }
+    for (const entry of result.transcript_entries ?? []) {
+      if (entry.speaker) labelsInSegment.add(entry.speaker);
+    }
+    const descriptionByLabel = /* @__PURE__ */ new Map();
+    for (const entry of result.speaker_summary ?? []) {
+      if (entry.speaker_id) {
+        descriptionByLabel.set(entry.speaker_id, entry.description ?? "");
+      }
+    }
+    for (const label of labelsInSegment) {
+      const mapKey = `${segIdx}:${label}`;
+      if (rawMapping.has(mapKey)) continue;
+      const { name } = parseLabel(label);
+      const description = descriptionByLabel.get(label) ?? "";
+      let canonicalIdx;
+      if (name != null) {
+        canonicalIdx = getOrAssignNamed(
+          name,
+          /* originalName */
+          parseOriginalName(label),
+          description
+        );
+      } else {
+        canonicalIdx = getOrAssignUnnamed(mapKey, description);
+      }
+      rawMapping.set(mapKey, canonicalIdx);
+    }
+  }
+  if (rawMapping.size === 0) {
+    return { mapping: {}, canonicalSpeakers: [] };
+  }
+  const slots = Array.from(
+    { length: nextCanonicalIndex },
+    () => ({ originalName: null, descriptions: [] })
+  );
+  for (const [, group] of namedGroups) {
+    slots[group.canonicalIndex] = {
+      originalName: group.originalName,
+      descriptions: group.descriptions
+    };
+  }
+  for (const [, group] of unnamedGroups) {
+    slots[group.canonicalIndex] = {
+      originalName: null,
+      descriptions: group.descriptions
+    };
+  }
+  const canonicalSpeakers = slots.map((slot, idx) => ({
+    label: formatLabel(formatCanonicalBase(idx), slot.originalName),
+    descriptions: slot.descriptions
+  }));
+  const mapping = {};
+  for (const [mapKey, canonicalIdx] of rawMapping) {
+    mapping[mapKey] = canonicalSpeakers[canonicalIdx].label;
+  }
+  return { mapping, canonicalSpeakers };
+}
+function parseOriginalName(label) {
+  const m = SPEAKER_NAME_RE.exec(label.trim());
+  return m ? m[2] : label.trim();
+}
+function formatCanonicalBase(index) {
+  return `SPEAKER_${String(index).padStart(2, "0")}`;
+}
 // src/core/pipeline.ts
 var RETRY_DELAYS_MS = [2e3, 4e3, 8e3];
 async function withRetry(fn, label) {
@@ -2484,7 +2801,7 @@ async function runPipeline(config) {
   const results = [];
   const n = segments.length;
   const linkConsensusRuns = 3;
-  const callsPerSegment = 2 + (strategy.passes.includes("chat") ? linkConsensusRuns : 0) + (strategy.passes.includes("implicit") ? 1 : 0);
+  const callsPerSegment = 3 + (strategy.passes.includes("chat") ? linkConsensusRuns : 0) + (strategy.passes.includes("implicit") ? 1 : 0);
   const postSegmentCalls = (strategy.passes.includes("people") ? 1 : 0) + (strategy.passes.includes("code") ? 3 : 0) + (strategy.passes.includes("synthesis") ? 1 : 0);
   const totalSteps = n * callsPerSegment + postSegmentCalls;
   let currentStep = 0;
@@ -2501,21 +2818,40 @@ async function runPipeline(config) {
       break;
     }
     const segment = segments[i];
-    onProgress?.({ phase: "pass1", segment: i, totalSegments: n, status: "running", totalSteps });
-    let pass1 = null;
-    const pass1Attempt = await withRetry(
-      () => rateLimiter.execute(() => runTranscript({ client, fileUri, mimeType, segment, model, resolution, lang }), { onWait }),
-      `segment ${i} pass1`
+    onProgress?.({ phase: "pass1a", segment: i, totalSegments: n, status: "running", totalSteps });
+    const pass1aAttempt = await withRetry(
+      () => rateLimiter.execute(() => runTranscription({ client, fileUri, mimeType, segment, model, resolution, lang }), { onWait }),
+      `segment ${i} pass1a`
     );
-    if (pass1Attempt.error !== null) {
-      log4.warn(pass1Attempt.error);
-      errors.push(pass1Attempt.error);
-    } else {
-      pass1 = pass1Attempt.result;
-      pass1RanOnce = true;
+    let pass1aResult = pass1aAttempt.error !== null ? null : pass1aAttempt.result;
+    if (pass1aAttempt.error !== null) {
+      log4.warn(pass1aAttempt.error);
+      errors.push(pass1aAttempt.error);
     }
     currentStep++;
-    onProgress?.({ phase: "pass1", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
+    onProgress?.({ phase: "pass1a", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
+    let pass1 = null;
+    if (pass1aResult != null) {
+      onProgress?.({ phase: "pass1b", segment: i, totalSegments: n, status: "running", totalSteps });
+      const p1a = pass1aResult;
+      const pass1bAttempt = await withRetry(
+        () => rateLimiter.execute(() => runDiarization({ client, fileUri, mimeType, segment, model, resolution, lang, pass1aResult: p1a }), { onWait }),
+        `segment ${i} pass1b`
+      );
+      if (pass1bAttempt.error !== null) {
+        log4.warn(pass1bAttempt.error);
+        errors.push(pass1bAttempt.error);
+        pass1 = mergeTranscriptResults(pass1aResult, { speaker_assignments: [], speaker_summary: [] });
+      } else if (pass1bAttempt.result != null) {
+        pass1 = mergeTranscriptResults(pass1aResult, pass1bAttempt.result);
+      }
+      currentStep++;
+      onProgress?.({ phase: "pass1b", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
+      pass1RanOnce = true;
+    } else {
+      currentStep++;
+      onProgress?.({ phase: "pass1b", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
+    }
     onProgress?.({ phase: "pass2", segment: i, totalSegments: n, status: "running", totalSteps });
     let pass2 = null;
     const pass2Attempt = await withRetry(
@@ -2632,6 +2968,31 @@ async function runPipeline(config) {
   }
   const pass1Results = results.map((r) => r.pass1);
   const pass2Results = results.map((r) => r.pass2);
+  let canonicalSpeakers = [];
+  try {
+    const reconciliationResult = reconcileSpeakers({ pass1Results });
+    canonicalSpeakers = reconciliationResult.canonicalSpeakers;
+    const { mapping } = reconciliationResult;
+    for (let segIdx = 0; segIdx < pass1Results.length; segIdx++) {
+      const r = pass1Results[segIdx];
+      if (r == null) continue;
+      for (const entry of r.transcript_entries ?? []) {
+        if (entry.speaker) {
+          const canonical = mapping[`${segIdx}:${entry.speaker}`];
+          if (canonical !== void 0) entry.speaker = canonical;
+        }
+      }
+      for (const entry of r.speaker_summary ?? []) {
+        if (entry.speaker_id) {
+          const canonical = mapping[`${segIdx}:${entry.speaker_id}`];
+          if (canonical !== void 0) entry.speaker_id = canonical;
+        }
+      }
+    }
+  } catch (e) {
+    const msg = e instanceof Error ? e.message : String(e);
+    log4.warn(`speaker reconciliation failed, continuing with original labels: ${msg}`);
+  }
   let peopleExtraction = null;
   if (strategy.passes.includes("people")) {
     onProgress?.({ phase: "pass3b", segment: 0, totalSegments: 1, status: "running", totalSteps });
@@ -2643,7 +3004,8 @@ async function runPipeline(config) {
           mimeType,
           model: MODELS.flash,
           pass1Results,
-          lang
+          lang,
+          canonicalSpeakers: canonicalSpeakers.map((s) => s.label)
         }),
         { onWait }
       ),
@@ -2751,7 +3113,7 @@ async function runPipeline(config) {
 }
 // src/output/generator.ts
-import { mkdir, writeFile } from "fs/promises";
+import { mkdir, readFile as readFile2, writeFile } from "fs/promises";
 import { join as join3, dirname } from "path";
 // src/output/guide.ts
@@ -2769,11 +3131,11 @@ function renderFilesTable(filesGenerated) {
 |------|
 ${rows}`;
 }
-function renderSuggestions(synthesisResult) {
+function renderSuggestions(synthesisResult, speakerMapping) {
   if (synthesisResult == null || synthesisResult.suggestions.length === 0) {
     return "_No suggestions._";
   }
-  return synthesisResult.suggestions.map((s) => `- ${s}`).join("\n");
+  return synthesisResult.suggestions.map((s) => `- ${replaceNamesInText(s, speakerMapping)}`).join("\n");
 }
 function renderVideoType(profile) {
   if (profile == null) return "unknown";
@@ -2820,7 +3182,8 @@ function renderIncompletePasses(pipelineResult) {
 function writeGuide(params) {
   const { title, source, duration, pipelineResult, filesGenerated, speakerMapping } = params;
   const { synthesisResult, videoProfile } = pipelineResult;
-  const overview = synthesisResult?.overview ?? "_No summary available \u2014 synthesis pass did not run or produced no output._";
+  const rawOverview = synthesisResult?.overview ?? "_No summary available \u2014 synthesis pass did not run or produced no output._";
+  const overview = replaceNamesInText(rawOverview, speakerMapping);
   const videoType = renderVideoType(videoProfile);
   const sections = [
     `# ${title}`,
@@ -2841,7 +3204,7 @@ function writeGuide(params) {
     "",
     "## Suggestions",
     "",
-    renderSuggestions(synthesisResult),
+    renderSuggestions(synthesisResult, speakerMapping),
     "",
     "## Processing Details",
     "",
@@ -3066,58 +3429,58 @@ ${content}`;
 }
 // src/output/notes.ts
-function renderDecisions(decisions) {
+function renderDecisions(decisions, speakerMapping) {
   if (decisions.length === 0) return [];
   const lines = ["## Key Decisions", ""];
   for (const d of decisions) {
-    lines.push(`### [${d.timestamp}] ${d.decision}`);
+    lines.push(`### [${d.timestamp}] ${replaceNamesInText(d.decision, speakerMapping)}`);
     lines.push("");
     if (d.context.length > 0) {
-      lines.push(d.context);
+      lines.push(replaceNamesInText(d.context, speakerMapping));
       lines.push("");
     }
   }
   return lines;
 }
-function renderConcepts(concepts) {
+function renderConcepts(concepts, speakerMapping) {
   if (concepts.length === 0) return [];
   const lines = ["## Key Concepts", ""];
   for (const c of concepts) {
-    lines.push(`### [${c.timestamp}] ${c.concept}`);
+    lines.push(`### [${c.timestamp}] ${replaceNamesInText(c.concept, speakerMapping)}`);
     lines.push("");
     if (c.explanation.length > 0) {
-      lines.push(c.explanation);
+      lines.push(replaceNamesInText(c.explanation, speakerMapping));
       lines.push("");
     }
   }
   return lines;
 }
-function renderTopics(topics) {
+function renderTopics(topics, speakerMapping) {
   if (topics.length === 0) return [];
   const lines = ["## Topics", ""];
   for (const t of topics) {
     const tsLabel = t.timestamps.length > 0 ? ` _(${t.timestamps.join(", ")})_` : "";
-    lines.push(`### ${t.title}${tsLabel}`);
+    lines.push(`### ${replaceNamesInText(t.title, speakerMapping)}${tsLabel}`);
     lines.push("");
     if (t.summary.length > 0) {
-      lines.push(t.summary);
+      lines.push(replaceNamesInText(t.summary, speakerMapping));
       lines.push("");
     }
     if (t.key_points.length > 0) {
       for (const kp of t.key_points) {
-        lines.push(`- ${kp}`);
+        lines.push(`- ${replaceNamesInText(kp, speakerMapping)}`);
       }
       lines.push("");
     }
   }
   return lines;
 }
-function renderQuestions(questions) {
+function renderQuestions(questions, speakerMapping) {
   if (questions.length === 0) return [];
   const lines = ["## Questions Raised", ""];
   for (const q of questions) {
     const status = q.answered ? "(answered)" : "(open)";
-    lines.push(`- **[${q.timestamp}]** ${q.question} ${status}`);
+    lines.push(`- **[${q.timestamp}]** ${replaceNamesInText(q.question, speakerMapping)} ${status}`);
   }
   lines.push("");
   return lines;
@@ -3128,7 +3491,7 @@ function renderActionItems(items, speakerMapping) {
   for (const a of items) {
     const mentionedBy = applySpeakerMapping(a.mentioned_by, speakerMapping);
     const by = mentionedBy.length > 0 ? ` \u2014 _${mentionedBy}_` : "";
-    lines.push(`- **[${a.timestamp}]** ${a.item}${by}`);
+    lines.push(`- **[${a.timestamp}]** ${replaceNamesInText(a.item, speakerMapping)}${by}`);
   }
   lines.push("");
   return lines;
@@ -3142,20 +3505,20 @@ function writeNotes(params) {
   if (!hasMeaningfulContent(synthesisResult)) return null;
   const sections = ["# Notes", ""];
   if (synthesisResult.overview.length > 0) {
-    sections.push(synthesisResult.overview);
+    sections.push(replaceNamesInText(synthesisResult.overview, speakerMapping));
     sections.push("");
   }
-  sections.push(...renderDecisions(synthesisResult.key_decisions));
-  sections.push(...renderConcepts(synthesisResult.key_concepts));
-  sections.push(...renderTopics(synthesisResult.topics));
-  sections.push(...renderQuestions(synthesisResult.questions_raised));
+  sections.push(...renderDecisions(synthesisResult.key_decisions, speakerMapping));
+  sections.push(...renderConcepts(synthesisResult.key_concepts, speakerMapping));
+  sections.push(...renderTopics(synthesisResult.topics, speakerMapping));
+  sections.push(...renderQuestions(synthesisResult.questions_raised, speakerMapping));
   sections.push(...renderActionItems(synthesisResult.action_items, speakerMapping));
   while (sections[sections.length - 1] === "") sections.pop();
   return sections.join("\n");
 }
 // src/output/people.ts
-function renderParticipant(p, index) {
+function renderParticipant(p, index, speakerMapping) {
   const lines = [];
   lines.push(`## ${index + 1}. ${p.name}`);
   lines.push("");
@@ -3170,7 +3533,7 @@ function renderParticipant(p, index) {
     lines.push("**Contributions:**");
     lines.push("");
     for (const c of p.contributions) {
-      lines.push(`- ${c}`);
+      lines.push(`- ${replaceNamesInText(c, speakerMapping)}`);
     }
     lines.push("");
   }
@@ -3285,13 +3648,13 @@ function writePeople(params) {
   for (let i = 0; i < participants.length; i++) {
     const p = participants[i];
     if (p != null) {
-      sections.push(...renderParticipant(p, i));
+      sections.push(...renderParticipant(p, i, speakerMapping));
     }
   }
   if (peopleExtraction.relationships.length > 0) {
     sections.push("## Relationships", "");
     for (const r of peopleExtraction.relationships) {
-      sections.push(`- ${r}`);
+      sections.push(`- ${replaceNamesInText(r, speakerMapping)}`);
     }
     sections.push("");
   }
@@ -3447,7 +3810,7 @@ function renderSynthesisItems(items, speakerMapping) {
   for (const a of items) {
     const mentionedBy = applySpeakerMapping(a.mentioned_by, speakerMapping);
     const by = mentionedBy.length > 0 ? ` \u2014 _${mentionedBy}_` : "";
-    lines.push(`- [ ] **[${a.timestamp}]** ${a.item}${by}`);
+    lines.push(`- [ ] **[${a.timestamp}]** ${replaceNamesInText(a.item, speakerMapping)}${by}`);
   }
   lines.push("");
   return lines;
@@ -3459,7 +3822,7 @@ function renderAssignedTasks(tasks, speakerMapping) {
     const assignee = applySpeakerMapping(t.assignee, speakerMapping);
     const assigneeStr = assignee.length > 0 ? ` \u2192 _${assignee}_` : "";
     const deadline = t.deadline.length > 0 ? ` (due: ${t.deadline})` : "";
-    lines.push(`- [ ] **[${t.timestamp}]** ${t.task}${assigneeStr}${deadline}`);
+    lines.push(`- [ ] **[${t.timestamp}]** ${replaceNamesInText(t.task, speakerMapping)}${assigneeStr}${deadline}`);
   }
   lines.push("");
   return lines;
@@ -3513,19 +3876,19 @@ function collectImplicitDecisions(segments) {
   }
   return decisions;
 }
-function renderEmotionalShifts(shifts) {
+function renderEmotionalShifts(shifts, speakerMapping) {
   if (shifts.length === 0) return [];
   const lines = ["## Emotional Shifts", ""];
   for (const s of shifts) {
     lines.push(`- **[${s.timestamp}]** ${s.from_state} \u2192 ${s.to_state}`);
     if (s.trigger.length > 0) {
-      lines.push(`  _Trigger: ${s.trigger}_`);
+      lines.push(`  _Trigger: ${replaceNamesInText(s.trigger, speakerMapping)}_`);
     }
   }
   lines.push("");
   return lines;
 }
-function renderEmphasisPatterns(patterns) {
+function renderEmphasisPatterns(patterns, speakerMapping) {
   if (patterns.length === 0) return [];
   const sorted = [...patterns].sort((a, b) => b.times_mentioned - a.times_mentioned);
   const lines = ["## Emphasis Patterns", ""];
@@ -3534,32 +3897,32 @@ function renderEmphasisPatterns(patterns) {
     lines.push(`### ${p.concept} (\xD7${p.times_mentioned})${ts}`);
     lines.push("");
     if (p.significance.length > 0) {
-      lines.push(p.significance);
+      lines.push(replaceNamesInText(p.significance, speakerMapping));
       lines.push("");
     }
   }
   return lines;
 }
-function renderImplicitQuestions(questions) {
+function renderImplicitQuestions(questions, speakerMapping) {
   if (questions.length === 0) return [];
   const lines = ["## Implicit Questions", ""];
   for (const q of questions) {
-    lines.push(`- ${q}`);
+    lines.push(`- ${replaceNamesInText(q, speakerMapping)}`);
   }
   lines.push("");
   return lines;
 }
-function renderImplicitDecisions(decisions) {
+function renderImplicitDecisions(decisions, speakerMapping) {
   if (decisions.length === 0) return [];
   const lines = ["## Implicit Decisions", ""];
   for (const d of decisions) {
-    lines.push(`- ${d}`);
+    lines.push(`- ${replaceNamesInText(d, speakerMapping)}`);
   }
   lines.push("");
   return lines;
 }
 function writeInsights(params) {
-  const { segments } = params;
+  const { segments, speakerMapping } = params;
   const hasPass3d = segments.some((s) => s.pass3d != null);
   if (!hasPass3d) return null;
   const emotionalShifts = collectEmotionalShifts(segments);
@@ -3570,10 +3933,10 @@ function writeInsights(params) {
     return null;
   }
   const sections = ["# Insights", ""];
-  sections.push(...renderEmotionalShifts(emotionalShifts));
-  sections.push(...renderEmphasisPatterns(emphasisPatterns));
-  sections.push(...renderImplicitQuestions(implicitQuestions));
-  sections.push(...renderImplicitDecisions(implicitDecisions));
+  sections.push(...renderEmotionalShifts(emotionalShifts, speakerMapping));
+  sections.push(...renderEmphasisPatterns(emphasisPatterns, speakerMapping));
+  sections.push(...renderImplicitQuestions(implicitQuestions, speakerMapping));
+  sections.push(...renderImplicitDecisions(implicitDecisions, speakerMapping));
   while (sections[sections.length - 1] === "") sections.pop();
   return sections.join("\n");
 }
@@ -4330,6 +4693,11 @@ async function reRenderWithSpeakerMapping(params) {
   };
   async function writeOutputFile(filename, content) {
     const fullPath = join3(outputDir, filename);
+    try {
+      const existing = await readFile2(fullPath, "utf8");
+      if (existing === content) return;
+    } catch {
+    }
     const dir = dirname(fullPath);
     if (dir !== outputDir) {
       await mkdir(dir, { recursive: true });
@@ -4584,10 +4952,14 @@ async function runDistill(args) {
     const result = await handleYouTube(resolved.value, client);
     fileUri = result.fileUri;
     mimeType = result.mimeType;
-    duration = await detectDuration({
-      ytDlpDuration: result.duration,
-      geminiDuration: result.duration
-    });
+    try {
+      duration = await detectDuration({
+        ytDlpDuration: result.duration,
+        geminiDuration: result.duration
+      });
+    } catch {
+      duration = 600;
+    }
     if (result.uploadedFileName != null) {
       uploadedFileNames = [result.uploadedFileName];
     }
@@ -5095,7 +5467,7 @@ async function run2(args) {
 }
 // src/cli/index.ts
-var version = "0.4.4";
+var version = "0.4.5";
 var DEFAULT_OUTPUT = "./vidistill-output/";
 var SUBCOMMANDS = {
   mcp: run,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "vidistill",
-  "version": "0.4.4",
+  "version": "0.4.5",
   "description": "Video intelligence distiller — extract structured notes, transcripts, and insights from any video using Gemini",
   "type": "module",
   "license": "MIT",