vidistill 0.4.4 → 0.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +7 -6
  2. package/dist/index.js +448 -76
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -112,15 +112,16 @@ Supported video formats: MP4, MOV, WebM, MKV, AVI, MPEG, FLV, WMV, 3GPP. Support
112
112
 
113
113
  1. **Input** — accepts YouTube URL directly or reads local file (video or audio), compresses if over 2GB
114
114
  2. **Pass 0** — scene analysis to classify video type and determine processing strategy
115
- 3. **Pass 1** — transcript extraction with speaker identification
116
- 4. **Pass 2** — visual content extraction (screen states, diagrams, slides)
117
- 5. **Pass 3** — specialist passes based on video type:
115
+ 3. **Pass 1a** — pure verbatim transcription (timestamps, tone, emphasis — no speaker labels)
116
+ 4. **Pass 1b** — speaker diarization (assigns SPEAKER_XX labels to transcript entries using voice and visual cues, then merged with 1a)
117
+ 5. **Pass 2** — visual content extraction (screen states, diagrams, slides)
118
+ 6. **Pass 3** — specialist passes based on video type:
118
119
  - 3c: chat and links (live streams) — per segment, runs 3x with consensus voting
119
120
  - 3d: implicit signals (all types) — per segment
120
- - 3b: people and social dynamics (meetings) — whole video
121
+ - 3b: people and social dynamics (meetings) — whole video, anchored to transcript speakers
121
122
  - 3a: code reconstruction (coding videos) — whole video, runs 3x with consensus voting and validation
122
- 6. **Synthesis** — cross-references all passes into unified analysis
123
- 7. **Output** — generates structured markdown files
123
+ 7. **Synthesis** — cross-references all passes into unified analysis
124
+ 8. **Output** — generates structured markdown files
124
125
 
125
126
  Audio files skip visual passes and go straight to transcript, people, implicit signals, and synthesis.
126
127
 
package/dist/index.js CHANGED
@@ -9,12 +9,12 @@ import pc from "picocolors";
9
9
  import { intro, note } from "@clack/prompts";
10
10
 
11
11
  // src/constants/prompts.ts
12
- var SYSTEM_INSTRUCTION_PASS_1 = `
12
+ var SYSTEM_INSTRUCTION_PASS_1A = `
13
13
  You are a professional audio transcriber. Your task is to create a COMPLETE, VERBATIM transcription of all speech in this video segment. Focus EXCLUSIVELY on the audio stream.
14
14
 
15
15
  CRITICAL RULES:
16
16
  1. TRANSCRIBE every spoken word completely and verbatim. Do not summarize, paraphrase, or skip any sentence.
17
- 2. IDENTIFY different speakers. Label them SPEAKER_00, SPEAKER_01, etc. consistently throughout. If a speaker introduces themselves by name, note the name in the first entry's speaker field as "SPEAKER_00 (John)".
17
+ 2. Do NOT identify or label speakers \u2014 focus entirely on transcription accuracy. No SPEAKER_XX labels.
18
18
  3. NOTE tone and emphasis: when a speaker emphasizes words (louder, slower, repeated), mark those words. When they express emotions (excitement, warning, frustration, humor), note the tone.
19
19
  4. RECORD pauses longer than 1.5 seconds as pause markers with duration.
20
20
  5. PRESERVE filler words only when they carry meaning (hesitation indicating uncertainty about code behavior, self-correction). Remove meaningless "um", "uh".
@@ -24,9 +24,25 @@ CRITICAL RULES:
24
24
 
25
25
  COMPLETENESS TARGET:
26
26
  - Aim for at least 150 words per minute of video in the transcript
27
- - Every speaker change must be noted with a new entry
28
27
  - Every sentence must appear \u2014 if in doubt, include it
29
28
  `;
29
+ var SYSTEM_INSTRUCTION_PASS_1B = `
30
+ You are a speaker diarization specialist. Your task is to identify distinct speakers and assign speaker labels to each transcript entry by timestamp.
31
+
32
+ Given the transcript below, identify distinct speakers by analyzing voice characteristics, visual cues (face detection, name tags, on-screen labels), and speaking patterns. Assign a SPEAKER_XX label to each transcript entry by timestamp.
33
+
34
+ TRANSCRIPT FROM THIS SEGMENT:
35
+ {INJECT_PASS1A_TRANSCRIPT_HERE}
36
+
37
+ CRITICAL RULES:
38
+ 1. ASSIGN a SPEAKER_XX label (SPEAKER_00, SPEAKER_01, etc.) to each transcript entry by matching its timestamp.
39
+ 2. DIFFERENTIATE speakers by: voice pitch and tone, visual position on screen, name tags or captions, turn-taking patterns, and speaking style.
40
+ 3. If a speaker introduces themselves by name or their name is visible on screen, label them as "SPEAKER_XX (Name)" \u2014 e.g., "SPEAKER_00 (Alice)".
41
+ 4. Be CONSISTENT: the same speaker must always get the same label throughout the segment.
42
+ 5. Provide a speaker_summary describing each identified speaker (voice characteristics, visual appearance, role if detectable).
43
+ 6. If you cannot distinguish speakers, assign all entries to SPEAKER_00.
44
+ 7. NEVER re-transcribe the speech \u2014 only assign speaker labels by timestamp.
45
+ `;
30
46
  var SYSTEM_INSTRUCTION_PASS_2_TEMPLATE = `
31
47
  You are a professional code and visual content extractor. Your task is to extract ALL visual content from this video segment \u2014 every piece of code on screen, every diagram, every slide, every UI element.
32
48
 
@@ -139,6 +155,7 @@ CRITICAL RULES:
139
155
  8. NEVER guess or infer a name that was not clearly stated or shown. Use "Unknown Participant" with a description if the person cannot be identified.
140
156
  9. NEVER merge two people just because they have the same role \u2014 if two engineers speak, they are two separate participants.
141
157
  10. If a person's role or organization cannot be determined, use empty string \u2014 do not guess.
158
+ 11. Only identify participants who spoke during the meeting. Do not extract names from GitHub pages, Zoom participant lists, slides, or other visual elements unless that person also spoke.
142
159
 
143
160
  COMPLETENESS TARGET:
144
161
  - Every speaker label (SPEAKER_00, SPEAKER_01, etc.) from the transcript must map to at least one participant entry
@@ -474,6 +491,8 @@ import { spinner, progress } from "@clack/prompts";
474
491
  var PHASE_LABELS = {
475
492
  pass0: "Understanding your video...",
476
493
  pass1: "Extracting transcript...",
494
+ pass1a: "Transcribing...",
495
+ pass1b: "Identifying speakers...",
477
496
  pass2: "Analyzing visuals...",
478
497
  pass3a: "Reconstructing code...",
479
498
  pass3b: "Identifying participants...",
@@ -576,6 +595,7 @@ var RateLimiter = class {
576
595
  import { existsSync } from "fs";
577
596
 
578
597
  // src/input/youtube.ts
598
+ import { execFile } from "child_process";
579
599
  var YOUTUBE_PATTERNS = [
580
600
  /(?:youtube\.com\/watch\?.*v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/|youtube\.com\/shorts\/)([a-zA-Z0-9_-]{11})/
581
601
  ];
@@ -594,8 +614,30 @@ function normalizeYouTubeUrl(url) {
594
614
  if (!id) return null;
595
615
  return `https://www.youtube.com/watch?v=${id}`;
596
616
  }
617
+ function fetchYtDlpDuration(url) {
618
+ return new Promise((resolve2) => {
619
+ execFile("yt-dlp", ["--dump-json", "--no-download", url], { timeout: 15e3 }, (err, stdout) => {
620
+ if (err) {
621
+ resolve2(void 0);
622
+ return;
623
+ }
624
+ try {
625
+ const data = JSON.parse(stdout);
626
+ const dur = data["duration"];
627
+ if (typeof dur === "number" && dur > 0) {
628
+ resolve2(dur);
629
+ } else {
630
+ resolve2(void 0);
631
+ }
632
+ } catch {
633
+ resolve2(void 0);
634
+ }
635
+ });
636
+ });
637
+ }
597
638
  async function handleYouTube(url, _client) {
598
- return { fileUri: url, mimeType: "video/mp4", source: "direct" };
639
+ const duration = await fetchYtDlpDuration(url);
640
+ return { fileUri: url, mimeType: "video/mp4", source: "direct", duration };
599
641
  }
600
642
 
601
643
  // src/input/resolver.ts
@@ -985,6 +1027,62 @@ var SCHEMA_PASS_1 = {
985
1027
  },
986
1028
  required: ["segment_index", "time_range", "transcript_entries"]
987
1029
  };
1030
+ var SCHEMA_PASS_1A = {
1031
+ type: Type.OBJECT,
1032
+ properties: {
1033
+ segment_index: { type: Type.INTEGER, description: "0-based segment index" },
1034
+ time_range: { type: Type.STRING, description: "Format: HH:MM:SS - HH:MM:SS" },
1035
+ transcript_entries: {
1036
+ type: Type.ARRAY,
1037
+ items: {
1038
+ type: Type.OBJECT,
1039
+ properties: {
1040
+ timestamp: { type: Type.STRING, description: "HH:MM:SS format" },
1041
+ text: { type: Type.STRING, description: "Complete spoken text, verbatim" },
1042
+ tone: {
1043
+ type: Type.STRING,
1044
+ enum: ["neutral", "emphatic", "questioning", "warning", "excited", "humorous", "frustrated", "instructional", "conversational"]
1045
+ },
1046
+ emphasis_words: {
1047
+ type: Type.ARRAY,
1048
+ items: { type: Type.STRING },
1049
+ description: "Words spoken with notable emphasis"
1050
+ },
1051
+ pause_after_seconds: { type: Type.NUMBER, description: "Pause duration in seconds" }
1052
+ },
1053
+ required: ["timestamp", "text", "tone"]
1054
+ }
1055
+ }
1056
+ },
1057
+ required: ["segment_index", "time_range", "transcript_entries"]
1058
+ };
1059
+ var SCHEMA_PASS_1B = {
1060
+ type: Type.OBJECT,
1061
+ properties: {
1062
+ speaker_assignments: {
1063
+ type: Type.ARRAY,
1064
+ items: {
1065
+ type: Type.OBJECT,
1066
+ properties: {
1067
+ timestamp: { type: Type.STRING, description: "HH:MM:SS matching a transcript entry" },
1068
+ speaker: { type: Type.STRING, description: "SPEAKER_00, SPEAKER_01, etc. Optionally with name: SPEAKER_00 (Alice)" }
1069
+ },
1070
+ required: ["timestamp", "speaker"]
1071
+ }
1072
+ },
1073
+ speaker_summary: {
1074
+ type: Type.ARRAY,
1075
+ items: {
1076
+ type: Type.OBJECT,
1077
+ properties: {
1078
+ speaker_id: { type: Type.STRING },
1079
+ description: { type: Type.STRING }
1080
+ }
1081
+ }
1082
+ }
1083
+ },
1084
+ required: ["speaker_assignments", "speaker_summary"]
1085
+ };
988
1086
  var SCHEMA_PASS_2 = {
989
1087
  type: Type.OBJECT,
990
1088
  properties: {
@@ -1448,6 +1546,18 @@ function applySpeakerMapping(label, mapping) {
1448
1546
  }
1449
1547
  return label;
1450
1548
  }
1549
+ function replaceNamesInText(text4, mapping) {
1550
+ if (!mapping || text4.length === 0) return text4;
1551
+ const entries = Object.entries(mapping).filter(([key, value]) => key !== value && !/^SPEAKER_\d+$/.test(key)).sort((a, b) => b[0].length - a[0].length);
1552
+ if (entries.length === 0) return text4;
1553
+ let result = text4;
1554
+ for (const [key, value] of entries) {
1555
+ const escaped = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
1556
+ const re = new RegExp(`\\b${escaped}\\b`, "g");
1557
+ result = result.replace(re, value);
1558
+ }
1559
+ return result;
1560
+ }
1451
1561
  function buildExpandedMapping(segments, speakerMapping) {
1452
1562
  const expanded = { ...speakerMapping };
1453
1563
  for (const seg of segments) {
@@ -1501,8 +1611,8 @@ function changeTypeBadge(changeType) {
1501
1611
  return badges[changeType] || `[${changeType.toUpperCase()}]`;
1502
1612
  }
1503
1613
 
1504
- // src/passes/transcript.ts
1505
- async function runTranscript(params) {
1614
+ // src/passes/transcription.ts
1615
+ async function runTranscription(params) {
1506
1616
  const { client, fileUri, mimeType, segment, model, resolution, lang } = params;
1507
1617
  const contents = [
1508
1618
  {
@@ -1525,20 +1635,107 @@ async function runTranscript(params) {
1525
1635
  model,
1526
1636
  contents,
1527
1637
  config: {
1528
- systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_1, lang),
1529
- responseSchema: SCHEMA_PASS_1,
1638
+ systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_1A, lang),
1639
+ responseSchema: SCHEMA_PASS_1A,
1530
1640
  responseMimeType: "application/json",
1531
1641
  ...resolution !== void 0 ? { mediaResolution: resolution } : {},
1532
1642
  maxOutputTokens: 65536,
1533
- temperature: 0
1643
+ temperature: 1
1534
1644
  }
1535
1645
  });
1536
1646
  if (result === null || typeof result !== "object" || !Array.isArray(result["transcript_entries"])) {
1537
- throw new Error("Empty response from Gemini Pass 1");
1647
+ throw new Error("Empty response from Gemini Pass 1a");
1648
+ }
1649
+ return result;
1650
+ }
1651
+
1652
+ // src/passes/diarization.ts
1653
+ function formatTranscriptForInjection(pass1a) {
1654
+ if (pass1a.transcript_entries.length === 0) {
1655
+ return "[No transcript entries in this segment]";
1656
+ }
1657
+ return pass1a.transcript_entries.map((e) => `[${e.timestamp}] ${e.text}`).join("\n");
1658
+ }
1659
+ async function runDiarization(params) {
1660
+ const { client, fileUri, mimeType, segment, model, resolution, lang, pass1aResult } = params;
1661
+ const transcriptText = formatTranscriptForInjection(pass1aResult);
1662
+ const systemInstruction = withLanguage(
1663
+ SYSTEM_INSTRUCTION_PASS_1B.replace("{INJECT_PASS1A_TRANSCRIPT_HERE}", transcriptText),
1664
+ lang
1665
+ );
1666
+ const contents = [
1667
+ {
1668
+ role: "user",
1669
+ parts: [
1670
+ {
1671
+ fileData: { fileUri, mimeType },
1672
+ videoMetadata: {
1673
+ startOffset: `${segment.startTime}s`,
1674
+ endOffset: `${segment.endTime}s`
1675
+ }
1676
+ },
1677
+ {
1678
+ text: `Process segment #${segment.index + 1}. Identify speakers from ${formatTime(segment.startTime)} to ${formatTime(segment.endTime)}.`
1679
+ }
1680
+ ]
1681
+ }
1682
+ ];
1683
+ const result = await client.generate({
1684
+ model,
1685
+ contents,
1686
+ config: {
1687
+ systemInstruction,
1688
+ responseSchema: SCHEMA_PASS_1B,
1689
+ responseMimeType: "application/json",
1690
+ ...resolution !== void 0 ? { mediaResolution: resolution } : {},
1691
+ maxOutputTokens: 65536,
1692
+ temperature: 1
1693
+ }
1694
+ });
1695
+ if (result === null || typeof result !== "object" || !Array.isArray(result["speaker_assignments"])) {
1696
+ throw new Error("Empty response from Gemini Pass 1b");
1538
1697
  }
1539
1698
  return result;
1540
1699
  }
1541
1700
 
1701
+ // src/passes/transcript-merge.ts
1702
+ var MAX_MATCH_WINDOW_S = 3;
1703
+ function mergeTranscriptResults(pass1a, pass1b) {
1704
+ const assignments = pass1b.speaker_assignments.map((a) => ({
1705
+ ...a,
1706
+ seconds: parseTimestamp(a.timestamp),
1707
+ used: false
1708
+ }));
1709
+ const transcript_entries = pass1a.transcript_entries.map((entry) => {
1710
+ const entrySeconds = parseTimestamp(entry.timestamp);
1711
+ let bestIdx = -1;
1712
+ let bestDelta = Infinity;
1713
+ for (let i = 0; i < assignments.length; i++) {
1714
+ if (assignments[i].used) continue;
1715
+ const delta = Math.abs(assignments[i].seconds - entrySeconds);
1716
+ if (delta < bestDelta) {
1717
+ bestDelta = delta;
1718
+ bestIdx = i;
1719
+ }
1720
+ }
1721
+ let speaker = "SPEAKER_UNKNOWN";
1722
+ if (bestIdx >= 0 && bestDelta <= MAX_MATCH_WINDOW_S) {
1723
+ speaker = assignments[bestIdx].speaker;
1724
+ assignments[bestIdx].used = true;
1725
+ }
1726
+ return {
1727
+ ...entry,
1728
+ speaker
1729
+ };
1730
+ });
1731
+ return {
1732
+ segment_index: pass1a.segment_index,
1733
+ time_range: pass1a.time_range,
1734
+ transcript_entries,
1735
+ speaker_summary: pass1b.speaker_summary
1736
+ };
1737
+ }
1738
+
1542
1739
  // src/passes/visual.ts
1543
1740
  async function runVisual(params) {
1544
1741
  const { client, fileUri, mimeType, segment, model, resolution, pass1Transcript, lang } = params;
@@ -1573,7 +1770,7 @@ async function runVisual(params) {
1573
1770
  responseMimeType: "application/json",
1574
1771
  ...resolution !== void 0 ? { mediaResolution: resolution } : {},
1575
1772
  maxOutputTokens: 65536,
1576
- temperature: 0
1773
+ temperature: 1
1577
1774
  }
1578
1775
  });
1579
1776
  if (result === null || typeof result !== "object" || !Array.isArray(result["code_blocks"])) {
@@ -1721,19 +1918,22 @@ ${contextText}`
1721
1918
 
1722
1919
  // src/passes/people.ts
1723
1920
  async function runPeopleExtraction(params) {
1724
- const { client, fileUri, mimeType, model, pass1Results, lang } = params;
1921
+ const { client, fileUri, mimeType, model, pass1Results, lang, canonicalSpeakers } = params;
1725
1922
  const hasAnyTranscript = pass1Results.some((r) => r != null);
1726
1923
  const transcriptText = hasAnyTranscript ? pass1Results.filter((r) => r != null).flatMap(
1727
1924
  (r) => r.transcript_entries.map((t) => `[${t.timestamp}] ${t.speaker}: ${t.text}`)
1728
1925
  ).join("\n") : "[No transcript available]";
1729
1926
  const transcriptContext = `TRANSCRIPT FROM ALL SEGMENTS:
1730
1927
  ${transcriptText}`;
1928
+ const speakerConstraint = canonicalSpeakers && canonicalSpeakers.length > 0 ? `CONFIRMED SPEAKERS: ${canonicalSpeakers.join(", ")}. Extract details about these speakers only.
1929
+
1930
+ ` : "";
1731
1931
  const contents = [
1732
1932
  {
1733
1933
  role: "user",
1734
1934
  parts: [
1735
1935
  { fileData: { fileUri, mimeType } },
1736
- { text: `Analyze the entire video. ${transcriptContext}` }
1936
+ { text: `Analyze the entire video. ${speakerConstraint}${transcriptContext}` }
1737
1937
  ]
1738
1938
  }
1739
1939
  ];
@@ -1745,7 +1945,7 @@ ${transcriptText}`;
1745
1945
  responseSchema: SCHEMA_PASS_3B,
1746
1946
  responseMimeType: "application/json",
1747
1947
  maxOutputTokens: 65536,
1748
- temperature: 0
1948
+ temperature: 1
1749
1949
  }
1750
1950
  });
1751
1951
  if (result === null || typeof result !== "object" || !Array.isArray(result["participants"]) || !Array.isArray(result["relationships"])) {
@@ -1795,7 +1995,7 @@ ${contextText}`
1795
1995
  responseMimeType: "application/json",
1796
1996
  ...resolution !== void 0 ? { mediaResolution: resolution } : {},
1797
1997
  maxOutputTokens: 65536,
1798
- temperature: 0
1998
+ temperature: 1
1799
1999
  }
1800
2000
  });
1801
2001
  if (result === null || typeof result !== "object" || !Array.isArray(result["messages"]) || !Array.isArray(result["links"])) {
@@ -2405,6 +2605,123 @@ function validateCodeReconstruction(params) {
2405
2605
  return { confirmed, uncertain, rejected, warnings };
2406
2606
  }
2407
2607
 
2608
+ // src/core/speaker-reconciliation.ts
2609
+ var SPEAKER_NAME_RE = /^(SPEAKER_\d+)\s*\((.+)\)$/;
2610
+ function parseLabel(label) {
2611
+ const m = SPEAKER_NAME_RE.exec(label.trim());
2612
+ if (m) {
2613
+ return { base: m[1], name: m[2].toLowerCase() };
2614
+ }
2615
+ return { base: label.trim(), name: null };
2616
+ }
2617
+ function formatLabel(base, originalName) {
2618
+ return originalName != null ? `${base} (${originalName})` : base;
2619
+ }
2620
+ function reconcileSpeakers(params) {
2621
+ const { pass1Results } = params;
2622
+ const namedGroups = /* @__PURE__ */ new Map();
2623
+ const unnamedGroups = /* @__PURE__ */ new Map();
2624
+ let nextCanonicalIndex = 0;
2625
+ const rawMapping = /* @__PURE__ */ new Map();
2626
+ function getOrAssignNamed(name, originalName, description) {
2627
+ const existing = namedGroups.get(name);
2628
+ if (existing) {
2629
+ if (description) existing.descriptions.push(description);
2630
+ return existing.canonicalIndex;
2631
+ }
2632
+ const idx = nextCanonicalIndex++;
2633
+ namedGroups.set(name, {
2634
+ canonicalIndex: idx,
2635
+ originalName,
2636
+ descriptions: description ? [description] : []
2637
+ });
2638
+ return idx;
2639
+ }
2640
+ function getOrAssignUnnamed(segmentKey, description) {
2641
+ const existing = unnamedGroups.get(segmentKey);
2642
+ if (existing) {
2643
+ if (description) existing.descriptions.push(description);
2644
+ return existing.canonicalIndex;
2645
+ }
2646
+ const idx = nextCanonicalIndex++;
2647
+ unnamedGroups.set(segmentKey, {
2648
+ canonicalIndex: idx,
2649
+ descriptions: description ? [description] : []
2650
+ });
2651
+ return idx;
2652
+ }
2653
+ for (let segIdx = 0; segIdx < pass1Results.length; segIdx++) {
2654
+ const result = pass1Results[segIdx];
2655
+ if (result == null) continue;
2656
+ const labelsInSegment = /* @__PURE__ */ new Set();
2657
+ for (const entry of result.speaker_summary ?? []) {
2658
+ if (entry.speaker_id) labelsInSegment.add(entry.speaker_id);
2659
+ }
2660
+ for (const entry of result.transcript_entries ?? []) {
2661
+ if (entry.speaker) labelsInSegment.add(entry.speaker);
2662
+ }
2663
+ const descriptionByLabel = /* @__PURE__ */ new Map();
2664
+ for (const entry of result.speaker_summary ?? []) {
2665
+ if (entry.speaker_id) {
2666
+ descriptionByLabel.set(entry.speaker_id, entry.description ?? "");
2667
+ }
2668
+ }
2669
+ for (const label of labelsInSegment) {
2670
+ const mapKey = `${segIdx}:${label}`;
2671
+ if (rawMapping.has(mapKey)) continue;
2672
+ const { name } = parseLabel(label);
2673
+ const description = descriptionByLabel.get(label) ?? "";
2674
+ let canonicalIdx;
2675
+ if (name != null) {
2676
+ canonicalIdx = getOrAssignNamed(
2677
+ name,
2678
+ /* originalName */
2679
+ parseOriginalName(label),
2680
+ description
2681
+ );
2682
+ } else {
2683
+ canonicalIdx = getOrAssignUnnamed(mapKey, description);
2684
+ }
2685
+ rawMapping.set(mapKey, canonicalIdx);
2686
+ }
2687
+ }
2688
+ if (rawMapping.size === 0) {
2689
+ return { mapping: {}, canonicalSpeakers: [] };
2690
+ }
2691
+ const slots = Array.from(
2692
+ { length: nextCanonicalIndex },
2693
+ () => ({ originalName: null, descriptions: [] })
2694
+ );
2695
+ for (const [, group] of namedGroups) {
2696
+ slots[group.canonicalIndex] = {
2697
+ originalName: group.originalName,
2698
+ descriptions: group.descriptions
2699
+ };
2700
+ }
2701
+ for (const [, group] of unnamedGroups) {
2702
+ slots[group.canonicalIndex] = {
2703
+ originalName: null,
2704
+ descriptions: group.descriptions
2705
+ };
2706
+ }
2707
+ const canonicalSpeakers = slots.map((slot, idx) => ({
2708
+ label: formatLabel(formatCanonicalBase(idx), slot.originalName),
2709
+ descriptions: slot.descriptions
2710
+ }));
2711
+ const mapping = {};
2712
+ for (const [mapKey, canonicalIdx] of rawMapping) {
2713
+ mapping[mapKey] = canonicalSpeakers[canonicalIdx].label;
2714
+ }
2715
+ return { mapping, canonicalSpeakers };
2716
+ }
2717
+ function parseOriginalName(label) {
2718
+ const m = SPEAKER_NAME_RE.exec(label.trim());
2719
+ return m ? m[2] : label.trim();
2720
+ }
2721
+ function formatCanonicalBase(index) {
2722
+ return `SPEAKER_${String(index).padStart(2, "0")}`;
2723
+ }
2724
+
2408
2725
  // src/core/pipeline.ts
2409
2726
  var RETRY_DELAYS_MS = [2e3, 4e3, 8e3];
2410
2727
  async function withRetry(fn, label) {
@@ -2484,7 +2801,7 @@ async function runPipeline(config) {
2484
2801
  const results = [];
2485
2802
  const n = segments.length;
2486
2803
  const linkConsensusRuns = 3;
2487
- const callsPerSegment = 2 + (strategy.passes.includes("chat") ? linkConsensusRuns : 0) + (strategy.passes.includes("implicit") ? 1 : 0);
2804
+ const callsPerSegment = 3 + (strategy.passes.includes("chat") ? linkConsensusRuns : 0) + (strategy.passes.includes("implicit") ? 1 : 0);
2488
2805
  const postSegmentCalls = (strategy.passes.includes("people") ? 1 : 0) + (strategy.passes.includes("code") ? 3 : 0) + (strategy.passes.includes("synthesis") ? 1 : 0);
2489
2806
  const totalSteps = n * callsPerSegment + postSegmentCalls;
2490
2807
  let currentStep = 0;
@@ -2501,21 +2818,40 @@ async function runPipeline(config) {
2501
2818
  break;
2502
2819
  }
2503
2820
  const segment = segments[i];
2504
- onProgress?.({ phase: "pass1", segment: i, totalSegments: n, status: "running", totalSteps });
2505
- let pass1 = null;
2506
- const pass1Attempt = await withRetry(
2507
- () => rateLimiter.execute(() => runTranscript({ client, fileUri, mimeType, segment, model, resolution, lang }), { onWait }),
2508
- `segment ${i} pass1`
2821
+ onProgress?.({ phase: "pass1a", segment: i, totalSegments: n, status: "running", totalSteps });
2822
+ const pass1aAttempt = await withRetry(
2823
+ () => rateLimiter.execute(() => runTranscription({ client, fileUri, mimeType, segment, model, resolution, lang }), { onWait }),
2824
+ `segment ${i} pass1a`
2509
2825
  );
2510
- if (pass1Attempt.error !== null) {
2511
- log4.warn(pass1Attempt.error);
2512
- errors.push(pass1Attempt.error);
2513
- } else {
2514
- pass1 = pass1Attempt.result;
2515
- pass1RanOnce = true;
2826
+ let pass1aResult = pass1aAttempt.error !== null ? null : pass1aAttempt.result;
2827
+ if (pass1aAttempt.error !== null) {
2828
+ log4.warn(pass1aAttempt.error);
2829
+ errors.push(pass1aAttempt.error);
2516
2830
  }
2517
2831
  currentStep++;
2518
- onProgress?.({ phase: "pass1", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
2832
+ onProgress?.({ phase: "pass1a", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
2833
+ let pass1 = null;
2834
+ if (pass1aResult != null) {
2835
+ onProgress?.({ phase: "pass1b", segment: i, totalSegments: n, status: "running", totalSteps });
2836
+ const p1a = pass1aResult;
2837
+ const pass1bAttempt = await withRetry(
2838
+ () => rateLimiter.execute(() => runDiarization({ client, fileUri, mimeType, segment, model, resolution, lang, pass1aResult: p1a }), { onWait }),
2839
+ `segment ${i} pass1b`
2840
+ );
2841
+ if (pass1bAttempt.error !== null) {
2842
+ log4.warn(pass1bAttempt.error);
2843
+ errors.push(pass1bAttempt.error);
2844
+ pass1 = mergeTranscriptResults(pass1aResult, { speaker_assignments: [], speaker_summary: [] });
2845
+ } else if (pass1bAttempt.result != null) {
2846
+ pass1 = mergeTranscriptResults(pass1aResult, pass1bAttempt.result);
2847
+ }
2848
+ currentStep++;
2849
+ onProgress?.({ phase: "pass1b", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
2850
+ pass1RanOnce = true;
2851
+ } else {
2852
+ currentStep++;
2853
+ onProgress?.({ phase: "pass1b", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
2854
+ }
2519
2855
  onProgress?.({ phase: "pass2", segment: i, totalSegments: n, status: "running", totalSteps });
2520
2856
  let pass2 = null;
2521
2857
  const pass2Attempt = await withRetry(
@@ -2632,6 +2968,31 @@ async function runPipeline(config) {
2632
2968
  }
2633
2969
  const pass1Results = results.map((r) => r.pass1);
2634
2970
  const pass2Results = results.map((r) => r.pass2);
2971
+ let canonicalSpeakers = [];
2972
+ try {
2973
+ const reconciliationResult = reconcileSpeakers({ pass1Results });
2974
+ canonicalSpeakers = reconciliationResult.canonicalSpeakers;
2975
+ const { mapping } = reconciliationResult;
2976
+ for (let segIdx = 0; segIdx < pass1Results.length; segIdx++) {
2977
+ const r = pass1Results[segIdx];
2978
+ if (r == null) continue;
2979
+ for (const entry of r.transcript_entries ?? []) {
2980
+ if (entry.speaker) {
2981
+ const canonical = mapping[`${segIdx}:${entry.speaker}`];
2982
+ if (canonical !== void 0) entry.speaker = canonical;
2983
+ }
2984
+ }
2985
+ for (const entry of r.speaker_summary ?? []) {
2986
+ if (entry.speaker_id) {
2987
+ const canonical = mapping[`${segIdx}:${entry.speaker_id}`];
2988
+ if (canonical !== void 0) entry.speaker_id = canonical;
2989
+ }
2990
+ }
2991
+ }
2992
+ } catch (e) {
2993
+ const msg = e instanceof Error ? e.message : String(e);
2994
+ log4.warn(`speaker reconciliation failed, continuing with original labels: ${msg}`);
2995
+ }
2635
2996
  let peopleExtraction = null;
2636
2997
  if (strategy.passes.includes("people")) {
2637
2998
  onProgress?.({ phase: "pass3b", segment: 0, totalSegments: 1, status: "running", totalSteps });
@@ -2643,7 +3004,8 @@ async function runPipeline(config) {
2643
3004
  mimeType,
2644
3005
  model: MODELS.flash,
2645
3006
  pass1Results,
2646
- lang
3007
+ lang,
3008
+ canonicalSpeakers: canonicalSpeakers.map((s) => s.label)
2647
3009
  }),
2648
3010
  { onWait }
2649
3011
  ),
@@ -2751,7 +3113,7 @@ async function runPipeline(config) {
2751
3113
  }
2752
3114
 
2753
3115
  // src/output/generator.ts
2754
- import { mkdir, writeFile } from "fs/promises";
3116
+ import { mkdir, readFile as readFile2, writeFile } from "fs/promises";
2755
3117
  import { join as join3, dirname } from "path";
2756
3118
 
2757
3119
  // src/output/guide.ts
@@ -2769,11 +3131,11 @@ function renderFilesTable(filesGenerated) {
2769
3131
  |------|
2770
3132
  ${rows}`;
2771
3133
  }
2772
- function renderSuggestions(synthesisResult) {
3134
+ function renderSuggestions(synthesisResult, speakerMapping) {
2773
3135
  if (synthesisResult == null || synthesisResult.suggestions.length === 0) {
2774
3136
  return "_No suggestions._";
2775
3137
  }
2776
- return synthesisResult.suggestions.map((s) => `- ${s}`).join("\n");
3138
+ return synthesisResult.suggestions.map((s) => `- ${replaceNamesInText(s, speakerMapping)}`).join("\n");
2777
3139
  }
2778
3140
  function renderVideoType(profile) {
2779
3141
  if (profile == null) return "unknown";
@@ -2820,7 +3182,8 @@ function renderIncompletePasses(pipelineResult) {
2820
3182
  function writeGuide(params) {
2821
3183
  const { title, source, duration, pipelineResult, filesGenerated, speakerMapping } = params;
2822
3184
  const { synthesisResult, videoProfile } = pipelineResult;
2823
- const overview = synthesisResult?.overview ?? "_No summary available \u2014 synthesis pass did not run or produced no output._";
3185
+ const rawOverview = synthesisResult?.overview ?? "_No summary available \u2014 synthesis pass did not run or produced no output._";
3186
+ const overview = replaceNamesInText(rawOverview, speakerMapping);
2824
3187
  const videoType = renderVideoType(videoProfile);
2825
3188
  const sections = [
2826
3189
  `# ${title}`,
@@ -2841,7 +3204,7 @@ function writeGuide(params) {
2841
3204
  "",
2842
3205
  "## Suggestions",
2843
3206
  "",
2844
- renderSuggestions(synthesisResult),
3207
+ renderSuggestions(synthesisResult, speakerMapping),
2845
3208
  "",
2846
3209
  "## Processing Details",
2847
3210
  "",
@@ -3066,58 +3429,58 @@ ${content}`;
3066
3429
  }
3067
3430
 
3068
3431
  // src/output/notes.ts
3069
- function renderDecisions(decisions) {
3432
+ function renderDecisions(decisions, speakerMapping) {
3070
3433
  if (decisions.length === 0) return [];
3071
3434
  const lines = ["## Key Decisions", ""];
3072
3435
  for (const d of decisions) {
3073
- lines.push(`### [${d.timestamp}] ${d.decision}`);
3436
+ lines.push(`### [${d.timestamp}] ${replaceNamesInText(d.decision, speakerMapping)}`);
3074
3437
  lines.push("");
3075
3438
  if (d.context.length > 0) {
3076
- lines.push(d.context);
3439
+ lines.push(replaceNamesInText(d.context, speakerMapping));
3077
3440
  lines.push("");
3078
3441
  }
3079
3442
  }
3080
3443
  return lines;
3081
3444
  }
3082
- function renderConcepts(concepts) {
3445
+ function renderConcepts(concepts, speakerMapping) {
3083
3446
  if (concepts.length === 0) return [];
3084
3447
  const lines = ["## Key Concepts", ""];
3085
3448
  for (const c of concepts) {
3086
- lines.push(`### [${c.timestamp}] ${c.concept}`);
3449
+ lines.push(`### [${c.timestamp}] ${replaceNamesInText(c.concept, speakerMapping)}`);
3087
3450
  lines.push("");
3088
3451
  if (c.explanation.length > 0) {
3089
- lines.push(c.explanation);
3452
+ lines.push(replaceNamesInText(c.explanation, speakerMapping));
3090
3453
  lines.push("");
3091
3454
  }
3092
3455
  }
3093
3456
  return lines;
3094
3457
  }
3095
- function renderTopics(topics) {
3458
+ function renderTopics(topics, speakerMapping) {
3096
3459
  if (topics.length === 0) return [];
3097
3460
  const lines = ["## Topics", ""];
3098
3461
  for (const t of topics) {
3099
3462
  const tsLabel = t.timestamps.length > 0 ? ` _(${t.timestamps.join(", ")})_` : "";
3100
- lines.push(`### ${t.title}${tsLabel}`);
3463
+ lines.push(`### ${replaceNamesInText(t.title, speakerMapping)}${tsLabel}`);
3101
3464
  lines.push("");
3102
3465
  if (t.summary.length > 0) {
3103
- lines.push(t.summary);
3466
+ lines.push(replaceNamesInText(t.summary, speakerMapping));
3104
3467
  lines.push("");
3105
3468
  }
3106
3469
  if (t.key_points.length > 0) {
3107
3470
  for (const kp of t.key_points) {
3108
- lines.push(`- ${kp}`);
3471
+ lines.push(`- ${replaceNamesInText(kp, speakerMapping)}`);
3109
3472
  }
3110
3473
  lines.push("");
3111
3474
  }
3112
3475
  }
3113
3476
  return lines;
3114
3477
  }
3115
- function renderQuestions(questions) {
3478
+ function renderQuestions(questions, speakerMapping) {
3116
3479
  if (questions.length === 0) return [];
3117
3480
  const lines = ["## Questions Raised", ""];
3118
3481
  for (const q of questions) {
3119
3482
  const status = q.answered ? "(answered)" : "(open)";
3120
- lines.push(`- **[${q.timestamp}]** ${q.question} ${status}`);
3483
+ lines.push(`- **[${q.timestamp}]** ${replaceNamesInText(q.question, speakerMapping)} ${status}`);
3121
3484
  }
3122
3485
  lines.push("");
3123
3486
  return lines;
@@ -3128,7 +3491,7 @@ function renderActionItems(items, speakerMapping) {
3128
3491
  for (const a of items) {
3129
3492
  const mentionedBy = applySpeakerMapping(a.mentioned_by, speakerMapping);
3130
3493
  const by = mentionedBy.length > 0 ? ` \u2014 _${mentionedBy}_` : "";
3131
- lines.push(`- **[${a.timestamp}]** ${a.item}${by}`);
3494
+ lines.push(`- **[${a.timestamp}]** ${replaceNamesInText(a.item, speakerMapping)}${by}`);
3132
3495
  }
3133
3496
  lines.push("");
3134
3497
  return lines;
@@ -3142,20 +3505,20 @@ function writeNotes(params) {
3142
3505
  if (!hasMeaningfulContent(synthesisResult)) return null;
3143
3506
  const sections = ["# Notes", ""];
3144
3507
  if (synthesisResult.overview.length > 0) {
3145
- sections.push(synthesisResult.overview);
3508
+ sections.push(replaceNamesInText(synthesisResult.overview, speakerMapping));
3146
3509
  sections.push("");
3147
3510
  }
3148
- sections.push(...renderDecisions(synthesisResult.key_decisions));
3149
- sections.push(...renderConcepts(synthesisResult.key_concepts));
3150
- sections.push(...renderTopics(synthesisResult.topics));
3151
- sections.push(...renderQuestions(synthesisResult.questions_raised));
3511
+ sections.push(...renderDecisions(synthesisResult.key_decisions, speakerMapping));
3512
+ sections.push(...renderConcepts(synthesisResult.key_concepts, speakerMapping));
3513
+ sections.push(...renderTopics(synthesisResult.topics, speakerMapping));
3514
+ sections.push(...renderQuestions(synthesisResult.questions_raised, speakerMapping));
3152
3515
  sections.push(...renderActionItems(synthesisResult.action_items, speakerMapping));
3153
3516
  while (sections[sections.length - 1] === "") sections.pop();
3154
3517
  return sections.join("\n");
3155
3518
  }
3156
3519
 
3157
3520
  // src/output/people.ts
3158
- function renderParticipant(p, index) {
3521
+ function renderParticipant(p, index, speakerMapping) {
3159
3522
  const lines = [];
3160
3523
  lines.push(`## ${index + 1}. ${p.name}`);
3161
3524
  lines.push("");
@@ -3170,7 +3533,7 @@ function renderParticipant(p, index) {
3170
3533
  lines.push("**Contributions:**");
3171
3534
  lines.push("");
3172
3535
  for (const c of p.contributions) {
3173
- lines.push(`- ${c}`);
3536
+ lines.push(`- ${replaceNamesInText(c, speakerMapping)}`);
3174
3537
  }
3175
3538
  lines.push("");
3176
3539
  }
@@ -3285,13 +3648,13 @@ function writePeople(params) {
3285
3648
  for (let i = 0; i < participants.length; i++) {
3286
3649
  const p = participants[i];
3287
3650
  if (p != null) {
3288
- sections.push(...renderParticipant(p, i));
3651
+ sections.push(...renderParticipant(p, i, speakerMapping));
3289
3652
  }
3290
3653
  }
3291
3654
  if (peopleExtraction.relationships.length > 0) {
3292
3655
  sections.push("## Relationships", "");
3293
3656
  for (const r of peopleExtraction.relationships) {
3294
- sections.push(`- ${r}`);
3657
+ sections.push(`- ${replaceNamesInText(r, speakerMapping)}`);
3295
3658
  }
3296
3659
  sections.push("");
3297
3660
  }
@@ -3447,7 +3810,7 @@ function renderSynthesisItems(items, speakerMapping) {
3447
3810
  for (const a of items) {
3448
3811
  const mentionedBy = applySpeakerMapping(a.mentioned_by, speakerMapping);
3449
3812
  const by = mentionedBy.length > 0 ? ` \u2014 _${mentionedBy}_` : "";
3450
- lines.push(`- [ ] **[${a.timestamp}]** ${a.item}${by}`);
3813
+ lines.push(`- [ ] **[${a.timestamp}]** ${replaceNamesInText(a.item, speakerMapping)}${by}`);
3451
3814
  }
3452
3815
  lines.push("");
3453
3816
  return lines;
@@ -3459,7 +3822,7 @@ function renderAssignedTasks(tasks, speakerMapping) {
3459
3822
  const assignee = applySpeakerMapping(t.assignee, speakerMapping);
3460
3823
  const assigneeStr = assignee.length > 0 ? ` \u2192 _${assignee}_` : "";
3461
3824
  const deadline = t.deadline.length > 0 ? ` (due: ${t.deadline})` : "";
3462
- lines.push(`- [ ] **[${t.timestamp}]** ${t.task}${assigneeStr}${deadline}`);
3825
+ lines.push(`- [ ] **[${t.timestamp}]** ${replaceNamesInText(t.task, speakerMapping)}${assigneeStr}${deadline}`);
3463
3826
  }
3464
3827
  lines.push("");
3465
3828
  return lines;
@@ -3513,19 +3876,19 @@ function collectImplicitDecisions(segments) {
3513
3876
  }
3514
3877
  return decisions;
3515
3878
  }
3516
- function renderEmotionalShifts(shifts) {
3879
+ function renderEmotionalShifts(shifts, speakerMapping) {
3517
3880
  if (shifts.length === 0) return [];
3518
3881
  const lines = ["## Emotional Shifts", ""];
3519
3882
  for (const s of shifts) {
3520
3883
  lines.push(`- **[${s.timestamp}]** ${s.from_state} \u2192 ${s.to_state}`);
3521
3884
  if (s.trigger.length > 0) {
3522
- lines.push(` _Trigger: ${s.trigger}_`);
3885
+ lines.push(` _Trigger: ${replaceNamesInText(s.trigger, speakerMapping)}_`);
3523
3886
  }
3524
3887
  }
3525
3888
  lines.push("");
3526
3889
  return lines;
3527
3890
  }
3528
- function renderEmphasisPatterns(patterns) {
3891
+ function renderEmphasisPatterns(patterns, speakerMapping) {
3529
3892
  if (patterns.length === 0) return [];
3530
3893
  const sorted = [...patterns].sort((a, b) => b.times_mentioned - a.times_mentioned);
3531
3894
  const lines = ["## Emphasis Patterns", ""];
@@ -3534,32 +3897,32 @@ function renderEmphasisPatterns(patterns) {
3534
3897
  lines.push(`### ${p.concept} (\xD7${p.times_mentioned})${ts}`);
3535
3898
  lines.push("");
3536
3899
  if (p.significance.length > 0) {
3537
- lines.push(p.significance);
3900
+ lines.push(replaceNamesInText(p.significance, speakerMapping));
3538
3901
  lines.push("");
3539
3902
  }
3540
3903
  }
3541
3904
  return lines;
3542
3905
  }
3543
- function renderImplicitQuestions(questions) {
3906
+ function renderImplicitQuestions(questions, speakerMapping) {
3544
3907
  if (questions.length === 0) return [];
3545
3908
  const lines = ["## Implicit Questions", ""];
3546
3909
  for (const q of questions) {
3547
- lines.push(`- ${q}`);
3910
+ lines.push(`- ${replaceNamesInText(q, speakerMapping)}`);
3548
3911
  }
3549
3912
  lines.push("");
3550
3913
  return lines;
3551
3914
  }
3552
- function renderImplicitDecisions(decisions) {
3915
+ function renderImplicitDecisions(decisions, speakerMapping) {
3553
3916
  if (decisions.length === 0) return [];
3554
3917
  const lines = ["## Implicit Decisions", ""];
3555
3918
  for (const d of decisions) {
3556
- lines.push(`- ${d}`);
3919
+ lines.push(`- ${replaceNamesInText(d, speakerMapping)}`);
3557
3920
  }
3558
3921
  lines.push("");
3559
3922
  return lines;
3560
3923
  }
3561
3924
  function writeInsights(params) {
3562
- const { segments } = params;
3925
+ const { segments, speakerMapping } = params;
3563
3926
  const hasPass3d = segments.some((s) => s.pass3d != null);
3564
3927
  if (!hasPass3d) return null;
3565
3928
  const emotionalShifts = collectEmotionalShifts(segments);
@@ -3570,10 +3933,10 @@ function writeInsights(params) {
3570
3933
  return null;
3571
3934
  }
3572
3935
  const sections = ["# Insights", ""];
3573
- sections.push(...renderEmotionalShifts(emotionalShifts));
3574
- sections.push(...renderEmphasisPatterns(emphasisPatterns));
3575
- sections.push(...renderImplicitQuestions(implicitQuestions));
3576
- sections.push(...renderImplicitDecisions(implicitDecisions));
3936
+ sections.push(...renderEmotionalShifts(emotionalShifts, speakerMapping));
3937
+ sections.push(...renderEmphasisPatterns(emphasisPatterns, speakerMapping));
3938
+ sections.push(...renderImplicitQuestions(implicitQuestions, speakerMapping));
3939
+ sections.push(...renderImplicitDecisions(implicitDecisions, speakerMapping));
3577
3940
  while (sections[sections.length - 1] === "") sections.pop();
3578
3941
  return sections.join("\n");
3579
3942
  }
@@ -4330,6 +4693,11 @@ async function reRenderWithSpeakerMapping(params) {
4330
4693
  };
4331
4694
  async function writeOutputFile(filename, content) {
4332
4695
  const fullPath = join3(outputDir, filename);
4696
+ try {
4697
+ const existing = await readFile2(fullPath, "utf8");
4698
+ if (existing === content) return;
4699
+ } catch {
4700
+ }
4333
4701
  const dir = dirname(fullPath);
4334
4702
  if (dir !== outputDir) {
4335
4703
  await mkdir(dir, { recursive: true });
@@ -4584,10 +4952,14 @@ async function runDistill(args) {
4584
4952
  const result = await handleYouTube(resolved.value, client);
4585
4953
  fileUri = result.fileUri;
4586
4954
  mimeType = result.mimeType;
4587
- duration = await detectDuration({
4588
- ytDlpDuration: result.duration,
4589
- geminiDuration: result.duration
4590
- });
4955
+ try {
4956
+ duration = await detectDuration({
4957
+ ytDlpDuration: result.duration,
4958
+ geminiDuration: result.duration
4959
+ });
4960
+ } catch {
4961
+ duration = 600;
4962
+ }
4591
4963
  if (result.uploadedFileName != null) {
4592
4964
  uploadedFileNames = [result.uploadedFileName];
4593
4965
  }
@@ -5095,7 +5467,7 @@ async function run2(args) {
5095
5467
  }
5096
5468
 
5097
5469
  // src/cli/index.ts
5098
- var version = "0.4.4";
5470
+ var version = "0.4.5";
5099
5471
  var DEFAULT_OUTPUT = "./vidistill-output/";
5100
5472
  var SUBCOMMANDS = {
5101
5473
  mcp: run,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vidistill",
3
- "version": "0.4.4",
3
+ "version": "0.4.5",
4
4
  "description": "Video intelligence distiller — extract structured notes, transcripts, and insights from any video using Gemini",
5
5
  "type": "module",
6
6
  "license": "MIT",