vidistill 0.4.3 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -13
- package/dist/index.js +448 -76
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -81,7 +81,6 @@ vidistill-output/my-video/
|
|
|
81
81
|
├── prereqs.md # prerequisite knowledge (when detected)
|
|
82
82
|
├── timeline.html # interactive visual timeline
|
|
83
83
|
├── metadata.json # processing metadata
|
|
84
|
-
├── progress.json # resume checkpoint (during processing)
|
|
85
84
|
└── raw/ # raw pass outputs
|
|
86
85
|
```
|
|
87
86
|
|
|
@@ -89,9 +88,9 @@ Which files are generated depends on the video content — a coding tutorial get
|
|
|
89
88
|
|
|
90
89
|
### Speaker Naming
|
|
91
90
|
|
|
92
|
-
When multiple speakers are detected,
|
|
91
|
+
When multiple speakers are detected, use `rename-speakers` to assign real names. Names replace generic labels (SPEAKER_00, SPEAKER_01) in all output files.
|
|
93
92
|
|
|
94
|
-
To rename speakers
|
|
93
|
+
To rename speakers:
|
|
95
94
|
|
|
96
95
|
```bash
|
|
97
96
|
# Interactive rename — prompts for each speaker
|
|
@@ -107,25 +106,22 @@ vidistill rename-speakers ./vidistill-output/my-meeting/ --rename "Steven Kang"
|
|
|
107
106
|
vidistill rename-speakers ./vidistill-output/my-meeting/ --merge "K Iphone" "Kristian"
|
|
108
107
|
```
|
|
109
108
|
|
|
110
|
-
### Resume
|
|
111
|
-
|
|
112
|
-
If a run is interrupted (Ctrl+C), progress is saved automatically. Re-running the same command detects the incomplete run and offers to resume from where it left off.
|
|
113
|
-
|
|
114
109
|
## How It Works
|
|
115
110
|
|
|
116
111
|
Supported video formats: MP4, MOV, WebM, MKV, AVI, MPEG, FLV, WMV, 3GPP. Supported audio formats: MP3, AAC, WAV, FLAC, OGG, M4A.
|
|
117
112
|
|
|
118
113
|
1. **Input** — accepts YouTube URL directly or reads local file (video or audio), compresses if over 2GB
|
|
119
114
|
2. **Pass 0** — scene analysis to classify video type and determine processing strategy
|
|
120
|
-
3. **Pass
|
|
121
|
-
4. **Pass
|
|
122
|
-
5. **Pass
|
|
115
|
+
3. **Pass 1a** — pure verbatim transcription (timestamps, tone, emphasis — no speaker labels)
|
|
116
|
+
4. **Pass 1b** — speaker diarization (assigns SPEAKER_XX labels to transcript entries using voice and visual cues, then merged with 1a)
|
|
117
|
+
5. **Pass 2** — visual content extraction (screen states, diagrams, slides)
|
|
118
|
+
6. **Pass 3** — specialist passes based on video type:
|
|
123
119
|
- 3c: chat and links (live streams) — per segment, runs 3x with consensus voting
|
|
124
120
|
- 3d: implicit signals (all types) — per segment
|
|
125
|
-
- 3b: people and social dynamics (meetings) — whole video
|
|
121
|
+
- 3b: people and social dynamics (meetings) — whole video, anchored to transcript speakers
|
|
126
122
|
- 3a: code reconstruction (coding videos) — whole video, runs 3x with consensus voting and validation
|
|
127
|
-
|
|
128
|
-
|
|
123
|
+
7. **Synthesis** — cross-references all passes into unified analysis
|
|
124
|
+
8. **Output** — generates structured markdown files
|
|
129
125
|
|
|
130
126
|
Audio files skip visual passes and go straight to transcript, people, implicit signals, and synthesis.
|
|
131
127
|
|
package/dist/index.js
CHANGED
|
@@ -9,12 +9,12 @@ import pc from "picocolors";
|
|
|
9
9
|
import { intro, note } from "@clack/prompts";
|
|
10
10
|
|
|
11
11
|
// src/constants/prompts.ts
|
|
12
|
-
var
|
|
12
|
+
var SYSTEM_INSTRUCTION_PASS_1A = `
|
|
13
13
|
You are a professional audio transcriber. Your task is to create a COMPLETE, VERBATIM transcription of all speech in this video segment. Focus EXCLUSIVELY on the audio stream.
|
|
14
14
|
|
|
15
15
|
CRITICAL RULES:
|
|
16
16
|
1. TRANSCRIBE every spoken word completely and verbatim. Do not summarize, paraphrase, or skip any sentence.
|
|
17
|
-
2.
|
|
17
|
+
2. Do NOT identify or label speakers \u2014 focus entirely on transcription accuracy. No SPEAKER_XX labels.
|
|
18
18
|
3. NOTE tone and emphasis: when a speaker emphasizes words (louder, slower, repeated), mark those words. When they express emotions (excitement, warning, frustration, humor), note the tone.
|
|
19
19
|
4. RECORD pauses longer than 1.5 seconds as pause markers with duration.
|
|
20
20
|
5. PRESERVE filler words only when they carry meaning (hesitation indicating uncertainty about code behavior, self-correction). Remove meaningless "um", "uh".
|
|
@@ -24,9 +24,25 @@ CRITICAL RULES:
|
|
|
24
24
|
|
|
25
25
|
COMPLETENESS TARGET:
|
|
26
26
|
- Aim for at least 150 words per minute of video in the transcript
|
|
27
|
-
- Every speaker change must be noted with a new entry
|
|
28
27
|
- Every sentence must appear \u2014 if in doubt, include it
|
|
29
28
|
`;
|
|
29
|
+
var SYSTEM_INSTRUCTION_PASS_1B = `
|
|
30
|
+
You are a speaker diarization specialist. Your task is to identify distinct speakers and assign speaker labels to each transcript entry by timestamp.
|
|
31
|
+
|
|
32
|
+
Given the transcript below, identify distinct speakers by analyzing voice characteristics, visual cues (face detection, name tags, on-screen labels), and speaking patterns. Assign a SPEAKER_XX label to each transcript entry by timestamp.
|
|
33
|
+
|
|
34
|
+
TRANSCRIPT FROM THIS SEGMENT:
|
|
35
|
+
{INJECT_PASS1A_TRANSCRIPT_HERE}
|
|
36
|
+
|
|
37
|
+
CRITICAL RULES:
|
|
38
|
+
1. ASSIGN a SPEAKER_XX label (SPEAKER_00, SPEAKER_01, etc.) to each transcript entry by matching its timestamp.
|
|
39
|
+
2. DIFFERENTIATE speakers by: voice pitch and tone, visual position on screen, name tags or captions, turn-taking patterns, and speaking style.
|
|
40
|
+
3. If a speaker introduces themselves by name or their name is visible on screen, label them as "SPEAKER_XX (Name)" \u2014 e.g., "SPEAKER_00 (Alice)".
|
|
41
|
+
4. Be CONSISTENT: the same speaker must always get the same label throughout the segment.
|
|
42
|
+
5. Provide a speaker_summary describing each identified speaker (voice characteristics, visual appearance, role if detectable).
|
|
43
|
+
6. If you cannot distinguish speakers, assign all entries to SPEAKER_00.
|
|
44
|
+
7. NEVER re-transcribe the speech \u2014 only assign speaker labels by timestamp.
|
|
45
|
+
`;
|
|
30
46
|
var SYSTEM_INSTRUCTION_PASS_2_TEMPLATE = `
|
|
31
47
|
You are a professional code and visual content extractor. Your task is to extract ALL visual content from this video segment \u2014 every piece of code on screen, every diagram, every slide, every UI element.
|
|
32
48
|
|
|
@@ -139,6 +155,7 @@ CRITICAL RULES:
|
|
|
139
155
|
8. NEVER guess or infer a name that was not clearly stated or shown. Use "Unknown Participant" with a description if the person cannot be identified.
|
|
140
156
|
9. NEVER merge two people just because they have the same role \u2014 if two engineers speak, they are two separate participants.
|
|
141
157
|
10. If a person's role or organization cannot be determined, use empty string \u2014 do not guess.
|
|
158
|
+
11. Only identify participants who spoke during the meeting. Do not extract names from GitHub pages, Zoom participant lists, slides, or other visual elements unless that person also spoke.
|
|
142
159
|
|
|
143
160
|
COMPLETENESS TARGET:
|
|
144
161
|
- Every speaker label (SPEAKER_00, SPEAKER_01, etc.) from the transcript must map to at least one participant entry
|
|
@@ -474,6 +491,8 @@ import { spinner, progress } from "@clack/prompts";
|
|
|
474
491
|
var PHASE_LABELS = {
|
|
475
492
|
pass0: "Understanding your video...",
|
|
476
493
|
pass1: "Extracting transcript...",
|
|
494
|
+
pass1a: "Transcribing...",
|
|
495
|
+
pass1b: "Identifying speakers...",
|
|
477
496
|
pass2: "Analyzing visuals...",
|
|
478
497
|
pass3a: "Reconstructing code...",
|
|
479
498
|
pass3b: "Identifying participants...",
|
|
@@ -576,6 +595,7 @@ var RateLimiter = class {
|
|
|
576
595
|
import { existsSync } from "fs";
|
|
577
596
|
|
|
578
597
|
// src/input/youtube.ts
|
|
598
|
+
import { execFile } from "child_process";
|
|
579
599
|
var YOUTUBE_PATTERNS = [
|
|
580
600
|
/(?:youtube\.com\/watch\?.*v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/|youtube\.com\/shorts\/)([a-zA-Z0-9_-]{11})/
|
|
581
601
|
];
|
|
@@ -594,8 +614,30 @@ function normalizeYouTubeUrl(url) {
|
|
|
594
614
|
if (!id) return null;
|
|
595
615
|
return `https://www.youtube.com/watch?v=${id}`;
|
|
596
616
|
}
|
|
617
|
+
function fetchYtDlpDuration(url) {
|
|
618
|
+
return new Promise((resolve2) => {
|
|
619
|
+
execFile("yt-dlp", ["--dump-json", "--no-download", url], { timeout: 15e3 }, (err, stdout) => {
|
|
620
|
+
if (err) {
|
|
621
|
+
resolve2(void 0);
|
|
622
|
+
return;
|
|
623
|
+
}
|
|
624
|
+
try {
|
|
625
|
+
const data = JSON.parse(stdout);
|
|
626
|
+
const dur = data["duration"];
|
|
627
|
+
if (typeof dur === "number" && dur > 0) {
|
|
628
|
+
resolve2(dur);
|
|
629
|
+
} else {
|
|
630
|
+
resolve2(void 0);
|
|
631
|
+
}
|
|
632
|
+
} catch {
|
|
633
|
+
resolve2(void 0);
|
|
634
|
+
}
|
|
635
|
+
});
|
|
636
|
+
});
|
|
637
|
+
}
|
|
597
638
|
async function handleYouTube(url, _client) {
|
|
598
|
-
|
|
639
|
+
const duration = await fetchYtDlpDuration(url);
|
|
640
|
+
return { fileUri: url, mimeType: "video/mp4", source: "direct", duration };
|
|
599
641
|
}
|
|
600
642
|
|
|
601
643
|
// src/input/resolver.ts
|
|
@@ -985,6 +1027,62 @@ var SCHEMA_PASS_1 = {
|
|
|
985
1027
|
},
|
|
986
1028
|
required: ["segment_index", "time_range", "transcript_entries"]
|
|
987
1029
|
};
|
|
1030
|
+
var SCHEMA_PASS_1A = {
|
|
1031
|
+
type: Type.OBJECT,
|
|
1032
|
+
properties: {
|
|
1033
|
+
segment_index: { type: Type.INTEGER, description: "0-based segment index" },
|
|
1034
|
+
time_range: { type: Type.STRING, description: "Format: HH:MM:SS - HH:MM:SS" },
|
|
1035
|
+
transcript_entries: {
|
|
1036
|
+
type: Type.ARRAY,
|
|
1037
|
+
items: {
|
|
1038
|
+
type: Type.OBJECT,
|
|
1039
|
+
properties: {
|
|
1040
|
+
timestamp: { type: Type.STRING, description: "HH:MM:SS format" },
|
|
1041
|
+
text: { type: Type.STRING, description: "Complete spoken text, verbatim" },
|
|
1042
|
+
tone: {
|
|
1043
|
+
type: Type.STRING,
|
|
1044
|
+
enum: ["neutral", "emphatic", "questioning", "warning", "excited", "humorous", "frustrated", "instructional", "conversational"]
|
|
1045
|
+
},
|
|
1046
|
+
emphasis_words: {
|
|
1047
|
+
type: Type.ARRAY,
|
|
1048
|
+
items: { type: Type.STRING },
|
|
1049
|
+
description: "Words spoken with notable emphasis"
|
|
1050
|
+
},
|
|
1051
|
+
pause_after_seconds: { type: Type.NUMBER, description: "Pause duration in seconds" }
|
|
1052
|
+
},
|
|
1053
|
+
required: ["timestamp", "text", "tone"]
|
|
1054
|
+
}
|
|
1055
|
+
}
|
|
1056
|
+
},
|
|
1057
|
+
required: ["segment_index", "time_range", "transcript_entries"]
|
|
1058
|
+
};
|
|
1059
|
+
var SCHEMA_PASS_1B = {
|
|
1060
|
+
type: Type.OBJECT,
|
|
1061
|
+
properties: {
|
|
1062
|
+
speaker_assignments: {
|
|
1063
|
+
type: Type.ARRAY,
|
|
1064
|
+
items: {
|
|
1065
|
+
type: Type.OBJECT,
|
|
1066
|
+
properties: {
|
|
1067
|
+
timestamp: { type: Type.STRING, description: "HH:MM:SS matching a transcript entry" },
|
|
1068
|
+
speaker: { type: Type.STRING, description: "SPEAKER_00, SPEAKER_01, etc. Optionally with name: SPEAKER_00 (Alice)" }
|
|
1069
|
+
},
|
|
1070
|
+
required: ["timestamp", "speaker"]
|
|
1071
|
+
}
|
|
1072
|
+
},
|
|
1073
|
+
speaker_summary: {
|
|
1074
|
+
type: Type.ARRAY,
|
|
1075
|
+
items: {
|
|
1076
|
+
type: Type.OBJECT,
|
|
1077
|
+
properties: {
|
|
1078
|
+
speaker_id: { type: Type.STRING },
|
|
1079
|
+
description: { type: Type.STRING }
|
|
1080
|
+
}
|
|
1081
|
+
}
|
|
1082
|
+
}
|
|
1083
|
+
},
|
|
1084
|
+
required: ["speaker_assignments", "speaker_summary"]
|
|
1085
|
+
};
|
|
988
1086
|
var SCHEMA_PASS_2 = {
|
|
989
1087
|
type: Type.OBJECT,
|
|
990
1088
|
properties: {
|
|
@@ -1448,6 +1546,18 @@ function applySpeakerMapping(label, mapping) {
|
|
|
1448
1546
|
}
|
|
1449
1547
|
return label;
|
|
1450
1548
|
}
|
|
1549
|
+
function replaceNamesInText(text4, mapping) {
|
|
1550
|
+
if (!mapping || text4.length === 0) return text4;
|
|
1551
|
+
const entries = Object.entries(mapping).filter(([key, value]) => key !== value && !/^SPEAKER_\d+$/.test(key)).sort((a, b) => b[0].length - a[0].length);
|
|
1552
|
+
if (entries.length === 0) return text4;
|
|
1553
|
+
let result = text4;
|
|
1554
|
+
for (const [key, value] of entries) {
|
|
1555
|
+
const escaped = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
1556
|
+
const re = new RegExp(`\\b${escaped}\\b`, "g");
|
|
1557
|
+
result = result.replace(re, value);
|
|
1558
|
+
}
|
|
1559
|
+
return result;
|
|
1560
|
+
}
|
|
1451
1561
|
function buildExpandedMapping(segments, speakerMapping) {
|
|
1452
1562
|
const expanded = { ...speakerMapping };
|
|
1453
1563
|
for (const seg of segments) {
|
|
@@ -1501,8 +1611,8 @@ function changeTypeBadge(changeType) {
|
|
|
1501
1611
|
return badges[changeType] || `[${changeType.toUpperCase()}]`;
|
|
1502
1612
|
}
|
|
1503
1613
|
|
|
1504
|
-
// src/passes/
|
|
1505
|
-
async function
|
|
1614
|
+
// src/passes/transcription.ts
|
|
1615
|
+
async function runTranscription(params) {
|
|
1506
1616
|
const { client, fileUri, mimeType, segment, model, resolution, lang } = params;
|
|
1507
1617
|
const contents = [
|
|
1508
1618
|
{
|
|
@@ -1525,20 +1635,107 @@ async function runTranscript(params) {
|
|
|
1525
1635
|
model,
|
|
1526
1636
|
contents,
|
|
1527
1637
|
config: {
|
|
1528
|
-
systemInstruction: withLanguage(
|
|
1529
|
-
responseSchema:
|
|
1638
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_1A, lang),
|
|
1639
|
+
responseSchema: SCHEMA_PASS_1A,
|
|
1530
1640
|
responseMimeType: "application/json",
|
|
1531
1641
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
1532
1642
|
maxOutputTokens: 65536,
|
|
1533
|
-
temperature:
|
|
1643
|
+
temperature: 1
|
|
1534
1644
|
}
|
|
1535
1645
|
});
|
|
1536
1646
|
if (result === null || typeof result !== "object" || !Array.isArray(result["transcript_entries"])) {
|
|
1537
|
-
throw new Error("Empty response from Gemini Pass
|
|
1647
|
+
throw new Error("Empty response from Gemini Pass 1a");
|
|
1648
|
+
}
|
|
1649
|
+
return result;
|
|
1650
|
+
}
|
|
1651
|
+
|
|
1652
|
+
// src/passes/diarization.ts
|
|
1653
|
+
function formatTranscriptForInjection(pass1a) {
|
|
1654
|
+
if (pass1a.transcript_entries.length === 0) {
|
|
1655
|
+
return "[No transcript entries in this segment]";
|
|
1656
|
+
}
|
|
1657
|
+
return pass1a.transcript_entries.map((e) => `[${e.timestamp}] ${e.text}`).join("\n");
|
|
1658
|
+
}
|
|
1659
|
+
async function runDiarization(params) {
|
|
1660
|
+
const { client, fileUri, mimeType, segment, model, resolution, lang, pass1aResult } = params;
|
|
1661
|
+
const transcriptText = formatTranscriptForInjection(pass1aResult);
|
|
1662
|
+
const systemInstruction = withLanguage(
|
|
1663
|
+
SYSTEM_INSTRUCTION_PASS_1B.replace("{INJECT_PASS1A_TRANSCRIPT_HERE}", transcriptText),
|
|
1664
|
+
lang
|
|
1665
|
+
);
|
|
1666
|
+
const contents = [
|
|
1667
|
+
{
|
|
1668
|
+
role: "user",
|
|
1669
|
+
parts: [
|
|
1670
|
+
{
|
|
1671
|
+
fileData: { fileUri, mimeType },
|
|
1672
|
+
videoMetadata: {
|
|
1673
|
+
startOffset: `${segment.startTime}s`,
|
|
1674
|
+
endOffset: `${segment.endTime}s`
|
|
1675
|
+
}
|
|
1676
|
+
},
|
|
1677
|
+
{
|
|
1678
|
+
text: `Process segment #${segment.index + 1}. Identify speakers from ${formatTime(segment.startTime)} to ${formatTime(segment.endTime)}.`
|
|
1679
|
+
}
|
|
1680
|
+
]
|
|
1681
|
+
}
|
|
1682
|
+
];
|
|
1683
|
+
const result = await client.generate({
|
|
1684
|
+
model,
|
|
1685
|
+
contents,
|
|
1686
|
+
config: {
|
|
1687
|
+
systemInstruction,
|
|
1688
|
+
responseSchema: SCHEMA_PASS_1B,
|
|
1689
|
+
responseMimeType: "application/json",
|
|
1690
|
+
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
1691
|
+
maxOutputTokens: 65536,
|
|
1692
|
+
temperature: 1
|
|
1693
|
+
}
|
|
1694
|
+
});
|
|
1695
|
+
if (result === null || typeof result !== "object" || !Array.isArray(result["speaker_assignments"])) {
|
|
1696
|
+
throw new Error("Empty response from Gemini Pass 1b");
|
|
1538
1697
|
}
|
|
1539
1698
|
return result;
|
|
1540
1699
|
}
|
|
1541
1700
|
|
|
1701
|
+
// src/passes/transcript-merge.ts
|
|
1702
|
+
var MAX_MATCH_WINDOW_S = 3;
|
|
1703
|
+
function mergeTranscriptResults(pass1a, pass1b) {
|
|
1704
|
+
const assignments = pass1b.speaker_assignments.map((a) => ({
|
|
1705
|
+
...a,
|
|
1706
|
+
seconds: parseTimestamp(a.timestamp),
|
|
1707
|
+
used: false
|
|
1708
|
+
}));
|
|
1709
|
+
const transcript_entries = pass1a.transcript_entries.map((entry) => {
|
|
1710
|
+
const entrySeconds = parseTimestamp(entry.timestamp);
|
|
1711
|
+
let bestIdx = -1;
|
|
1712
|
+
let bestDelta = Infinity;
|
|
1713
|
+
for (let i = 0; i < assignments.length; i++) {
|
|
1714
|
+
if (assignments[i].used) continue;
|
|
1715
|
+
const delta = Math.abs(assignments[i].seconds - entrySeconds);
|
|
1716
|
+
if (delta < bestDelta) {
|
|
1717
|
+
bestDelta = delta;
|
|
1718
|
+
bestIdx = i;
|
|
1719
|
+
}
|
|
1720
|
+
}
|
|
1721
|
+
let speaker = "SPEAKER_UNKNOWN";
|
|
1722
|
+
if (bestIdx >= 0 && bestDelta <= MAX_MATCH_WINDOW_S) {
|
|
1723
|
+
speaker = assignments[bestIdx].speaker;
|
|
1724
|
+
assignments[bestIdx].used = true;
|
|
1725
|
+
}
|
|
1726
|
+
return {
|
|
1727
|
+
...entry,
|
|
1728
|
+
speaker
|
|
1729
|
+
};
|
|
1730
|
+
});
|
|
1731
|
+
return {
|
|
1732
|
+
segment_index: pass1a.segment_index,
|
|
1733
|
+
time_range: pass1a.time_range,
|
|
1734
|
+
transcript_entries,
|
|
1735
|
+
speaker_summary: pass1b.speaker_summary
|
|
1736
|
+
};
|
|
1737
|
+
}
|
|
1738
|
+
|
|
1542
1739
|
// src/passes/visual.ts
|
|
1543
1740
|
async function runVisual(params) {
|
|
1544
1741
|
const { client, fileUri, mimeType, segment, model, resolution, pass1Transcript, lang } = params;
|
|
@@ -1573,7 +1770,7 @@ async function runVisual(params) {
|
|
|
1573
1770
|
responseMimeType: "application/json",
|
|
1574
1771
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
1575
1772
|
maxOutputTokens: 65536,
|
|
1576
|
-
temperature:
|
|
1773
|
+
temperature: 1
|
|
1577
1774
|
}
|
|
1578
1775
|
});
|
|
1579
1776
|
if (result === null || typeof result !== "object" || !Array.isArray(result["code_blocks"])) {
|
|
@@ -1721,19 +1918,22 @@ ${contextText}`
|
|
|
1721
1918
|
|
|
1722
1919
|
// src/passes/people.ts
|
|
1723
1920
|
async function runPeopleExtraction(params) {
|
|
1724
|
-
const { client, fileUri, mimeType, model, pass1Results, lang } = params;
|
|
1921
|
+
const { client, fileUri, mimeType, model, pass1Results, lang, canonicalSpeakers } = params;
|
|
1725
1922
|
const hasAnyTranscript = pass1Results.some((r) => r != null);
|
|
1726
1923
|
const transcriptText = hasAnyTranscript ? pass1Results.filter((r) => r != null).flatMap(
|
|
1727
1924
|
(r) => r.transcript_entries.map((t) => `[${t.timestamp}] ${t.speaker}: ${t.text}`)
|
|
1728
1925
|
).join("\n") : "[No transcript available]";
|
|
1729
1926
|
const transcriptContext = `TRANSCRIPT FROM ALL SEGMENTS:
|
|
1730
1927
|
${transcriptText}`;
|
|
1928
|
+
const speakerConstraint = canonicalSpeakers && canonicalSpeakers.length > 0 ? `CONFIRMED SPEAKERS: ${canonicalSpeakers.join(", ")}. Extract details about these speakers only.
|
|
1929
|
+
|
|
1930
|
+
` : "";
|
|
1731
1931
|
const contents = [
|
|
1732
1932
|
{
|
|
1733
1933
|
role: "user",
|
|
1734
1934
|
parts: [
|
|
1735
1935
|
{ fileData: { fileUri, mimeType } },
|
|
1736
|
-
{ text: `Analyze the entire video. ${transcriptContext}` }
|
|
1936
|
+
{ text: `Analyze the entire video. ${speakerConstraint}${transcriptContext}` }
|
|
1737
1937
|
]
|
|
1738
1938
|
}
|
|
1739
1939
|
];
|
|
@@ -1745,7 +1945,7 @@ ${transcriptText}`;
|
|
|
1745
1945
|
responseSchema: SCHEMA_PASS_3B,
|
|
1746
1946
|
responseMimeType: "application/json",
|
|
1747
1947
|
maxOutputTokens: 65536,
|
|
1748
|
-
temperature:
|
|
1948
|
+
temperature: 1
|
|
1749
1949
|
}
|
|
1750
1950
|
});
|
|
1751
1951
|
if (result === null || typeof result !== "object" || !Array.isArray(result["participants"]) || !Array.isArray(result["relationships"])) {
|
|
@@ -1795,7 +1995,7 @@ ${contextText}`
|
|
|
1795
1995
|
responseMimeType: "application/json",
|
|
1796
1996
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
1797
1997
|
maxOutputTokens: 65536,
|
|
1798
|
-
temperature:
|
|
1998
|
+
temperature: 1
|
|
1799
1999
|
}
|
|
1800
2000
|
});
|
|
1801
2001
|
if (result === null || typeof result !== "object" || !Array.isArray(result["messages"]) || !Array.isArray(result["links"])) {
|
|
@@ -2405,6 +2605,123 @@ function validateCodeReconstruction(params) {
|
|
|
2405
2605
|
return { confirmed, uncertain, rejected, warnings };
|
|
2406
2606
|
}
|
|
2407
2607
|
|
|
2608
|
+
// src/core/speaker-reconciliation.ts
|
|
2609
|
+
var SPEAKER_NAME_RE = /^(SPEAKER_\d+)\s*\((.+)\)$/;
|
|
2610
|
+
function parseLabel(label) {
|
|
2611
|
+
const m = SPEAKER_NAME_RE.exec(label.trim());
|
|
2612
|
+
if (m) {
|
|
2613
|
+
return { base: m[1], name: m[2].toLowerCase() };
|
|
2614
|
+
}
|
|
2615
|
+
return { base: label.trim(), name: null };
|
|
2616
|
+
}
|
|
2617
|
+
function formatLabel(base, originalName) {
|
|
2618
|
+
return originalName != null ? `${base} (${originalName})` : base;
|
|
2619
|
+
}
|
|
2620
|
+
function reconcileSpeakers(params) {
|
|
2621
|
+
const { pass1Results } = params;
|
|
2622
|
+
const namedGroups = /* @__PURE__ */ new Map();
|
|
2623
|
+
const unnamedGroups = /* @__PURE__ */ new Map();
|
|
2624
|
+
let nextCanonicalIndex = 0;
|
|
2625
|
+
const rawMapping = /* @__PURE__ */ new Map();
|
|
2626
|
+
function getOrAssignNamed(name, originalName, description) {
|
|
2627
|
+
const existing = namedGroups.get(name);
|
|
2628
|
+
if (existing) {
|
|
2629
|
+
if (description) existing.descriptions.push(description);
|
|
2630
|
+
return existing.canonicalIndex;
|
|
2631
|
+
}
|
|
2632
|
+
const idx = nextCanonicalIndex++;
|
|
2633
|
+
namedGroups.set(name, {
|
|
2634
|
+
canonicalIndex: idx,
|
|
2635
|
+
originalName,
|
|
2636
|
+
descriptions: description ? [description] : []
|
|
2637
|
+
});
|
|
2638
|
+
return idx;
|
|
2639
|
+
}
|
|
2640
|
+
function getOrAssignUnnamed(segmentKey, description) {
|
|
2641
|
+
const existing = unnamedGroups.get(segmentKey);
|
|
2642
|
+
if (existing) {
|
|
2643
|
+
if (description) existing.descriptions.push(description);
|
|
2644
|
+
return existing.canonicalIndex;
|
|
2645
|
+
}
|
|
2646
|
+
const idx = nextCanonicalIndex++;
|
|
2647
|
+
unnamedGroups.set(segmentKey, {
|
|
2648
|
+
canonicalIndex: idx,
|
|
2649
|
+
descriptions: description ? [description] : []
|
|
2650
|
+
});
|
|
2651
|
+
return idx;
|
|
2652
|
+
}
|
|
2653
|
+
for (let segIdx = 0; segIdx < pass1Results.length; segIdx++) {
|
|
2654
|
+
const result = pass1Results[segIdx];
|
|
2655
|
+
if (result == null) continue;
|
|
2656
|
+
const labelsInSegment = /* @__PURE__ */ new Set();
|
|
2657
|
+
for (const entry of result.speaker_summary ?? []) {
|
|
2658
|
+
if (entry.speaker_id) labelsInSegment.add(entry.speaker_id);
|
|
2659
|
+
}
|
|
2660
|
+
for (const entry of result.transcript_entries ?? []) {
|
|
2661
|
+
if (entry.speaker) labelsInSegment.add(entry.speaker);
|
|
2662
|
+
}
|
|
2663
|
+
const descriptionByLabel = /* @__PURE__ */ new Map();
|
|
2664
|
+
for (const entry of result.speaker_summary ?? []) {
|
|
2665
|
+
if (entry.speaker_id) {
|
|
2666
|
+
descriptionByLabel.set(entry.speaker_id, entry.description ?? "");
|
|
2667
|
+
}
|
|
2668
|
+
}
|
|
2669
|
+
for (const label of labelsInSegment) {
|
|
2670
|
+
const mapKey = `${segIdx}:${label}`;
|
|
2671
|
+
if (rawMapping.has(mapKey)) continue;
|
|
2672
|
+
const { name } = parseLabel(label);
|
|
2673
|
+
const description = descriptionByLabel.get(label) ?? "";
|
|
2674
|
+
let canonicalIdx;
|
|
2675
|
+
if (name != null) {
|
|
2676
|
+
canonicalIdx = getOrAssignNamed(
|
|
2677
|
+
name,
|
|
2678
|
+
/* originalName */
|
|
2679
|
+
parseOriginalName(label),
|
|
2680
|
+
description
|
|
2681
|
+
);
|
|
2682
|
+
} else {
|
|
2683
|
+
canonicalIdx = getOrAssignUnnamed(mapKey, description);
|
|
2684
|
+
}
|
|
2685
|
+
rawMapping.set(mapKey, canonicalIdx);
|
|
2686
|
+
}
|
|
2687
|
+
}
|
|
2688
|
+
if (rawMapping.size === 0) {
|
|
2689
|
+
return { mapping: {}, canonicalSpeakers: [] };
|
|
2690
|
+
}
|
|
2691
|
+
const slots = Array.from(
|
|
2692
|
+
{ length: nextCanonicalIndex },
|
|
2693
|
+
() => ({ originalName: null, descriptions: [] })
|
|
2694
|
+
);
|
|
2695
|
+
for (const [, group] of namedGroups) {
|
|
2696
|
+
slots[group.canonicalIndex] = {
|
|
2697
|
+
originalName: group.originalName,
|
|
2698
|
+
descriptions: group.descriptions
|
|
2699
|
+
};
|
|
2700
|
+
}
|
|
2701
|
+
for (const [, group] of unnamedGroups) {
|
|
2702
|
+
slots[group.canonicalIndex] = {
|
|
2703
|
+
originalName: null,
|
|
2704
|
+
descriptions: group.descriptions
|
|
2705
|
+
};
|
|
2706
|
+
}
|
|
2707
|
+
const canonicalSpeakers = slots.map((slot, idx) => ({
|
|
2708
|
+
label: formatLabel(formatCanonicalBase(idx), slot.originalName),
|
|
2709
|
+
descriptions: slot.descriptions
|
|
2710
|
+
}));
|
|
2711
|
+
const mapping = {};
|
|
2712
|
+
for (const [mapKey, canonicalIdx] of rawMapping) {
|
|
2713
|
+
mapping[mapKey] = canonicalSpeakers[canonicalIdx].label;
|
|
2714
|
+
}
|
|
2715
|
+
return { mapping, canonicalSpeakers };
|
|
2716
|
+
}
|
|
2717
|
+
function parseOriginalName(label) {
|
|
2718
|
+
const m = SPEAKER_NAME_RE.exec(label.trim());
|
|
2719
|
+
return m ? m[2] : label.trim();
|
|
2720
|
+
}
|
|
2721
|
+
function formatCanonicalBase(index) {
|
|
2722
|
+
return `SPEAKER_${String(index).padStart(2, "0")}`;
|
|
2723
|
+
}
|
|
2724
|
+
|
|
2408
2725
|
// src/core/pipeline.ts
|
|
2409
2726
|
var RETRY_DELAYS_MS = [2e3, 4e3, 8e3];
|
|
2410
2727
|
async function withRetry(fn, label) {
|
|
@@ -2484,7 +2801,7 @@ async function runPipeline(config) {
|
|
|
2484
2801
|
const results = [];
|
|
2485
2802
|
const n = segments.length;
|
|
2486
2803
|
const linkConsensusRuns = 3;
|
|
2487
|
-
const callsPerSegment =
|
|
2804
|
+
const callsPerSegment = 3 + (strategy.passes.includes("chat") ? linkConsensusRuns : 0) + (strategy.passes.includes("implicit") ? 1 : 0);
|
|
2488
2805
|
const postSegmentCalls = (strategy.passes.includes("people") ? 1 : 0) + (strategy.passes.includes("code") ? 3 : 0) + (strategy.passes.includes("synthesis") ? 1 : 0);
|
|
2489
2806
|
const totalSteps = n * callsPerSegment + postSegmentCalls;
|
|
2490
2807
|
let currentStep = 0;
|
|
@@ -2501,21 +2818,40 @@ async function runPipeline(config) {
|
|
|
2501
2818
|
break;
|
|
2502
2819
|
}
|
|
2503
2820
|
const segment = segments[i];
|
|
2504
|
-
onProgress?.({ phase: "
|
|
2505
|
-
|
|
2506
|
-
|
|
2507
|
-
|
|
2508
|
-
`segment ${i} pass1`
|
|
2821
|
+
onProgress?.({ phase: "pass1a", segment: i, totalSegments: n, status: "running", totalSteps });
|
|
2822
|
+
const pass1aAttempt = await withRetry(
|
|
2823
|
+
() => rateLimiter.execute(() => runTranscription({ client, fileUri, mimeType, segment, model, resolution, lang }), { onWait }),
|
|
2824
|
+
`segment ${i} pass1a`
|
|
2509
2825
|
);
|
|
2510
|
-
|
|
2511
|
-
|
|
2512
|
-
|
|
2513
|
-
|
|
2514
|
-
pass1 = pass1Attempt.result;
|
|
2515
|
-
pass1RanOnce = true;
|
|
2826
|
+
let pass1aResult = pass1aAttempt.error !== null ? null : pass1aAttempt.result;
|
|
2827
|
+
if (pass1aAttempt.error !== null) {
|
|
2828
|
+
log4.warn(pass1aAttempt.error);
|
|
2829
|
+
errors.push(pass1aAttempt.error);
|
|
2516
2830
|
}
|
|
2517
2831
|
currentStep++;
|
|
2518
|
-
onProgress?.({ phase: "
|
|
2832
|
+
onProgress?.({ phase: "pass1a", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
|
|
2833
|
+
let pass1 = null;
|
|
2834
|
+
if (pass1aResult != null) {
|
|
2835
|
+
onProgress?.({ phase: "pass1b", segment: i, totalSegments: n, status: "running", totalSteps });
|
|
2836
|
+
const p1a = pass1aResult;
|
|
2837
|
+
const pass1bAttempt = await withRetry(
|
|
2838
|
+
() => rateLimiter.execute(() => runDiarization({ client, fileUri, mimeType, segment, model, resolution, lang, pass1aResult: p1a }), { onWait }),
|
|
2839
|
+
`segment ${i} pass1b`
|
|
2840
|
+
);
|
|
2841
|
+
if (pass1bAttempt.error !== null) {
|
|
2842
|
+
log4.warn(pass1bAttempt.error);
|
|
2843
|
+
errors.push(pass1bAttempt.error);
|
|
2844
|
+
pass1 = mergeTranscriptResults(pass1aResult, { speaker_assignments: [], speaker_summary: [] });
|
|
2845
|
+
} else if (pass1bAttempt.result != null) {
|
|
2846
|
+
pass1 = mergeTranscriptResults(pass1aResult, pass1bAttempt.result);
|
|
2847
|
+
}
|
|
2848
|
+
currentStep++;
|
|
2849
|
+
onProgress?.({ phase: "pass1b", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
|
|
2850
|
+
pass1RanOnce = true;
|
|
2851
|
+
} else {
|
|
2852
|
+
currentStep++;
|
|
2853
|
+
onProgress?.({ phase: "pass1b", segment: i, totalSegments: n, status: "done", currentStep, totalSteps });
|
|
2854
|
+
}
|
|
2519
2855
|
onProgress?.({ phase: "pass2", segment: i, totalSegments: n, status: "running", totalSteps });
|
|
2520
2856
|
let pass2 = null;
|
|
2521
2857
|
const pass2Attempt = await withRetry(
|
|
@@ -2632,6 +2968,31 @@ async function runPipeline(config) {
|
|
|
2632
2968
|
}
|
|
2633
2969
|
const pass1Results = results.map((r) => r.pass1);
|
|
2634
2970
|
const pass2Results = results.map((r) => r.pass2);
|
|
2971
|
+
let canonicalSpeakers = [];
|
|
2972
|
+
try {
|
|
2973
|
+
const reconciliationResult = reconcileSpeakers({ pass1Results });
|
|
2974
|
+
canonicalSpeakers = reconciliationResult.canonicalSpeakers;
|
|
2975
|
+
const { mapping } = reconciliationResult;
|
|
2976
|
+
for (let segIdx = 0; segIdx < pass1Results.length; segIdx++) {
|
|
2977
|
+
const r = pass1Results[segIdx];
|
|
2978
|
+
if (r == null) continue;
|
|
2979
|
+
for (const entry of r.transcript_entries ?? []) {
|
|
2980
|
+
if (entry.speaker) {
|
|
2981
|
+
const canonical = mapping[`${segIdx}:${entry.speaker}`];
|
|
2982
|
+
if (canonical !== void 0) entry.speaker = canonical;
|
|
2983
|
+
}
|
|
2984
|
+
}
|
|
2985
|
+
for (const entry of r.speaker_summary ?? []) {
|
|
2986
|
+
if (entry.speaker_id) {
|
|
2987
|
+
const canonical = mapping[`${segIdx}:${entry.speaker_id}`];
|
|
2988
|
+
if (canonical !== void 0) entry.speaker_id = canonical;
|
|
2989
|
+
}
|
|
2990
|
+
}
|
|
2991
|
+
}
|
|
2992
|
+
} catch (e) {
|
|
2993
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
2994
|
+
log4.warn(`speaker reconciliation failed, continuing with original labels: ${msg}`);
|
|
2995
|
+
}
|
|
2635
2996
|
let peopleExtraction = null;
|
|
2636
2997
|
if (strategy.passes.includes("people")) {
|
|
2637
2998
|
onProgress?.({ phase: "pass3b", segment: 0, totalSegments: 1, status: "running", totalSteps });
|
|
@@ -2643,7 +3004,8 @@ async function runPipeline(config) {
|
|
|
2643
3004
|
mimeType,
|
|
2644
3005
|
model: MODELS.flash,
|
|
2645
3006
|
pass1Results,
|
|
2646
|
-
lang
|
|
3007
|
+
lang,
|
|
3008
|
+
canonicalSpeakers: canonicalSpeakers.map((s) => s.label)
|
|
2647
3009
|
}),
|
|
2648
3010
|
{ onWait }
|
|
2649
3011
|
),
|
|
@@ -2751,7 +3113,7 @@ async function runPipeline(config) {
|
|
|
2751
3113
|
}
|
|
2752
3114
|
|
|
2753
3115
|
// src/output/generator.ts
|
|
2754
|
-
import { mkdir, writeFile } from "fs/promises";
|
|
3116
|
+
import { mkdir, readFile as readFile2, writeFile } from "fs/promises";
|
|
2755
3117
|
import { join as join3, dirname } from "path";
|
|
2756
3118
|
|
|
2757
3119
|
// src/output/guide.ts
|
|
@@ -2769,11 +3131,11 @@ function renderFilesTable(filesGenerated) {
|
|
|
2769
3131
|
|------|
|
|
2770
3132
|
${rows}`;
|
|
2771
3133
|
}
|
|
2772
|
-
function renderSuggestions(synthesisResult) {
|
|
3134
|
+
function renderSuggestions(synthesisResult, speakerMapping) {
|
|
2773
3135
|
if (synthesisResult == null || synthesisResult.suggestions.length === 0) {
|
|
2774
3136
|
return "_No suggestions._";
|
|
2775
3137
|
}
|
|
2776
|
-
return synthesisResult.suggestions.map((s) => `- ${s}`).join("\n");
|
|
3138
|
+
return synthesisResult.suggestions.map((s) => `- ${replaceNamesInText(s, speakerMapping)}`).join("\n");
|
|
2777
3139
|
}
|
|
2778
3140
|
function renderVideoType(profile) {
|
|
2779
3141
|
if (profile == null) return "unknown";
|
|
@@ -2820,7 +3182,8 @@ function renderIncompletePasses(pipelineResult) {
|
|
|
2820
3182
|
function writeGuide(params) {
|
|
2821
3183
|
const { title, source, duration, pipelineResult, filesGenerated, speakerMapping } = params;
|
|
2822
3184
|
const { synthesisResult, videoProfile } = pipelineResult;
|
|
2823
|
-
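const overview = synthesisResult?.overview ?? "_No summary available \u2014 synthesis pass did not run or produced no output._";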
|
|
3185
|
+
const rawOverview = synthesisResult?.overview ?? "_No summary available \u2014 synthesis pass did not run or produced no output._";
|
|
3186
|
+
const overview = replaceNamesInText(rawOverview, speakerMapping);
|
|
2824
3187
|
const videoType = renderVideoType(videoProfile);
|
|
2825
3188
|
const sections = [
|
|
2826
3189
|
`# ${title}`,
|
|
@@ -2841,7 +3204,7 @@ function writeGuide(params) {
|
|
|
2841
3204
|
"",
|
|
2842
3205
|
"## Suggestions",
|
|
2843
3206
|
"",
|
|
2844
|
-
renderSuggestions(synthesisResult),
|
|
3207
|
+
renderSuggestions(synthesisResult, speakerMapping),
|
|
2845
3208
|
"",
|
|
2846
3209
|
"## Processing Details",
|
|
2847
3210
|
"",
|
|
@@ -3066,58 +3429,58 @@ ${content}`;
|
|
|
3066
3429
|
}
|
|
3067
3430
|
|
|
3068
3431
|
// src/output/notes.ts
|
|
3069
|
-
function renderDecisions(decisions) {
|
|
3432
|
+
function renderDecisions(decisions, speakerMapping) {
|
|
3070
3433
|
if (decisions.length === 0) return [];
|
|
3071
3434
|
const lines = ["## Key Decisions", ""];
|
|
3072
3435
|
for (const d of decisions) {
|
|
3073
|
-
lines.push(`### [${d.timestamp}] ${d.decision}`);
|
|
3436
|
+
lines.push(`### [${d.timestamp}] ${replaceNamesInText(d.decision, speakerMapping)}`);
|
|
3074
3437
|
lines.push("");
|
|
3075
3438
|
if (d.context.length > 0) {
|
|
3076
|
-
lines.push(d.context);
|
|
3439
|
+
lines.push(replaceNamesInText(d.context, speakerMapping));
|
|
3077
3440
|
lines.push("");
|
|
3078
3441
|
}
|
|
3079
3442
|
}
|
|
3080
3443
|
return lines;
|
|
3081
3444
|
}
|
|
3082
|
-
function renderConcepts(concepts) {
|
|
3445
|
+
function renderConcepts(concepts, speakerMapping) {
|
|
3083
3446
|
if (concepts.length === 0) return [];
|
|
3084
3447
|
const lines = ["## Key Concepts", ""];
|
|
3085
3448
|
for (const c of concepts) {
|
|
3086
|
-
lines.push(`### [${c.timestamp}] ${c.concept}`);
|
|
3449
|
+
lines.push(`### [${c.timestamp}] ${replaceNamesInText(c.concept, speakerMapping)}`);
|
|
3087
3450
|
lines.push("");
|
|
3088
3451
|
if (c.explanation.length > 0) {
|
|
3089
|
-
lines.push(c.explanation);
|
|
3452
|
+
lines.push(replaceNamesInText(c.explanation, speakerMapping));
|
|
3090
3453
|
lines.push("");
|
|
3091
3454
|
}
|
|
3092
3455
|
}
|
|
3093
3456
|
return lines;
|
|
3094
3457
|
}
|
|
3095
|
-
function renderTopics(topics) {
|
|
3458
|
+
function renderTopics(topics, speakerMapping) {
|
|
3096
3459
|
if (topics.length === 0) return [];
|
|
3097
3460
|
const lines = ["## Topics", ""];
|
|
3098
3461
|
for (const t of topics) {
|
|
3099
3462
|
const tsLabel = t.timestamps.length > 0 ? ` _(${t.timestamps.join(", ")})_` : "";
|
|
3100
|
-
lines.push(`### ${t.title}${tsLabel}`);
|
|
3463
|
+
lines.push(`### ${replaceNamesInText(t.title, speakerMapping)}${tsLabel}`);
|
|
3101
3464
|
lines.push("");
|
|
3102
3465
|
if (t.summary.length > 0) {
|
|
3103
|
-
lines.push(t.summary);
|
|
3466
|
+
lines.push(replaceNamesInText(t.summary, speakerMapping));
|
|
3104
3467
|
lines.push("");
|
|
3105
3468
|
}
|
|
3106
3469
|
if (t.key_points.length > 0) {
|
|
3107
3470
|
for (const kp of t.key_points) {
|
|
3108
|
-
lines.push(`- ${kp}`);
|
|
3471
|
+
lines.push(`- ${replaceNamesInText(kp, speakerMapping)}`);
|
|
3109
3472
|
}
|
|
3110
3473
|
lines.push("");
|
|
3111
3474
|
}
|
|
3112
3475
|
}
|
|
3113
3476
|
return lines;
|
|
3114
3477
|
}
|
|
3115
|
-
function renderQuestions(questions) {
|
|
3478
|
+
function renderQuestions(questions, speakerMapping) {
|
|
3116
3479
|
if (questions.length === 0) return [];
|
|
3117
3480
|
const lines = ["## Questions Raised", ""];
|
|
3118
3481
|
for (const q of questions) {
|
|
3119
3482
|
const status = q.answered ? "(answered)" : "(open)";
|
|
3120
|
-
lines.push(`- **[${q.timestamp}]** ${q.question} ${status}`);
|
|
3483
|
+
lines.push(`- **[${q.timestamp}]** ${replaceNamesInText(q.question, speakerMapping)} ${status}`);
|
|
3121
3484
|
}
|
|
3122
3485
|
lines.push("");
|
|
3123
3486
|
return lines;
|
|
@@ -3128,7 +3491,7 @@ function renderActionItems(items, speakerMapping) {
|
|
|
3128
3491
|
for (const a of items) {
|
|
3129
3492
|
const mentionedBy = applySpeakerMapping(a.mentioned_by, speakerMapping);
|
|
3130
3493
|
const by = mentionedBy.length > 0 ? ` \u2014 _${mentionedBy}_` : "";
|
|
3131
|
-
lines.push(`- **[${a.timestamp}]** ${a.item}${by}`);
|
|
3494
|
+
lines.push(`- **[${a.timestamp}]** ${replaceNamesInText(a.item, speakerMapping)}${by}`);
|
|
3132
3495
|
}
|
|
3133
3496
|
lines.push("");
|
|
3134
3497
|
return lines;
|
|
@@ -3142,20 +3505,20 @@ function writeNotes(params) {
|
|
|
3142
3505
|
if (!hasMeaningfulContent(synthesisResult)) return null;
|
|
3143
3506
|
const sections = ["# Notes", ""];
|
|
3144
3507
|
if (synthesisResult.overview.length > 0) {
|
|
3145
|
-
sections.push(synthesisResult.overview);
|
|
3508
|
+
sections.push(replaceNamesInText(synthesisResult.overview, speakerMapping));
|
|
3146
3509
|
sections.push("");
|
|
3147
3510
|
}
|
|
3148
|
-
sections.push(...renderDecisions(synthesisResult.key_decisions));
|
|
3149
|
-
sections.push(...renderConcepts(synthesisResult.key_concepts));
|
|
3150
|
-
sections.push(...renderTopics(synthesisResult.topics));
|
|
3151
|
-
sections.push(...renderQuestions(synthesisResult.questions_raised));
|
|
3511
|
+
sections.push(...renderDecisions(synthesisResult.key_decisions, speakerMapping));
|
|
3512
|
+
sections.push(...renderConcepts(synthesisResult.key_concepts, speakerMapping));
|
|
3513
|
+
sections.push(...renderTopics(synthesisResult.topics, speakerMapping));
|
|
3514
|
+
sections.push(...renderQuestions(synthesisResult.questions_raised, speakerMapping));
|
|
3152
3515
|
sections.push(...renderActionItems(synthesisResult.action_items, speakerMapping));
|
|
3153
3516
|
while (sections[sections.length - 1] === "") sections.pop();
|
|
3154
3517
|
return sections.join("\n");
|
|
3155
3518
|
}
|
|
3156
3519
|
|
|
3157
3520
|
// src/output/people.ts
|
|
3158
|
-
function renderParticipant(p, index) {
|
|
3521
|
+
function renderParticipant(p, index, speakerMapping) {
|
|
3159
3522
|
const lines = [];
|
|
3160
3523
|
lines.push(`## ${index + 1}. ${p.name}`);
|
|
3161
3524
|
lines.push("");
|
|
@@ -3170,7 +3533,7 @@ function renderParticipant(p, index) {
|
|
|
3170
3533
|
lines.push("**Contributions:**");
|
|
3171
3534
|
lines.push("");
|
|
3172
3535
|
for (const c of p.contributions) {
|
|
3173
|
-
lines.push(`- ${c}`);
|
|
3536
|
+
lines.push(`- ${replaceNamesInText(c, speakerMapping)}`);
|
|
3174
3537
|
}
|
|
3175
3538
|
lines.push("");
|
|
3176
3539
|
}
|
|
@@ -3285,13 +3648,13 @@ function writePeople(params) {
|
|
|
3285
3648
|
for (let i = 0; i < participants.length; i++) {
|
|
3286
3649
|
const p = participants[i];
|
|
3287
3650
|
if (p != null) {
|
|
3288
|
-
sections.push(...renderParticipant(p, i));
|
|
3651
|
+
sections.push(...renderParticipant(p, i, speakerMapping));
|
|
3289
3652
|
}
|
|
3290
3653
|
}
|
|
3291
3654
|
if (peopleExtraction.relationships.length > 0) {
|
|
3292
3655
|
sections.push("## Relationships", "");
|
|
3293
3656
|
for (const r of peopleExtraction.relationships) {
|
|
3294
|
-
sections.push(`- ${r}`);
|
|
3657
|
+
sections.push(`- ${replaceNamesInText(r, speakerMapping)}`);
|
|
3295
3658
|
}
|
|
3296
3659
|
sections.push("");
|
|
3297
3660
|
}
|
|
@@ -3447,7 +3810,7 @@ function renderSynthesisItems(items, speakerMapping) {
|
|
|
3447
3810
|
for (const a of items) {
|
|
3448
3811
|
const mentionedBy = applySpeakerMapping(a.mentioned_by, speakerMapping);
|
|
3449
3812
|
const by = mentionedBy.length > 0 ? ` \u2014 _${mentionedBy}_` : "";
|
|
3450
|
-
lines.push(`- [ ] **[${a.timestamp}]** ${a.item}${by}`);
|
|
3813
|
+
lines.push(`- [ ] **[${a.timestamp}]** ${replaceNamesInText(a.item, speakerMapping)}${by}`);
|
|
3451
3814
|
}
|
|
3452
3815
|
lines.push("");
|
|
3453
3816
|
return lines;
|
|
@@ -3459,7 +3822,7 @@ function renderAssignedTasks(tasks, speakerMapping) {
|
|
|
3459
3822
|
const assignee = applySpeakerMapping(t.assignee, speakerMapping);
|
|
3460
3823
|
const assigneeStr = assignee.length > 0 ? ` \u2192 _${assignee}_` : "";
|
|
3461
3824
|
const deadline = t.deadline.length > 0 ? ` (due: ${t.deadline})` : "";
|
|
3462
|
-
lines.push(`- [ ] **[${t.timestamp}]** ${t.task}${assigneeStr}${deadline}`);
|
|
3825
|
+
lines.push(`- [ ] **[${t.timestamp}]** ${replaceNamesInText(t.task, speakerMapping)}${assigneeStr}${deadline}`);
|
|
3463
3826
|
}
|
|
3464
3827
|
lines.push("");
|
|
3465
3828
|
return lines;
|
|
@@ -3513,19 +3876,19 @@ function collectImplicitDecisions(segments) {
|
|
|
3513
3876
|
}
|
|
3514
3877
|
return decisions;
|
|
3515
3878
|
}
|
|
3516
|
-
function renderEmotionalShifts(shifts) {
|
|
3879
|
+
function renderEmotionalShifts(shifts, speakerMapping) {
|
|
3517
3880
|
if (shifts.length === 0) return [];
|
|
3518
3881
|
const lines = ["## Emotional Shifts", ""];
|
|
3519
3882
|
for (const s of shifts) {
|
|
3520
3883
|
lines.push(`- **[${s.timestamp}]** ${s.from_state} \u2192 ${s.to_state}`);
|
|
3521
3884
|
if (s.trigger.length > 0) {
|
|
3522
|
-
lines.push(` _Trigger: ${s.trigger}_`);
|
|
3885
|
+
lines.push(` _Trigger: ${replaceNamesInText(s.trigger, speakerMapping)}_`);
|
|
3523
3886
|
}
|
|
3524
3887
|
}
|
|
3525
3888
|
lines.push("");
|
|
3526
3889
|
return lines;
|
|
3527
3890
|
}
|
|
3528
|
-
function renderEmphasisPatterns(patterns) {
|
|
3891
|
+
function renderEmphasisPatterns(patterns, speakerMapping) {
|
|
3529
3892
|
if (patterns.length === 0) return [];
|
|
3530
3893
|
const sorted = [...patterns].sort((a, b) => b.times_mentioned - a.times_mentioned);
|
|
3531
3894
|
const lines = ["## Emphasis Patterns", ""];
|
|
@@ -3534,32 +3897,32 @@ function renderEmphasisPatterns(patterns) {
|
|
|
3534
3897
|
lines.push(`### ${p.concept} (\xD7${p.times_mentioned})${ts}`);
|
|
3535
3898
|
lines.push("");
|
|
3536
3899
|
if (p.significance.length > 0) {
|
|
3537
|
-
lines.push(p.significance);
|
|
3900
|
+
lines.push(replaceNamesInText(p.significance, speakerMapping));
|
|
3538
3901
|
lines.push("");
|
|
3539
3902
|
}
|
|
3540
3903
|
}
|
|
3541
3904
|
return lines;
|
|
3542
3905
|
}
|
|
3543
|
-
function renderImplicitQuestions(questions) {
|
|
3906
|
+
function renderImplicitQuestions(questions, speakerMapping) {
|
|
3544
3907
|
if (questions.length === 0) return [];
|
|
3545
3908
|
const lines = ["## Implicit Questions", ""];
|
|
3546
3909
|
for (const q of questions) {
|
|
3547
|
-
lines.push(`- ${q}`);
|
|
3910
|
+
lines.push(`- ${replaceNamesInText(q, speakerMapping)}`);
|
|
3548
3911
|
}
|
|
3549
3912
|
lines.push("");
|
|
3550
3913
|
return lines;
|
|
3551
3914
|
}
|
|
3552
|
-
function renderImplicitDecisions(decisions) {
|
|
3915
|
+
function renderImplicitDecisions(decisions, speakerMapping) {
|
|
3553
3916
|
if (decisions.length === 0) return [];
|
|
3554
3917
|
const lines = ["## Implicit Decisions", ""];
|
|
3555
3918
|
for (const d of decisions) {
|
|
3556
|
-
lines.push(`- ${d}`);
|
|
3919
|
+
lines.push(`- ${replaceNamesInText(d, speakerMapping)}`);
|
|
3557
3920
|
}
|
|
3558
3921
|
lines.push("");
|
|
3559
3922
|
return lines;
|
|
3560
3923
|
}
|
|
3561
3924
|
function writeInsights(params) {
|
|
3562
|
-
const { segments } = params;
|
|
3925
|
+
const { segments, speakerMapping } = params;
|
|
3563
3926
|
const hasPass3d = segments.some((s) => s.pass3d != null);
|
|
3564
3927
|
if (!hasPass3d) return null;
|
|
3565
3928
|
const emotionalShifts = collectEmotionalShifts(segments);
|
|
@@ -3570,10 +3933,10 @@ function writeInsights(params) {
|
|
|
3570
3933
|
return null;
|
|
3571
3934
|
}
|
|
3572
3935
|
const sections = ["# Insights", ""];
|
|
3573
|
-
sections.push(...renderEmotionalShifts(emotionalShifts));
|
|
3574
|
-
sections.push(...renderEmphasisPatterns(emphasisPatterns));
|
|
3575
|
-
sections.push(...renderImplicitQuestions(implicitQuestions));
|
|
3576
|
-
sections.push(...renderImplicitDecisions(implicitDecisions));
|
|
3936
|
+
sections.push(...renderEmotionalShifts(emotionalShifts, speakerMapping));
|
|
3937
|
+
sections.push(...renderEmphasisPatterns(emphasisPatterns, speakerMapping));
|
|
3938
|
+
sections.push(...renderImplicitQuestions(implicitQuestions, speakerMapping));
|
|
3939
|
+
sections.push(...renderImplicitDecisions(implicitDecisions, speakerMapping));
|
|
3577
3940
|
while (sections[sections.length - 1] === "") sections.pop();
|
|
3578
3941
|
return sections.join("\n");
|
|
3579
3942
|
}
|
|
@@ -4330,6 +4693,11 @@ async function reRenderWithSpeakerMapping(params) {
|
|
|
4330
4693
|
};
|
|
4331
4694
|
async function writeOutputFile(filename, content) {
|
|
4332
4695
|
const fullPath = join3(outputDir, filename);
|
|
4696
|
+
try {
|
|
4697
|
+
const existing = await readFile2(fullPath, "utf8");
|
|
4698
|
+
if (existing === content) return;
|
|
4699
|
+
} catch {
|
|
4700
|
+
}
|
|
4333
4701
|
const dir = dirname(fullPath);
|
|
4334
4702
|
if (dir !== outputDir) {
|
|
4335
4703
|
await mkdir(dir, { recursive: true });
|
|
@@ -4584,10 +4952,14 @@ async function runDistill(args) {
|
|
|
4584
4952
|
const result = await handleYouTube(resolved.value, client);
|
|
4585
4953
|
fileUri = result.fileUri;
|
|
4586
4954
|
mimeType = result.mimeType;
|
|
4587
|
-
|
|
4588
|
-
|
|
4589
|
-
|
|
4590
|
-
|
|
4955
|
+
try {
|
|
4956
|
+
duration = await detectDuration({
|
|
4957
|
+
ytDlpDuration: result.duration,
|
|
4958
|
+
geminiDuration: result.duration
|
|
4959
|
+
});
|
|
4960
|
+
} catch {
|
|
4961
|
+
duration = 600;
|
|
4962
|
+
}
|
|
4591
4963
|
if (result.uploadedFileName != null) {
|
|
4592
4964
|
uploadedFileNames = [result.uploadedFileName];
|
|
4593
4965
|
}
|
|
@@ -5095,7 +5467,7 @@ async function run2(args) {
|
|
|
5095
5467
|
}
|
|
5096
5468
|
|
|
5097
5469
|
// src/cli/index.ts
|
|
5098
|
-
var version = "0.4.3";
|
|
5470
|
+
var version = "0.4.5";
|
|
5099
5471
|
var DEFAULT_OUTPUT = "./vidistill-output/";
|
|
5100
5472
|
var SUBCOMMANDS = {
|
|
5101
5473
|
mcp: run,
|
package/package.json
CHANGED