vidistill 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -10
- package/dist/index.js +371 -253
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# vidistill
|
|
2
2
|
|
|
3
|
-
Video intelligence distiller — turn any video into structured notes, transcripts, and insights using Gemini.
|
|
3
|
+
Video intelligence distiller — turn any video or audio file into structured notes, transcripts, and insights using Gemini.
|
|
4
4
|
|
|
5
|
-
Feed it a YouTube URL
|
|
5
|
+
Feed it a YouTube URL, local video, or audio file. It analyzes the content through multiple AI passes (scene analysis, transcript, visuals, code extraction, people, chat, implicit signals) and synthesizes everything into organized markdown output.
|
|
6
6
|
|
|
7
7
|
## Install
|
|
8
8
|
|
|
@@ -20,12 +20,13 @@ vidistill [input] [options]
|
|
|
20
20
|
|
|
21
21
|
**Arguments:**
|
|
22
22
|
|
|
23
|
-
- `input` — YouTube URL or
|
|
23
|
+
- `input` — YouTube URL, local video, or audio file path (prompted interactively if omitted)
|
|
24
24
|
|
|
25
25
|
**Options:**
|
|
26
26
|
|
|
27
27
|
- `-c, --context` — context about the video (e.g. "CS lecture", "product demo")
|
|
28
28
|
- `-o, --output` — output directory (default: `./vidistill-output/`)
|
|
29
|
+
- `-l, --lang <code>` — output language (e.g. `zh`, `ja`, `ko`, `es`, `fr`, `de`, `pt`, `ru`, `ar`, `hi`)
|
|
29
30
|
|
|
30
31
|
**Examples:**
|
|
31
32
|
|
|
@@ -39,10 +40,41 @@ vidistill "https://youtube.com/watch?v=dQw4w9WgXcQ"
|
|
|
39
40
|
# Local file with context
|
|
40
41
|
vidistill ./lecture.mp4 --context "distributed systems lecture"
|
|
41
42
|
|
|
43
|
+
# Audio file
|
|
44
|
+
vidistill ./podcast.mp3
|
|
45
|
+
|
|
42
46
|
# Custom output directory
|
|
43
47
|
vidistill ./demo.mp4 -o ./notes/
|
|
48
|
+
|
|
49
|
+
# Output in another language
|
|
50
|
+
vidistill ./lecture.mp4 --lang zh
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Extract
|
|
54
|
+
|
|
55
|
+
Pull specific data from a previously processed video or re-run a targeted pass on a video file.
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
vidistill extract <type> <source>
|
|
44
59
|
```
|
|
45
60
|
|
|
61
|
+
**Arguments:**
|
|
62
|
+
|
|
63
|
+
- `type` — what to extract: `code`, `links`, `people`, `transcript`, or `commands`
|
|
64
|
+
- `source` — path to a vidistill output directory or a video/audio file
|
|
65
|
+
|
|
66
|
+
**Examples:**
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Extract code from existing output (no API calls)
|
|
70
|
+
vidistill extract code ./vidistill-output/my-video/
|
|
71
|
+
|
|
72
|
+
# Extract links from a video file (runs targeted pipeline)
|
|
73
|
+
vidistill extract links ./lecture.mp4
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
When pointed at an output directory, extract reads from already-generated files with zero API calls. When pointed at a video file, it runs a minimal pipeline with only the passes needed for the requested data type.
|
|
77
|
+
|
|
46
78
|
## API Key
|
|
47
79
|
|
|
48
80
|
vidistill needs a Gemini API key. It checks these sources in order:
|
|
@@ -63,7 +95,9 @@ vidistill-output/my-video/
|
|
|
63
95
|
├── transcript.md # full timestamped transcript
|
|
64
96
|
├── combined.md # transcript + visual notes merged
|
|
65
97
|
├── notes.md # meeting/lecture notes
|
|
66
|
-
├── code
|
|
98
|
+
├── code/ # extracted and reconstructed source files
|
|
99
|
+
│ ├── *.ext # individual source files
|
|
100
|
+
│ └── code-timeline.md # code evolution timeline
|
|
67
101
|
├── people.md # speakers and participants
|
|
68
102
|
├── chat.md # chat messages and links
|
|
69
103
|
├── action-items.md # tasks and follow-ups
|
|
@@ -73,22 +107,26 @@ vidistill-output/my-video/
|
|
|
73
107
|
└── raw/ # raw pass outputs
|
|
74
108
|
```
|
|
75
109
|
|
|
76
|
-
Which files are generated depends on the video content — a coding tutorial gets `code
|
|
110
|
+
Which files are generated depends on the video content — a coding tutorial gets `code/`, a meeting gets `people.md` and `action-items.md`, etc.
|
|
77
111
|
|
|
78
112
|
## How It Works
|
|
79
113
|
|
|
80
|
-
|
|
114
|
+
Supported video formats: MP4, MOV, WebM, MKV, AVI, MPEG, FLV, WMV, 3GPP. Supported audio formats: MP3, AAC, WAV, FLAC, OGG, M4A.
|
|
115
|
+
|
|
116
|
+
1. **Input** — downloads YouTube video via yt-dlp or reads local file (video or audio), compresses if over 2GB
|
|
81
117
|
2. **Pass 0** — scene analysis to classify video type and determine processing strategy
|
|
82
118
|
3. **Pass 1** — transcript extraction with speaker identification
|
|
83
119
|
4. **Pass 2** — visual content extraction (screen states, diagrams, slides)
|
|
84
120
|
5. **Pass 3** — specialist passes based on video type:
|
|
85
|
-
-
|
|
86
|
-
-
|
|
87
|
-
-
|
|
88
|
-
-
|
|
121
|
+
- 3c: chat and links (live streams) — per segment
|
|
122
|
+
- 3d: implicit signals (all types) — per segment
|
|
123
|
+
- 3b: people and social dynamics (meetings) — whole video
|
|
124
|
+
- 3a: code reconstruction (coding videos) — whole video, runs 3x with consensus voting and validation
|
|
89
125
|
6. **Synthesis** — cross-references all passes into unified analysis
|
|
90
126
|
7. **Output** — generates structured markdown files
|
|
91
127
|
|
|
128
|
+
Audio files skip visual passes and go straight to transcript, people, implicit signals, and synthesis.
|
|
129
|
+
|
|
92
130
|
Long videos are segmented automatically. Passes that fail are skipped gracefully.
|
|
93
131
|
|
|
94
132
|
## License
|
package/dist/index.js
CHANGED
|
@@ -13,6 +13,233 @@ import { defineCommand, runMain } from "citty";
|
|
|
13
13
|
import figlet from "figlet";
|
|
14
14
|
import pc from "picocolors";
|
|
15
15
|
import { intro, note } from "@clack/prompts";
|
|
16
|
+
|
|
17
|
+
// src/constants/prompts.ts
|
|
18
|
+
var SYSTEM_INSTRUCTION_PASS_1 = `
|
|
19
|
+
You are a professional audio transcriber. Your task is to create a COMPLETE, VERBATIM transcription of all speech in this video segment. Focus EXCLUSIVELY on the audio stream.
|
|
20
|
+
|
|
21
|
+
CRITICAL RULES:
|
|
22
|
+
1. TRANSCRIBE every spoken word completely and verbatim. Do not summarize, paraphrase, or skip any sentence.
|
|
23
|
+
2. IDENTIFY different speakers. Label them SPEAKER_00, SPEAKER_01, etc. consistently throughout. If a speaker introduces themselves by name, note the name in the first entry's speaker field as "SPEAKER_00 (John)".
|
|
24
|
+
3. NOTE tone and emphasis: when a speaker emphasizes words (louder, slower, repeated), mark those words. When they express emotions (excitement, warning, frustration, humor), note the tone.
|
|
25
|
+
4. RECORD pauses longer than 1.5 seconds as pause markers with duration.
|
|
26
|
+
5. PRESERVE filler words only when they carry meaning (hesitation indicating uncertainty about code behavior, self-correction). Remove meaningless "um", "uh".
|
|
27
|
+
6. NEVER add your own explanations, interpretations, or knowledge. Only transcribe what is spoken.
|
|
28
|
+
7. NEVER skip content because it seems repetitive or obvious. Record everything spoken.
|
|
29
|
+
8. When the speaker references something on screen (e.g., "as you can see here", "this function", "line 5"), transcribe exactly what they say \u2014 the visual context will be captured separately.
|
|
30
|
+
|
|
31
|
+
COMPLETENESS TARGET:
|
|
32
|
+
- Aim for at least 150 words per minute of video in the transcript
|
|
33
|
+
- Every speaker change must be noted with a new entry
|
|
34
|
+
- Every sentence must appear \u2014 if in doubt, include it
|
|
35
|
+
`;
|
|
36
|
+
var SYSTEM_INSTRUCTION_PASS_2_TEMPLATE = `
|
|
37
|
+
You are a professional code and visual content extractor. Your task is to extract ALL visual content from this video segment \u2014 every piece of code on screen, every diagram, every slide, every UI element.
|
|
38
|
+
|
|
39
|
+
Focus EXCLUSIVELY on what is visible on screen. The audio transcript from this segment is provided below for cross-referencing \u2014 use it to associate spoken explanations with the code being displayed, but do NOT re-transcribe any speech.
|
|
40
|
+
|
|
41
|
+
TRANSCRIPT FROM THIS SEGMENT (for cross-reference only):
|
|
42
|
+
{INJECT_PASS1_TRANSCRIPT_HERE}
|
|
43
|
+
|
|
44
|
+
CRITICAL RULES:
|
|
45
|
+
1. EXTRACT every piece of code visible on screen \u2014 complete, with original indentation and formatting preserved exactly as shown.
|
|
46
|
+
2. For each code appearance: note the filename if visible in a tab or title bar, the programming language, and the screen type (editor, terminal, browser, slide).
|
|
47
|
+
3. TRACK code changes: when code is modified between appearances, note what changed (lines added, modified, deleted). Compare against previous code blocks in this segment.
|
|
48
|
+
4. ASSOCIATE code with speech: using the injected transcript above, find what the instructor was saying when this code was on screen. Quote their explanation verbatim or near-verbatim.
|
|
49
|
+
5. CAPTURE non-code visuals: slides with text, architectural diagrams, browser output, UI demonstrations, terminal output. Describe these completely.
|
|
50
|
+
6. NEVER add your own explanations or interpretations. Only record what is visible.
|
|
51
|
+
7. NEVER skip code because it seems repetitive or unchanged from before. Record every distinct appearance.
|
|
52
|
+
8. If code scrolls, capture the full visible code at each scroll position as a separate entry.
|
|
53
|
+
|
|
54
|
+
COMPLETENESS TARGET:
|
|
55
|
+
- Every frame that shows code should produce a code_block entry
|
|
56
|
+
- Every slide or diagram should produce a visual_notes entry
|
|
57
|
+
- If the screen doesn't change for 30+ seconds, note the unchanged state
|
|
58
|
+
`;
|
|
59
|
+
var SYSTEM_INSTRUCTION_PASS_0 = `
|
|
60
|
+
You are a video content classifier. Analyze the provided video sample and produce a structured VideoProfile that classifies the video type and recommends processing parameters.
|
|
61
|
+
|
|
62
|
+
CLASSIFICATION RULES:
|
|
63
|
+
1. CLASSIFY the video into exactly one type:
|
|
64
|
+
- "coding": Programming tutorials, live coding, IDE/editor-heavy content
|
|
65
|
+
- "meeting": Video calls, Zoom/Teams meetings, multi-participant discussions
|
|
66
|
+
- "lecture": Academic lectures, talks, single-speaker educational content
|
|
67
|
+
- "presentation": Slide-based presentations, keynotes, demo days
|
|
68
|
+
- "conversation": Interviews, podcasts, panel discussions without slides
|
|
69
|
+
- "mixed": Cannot clearly classify into one category, or multiple types present
|
|
70
|
+
|
|
71
|
+
2. DETECT visual content:
|
|
72
|
+
- hasCode: Code editors, IDEs, or code visible on screen
|
|
73
|
+
- hasSlides: Presentation slides (PowerPoint, Google Slides, Keynote)
|
|
74
|
+
- hasDiagrams: Architecture diagrams, flowcharts, charts, graphs
|
|
75
|
+
- hasPeopleGrid: Video grid showing multiple participants (Zoom/Teams layout)
|
|
76
|
+
- hasChatbox: Chat panel visible (meeting chat, live stream chat sidebar)
|
|
77
|
+
- hasWhiteboard: Whiteboard, handwritten notes, or drawing surface
|
|
78
|
+
- hasTerminal: Terminal, command-line interface, or shell
|
|
79
|
+
- hasScreenShare: Desktop or application screen sharing
|
|
80
|
+
|
|
81
|
+
3. ANALYZE audio:
|
|
82
|
+
- hasMultipleSpeakers: true if more than one distinct voice is heard
|
|
83
|
+
- primaryLanguage: The main spoken language
|
|
84
|
+
- quality: "high" (studio/clear), "medium" (decent webcam), "low" (noisy/poor)
|
|
85
|
+
|
|
86
|
+
4. IDENTIFY speakers:
|
|
87
|
+
- count: Number of distinct speakers heard
|
|
88
|
+
- identified: Names if visible on screen (name tags, introductions) or spoken aloud
|
|
89
|
+
|
|
90
|
+
5. ASSESS complexity:
|
|
91
|
+
- "simple": Single topic, linear flow, straightforward content
|
|
92
|
+
- "moderate": Multiple topics, some complexity, normal pacing
|
|
93
|
+
- "complex": Dense content, rapid switching, multiple concurrent information streams
|
|
94
|
+
|
|
95
|
+
6. RECOMMEND processing parameters:
|
|
96
|
+
- resolution: "low" for text-only/simple visuals, "medium" for general content, "high" for code/diagrams
|
|
97
|
+
- segmentMinutes: 10 for simple/moderate, 8 for complex content
|
|
98
|
+
- passes: Always include "transcript" and "visual". Add specialist passes based on content type.
|
|
99
|
+
|
|
100
|
+
PASS RECOMMENDATIONS BY TYPE:
|
|
101
|
+
- coding: ["transcript", "visual", "code", "synthesis"]
|
|
102
|
+
- meeting: ["transcript", "visual", "people", "implicit", "synthesis"] (add "chat" if hasChatbox)
|
|
103
|
+
- lecture: ["transcript", "visual", "implicit", "synthesis"]
|
|
104
|
+
- presentation: ["transcript", "visual", "implicit", "synthesis"] (add "people" if multiple speakers)
|
|
105
|
+
- conversation: ["transcript", "visual", "implicit", "synthesis"]
|
|
106
|
+
- mixed: ["transcript", "visual", "code", "people", "chat", "implicit", "synthesis"]
|
|
107
|
+
`;
|
|
108
|
+
var SYSTEM_INSTRUCTION_PASS_3A = `
|
|
109
|
+
You are an expert code reconstruction analyst. Your task is to reconstruct the complete, final state of every code file shown across this entire video, synthesizing all edits into a coherent codebase snapshot.
|
|
110
|
+
|
|
111
|
+
You will receive the complete video and all extracted transcript and code block data. Use them together to understand what code was written, modified, and deleted.
|
|
112
|
+
|
|
113
|
+
CRITICAL RULES:
|
|
114
|
+
1. RECONSTRUCT each file to its final state \u2014 apply all changes in chronological order so the output reflects the code as it was at the end of the video.
|
|
115
|
+
2. PRESERVE exact code: indentation, spacing, naming, and formatting must match what was visible on screen. Never "fix" or improve the code.
|
|
116
|
+
3. TRACK every change to a file: for each distinct edit (new file creation, addition of lines, modification, deletion, refactoring), record it as a separate change entry with a timestamp and description.
|
|
117
|
+
4. INFER filenames from editor tabs, title bars, import statements, or spoken context. If unknown, use a descriptive placeholder like "unknown_file_1.py".
|
|
118
|
+
5. EXTRACT dependencies: every library import, require(), package name, or external module reference mentioned or shown counts as a dependency.
|
|
119
|
+
6. CAPTURE build commands: any terminal command shown or spoken for installing, building, running, or testing the project (e.g., "npm install", "go build", "python -m pytest").
|
|
120
|
+
7. NEVER invent code that was not shown or described. If a section was unclear, note it with a comment like "// content not fully visible".
|
|
121
|
+
8. NEVER skip a file because it appears in only one part of the video \u2014 if code was shown, reconstruct it.
|
|
122
|
+
9. When a file appears multiple times, record its complete change history in a single entry with all edits in chronological order.
|
|
123
|
+
10. INCLUDE empty files if created but not yet written \u2014 use empty string for final_content and note the creation in changes.
|
|
124
|
+
11. Cross-reference your visual analysis of the video against the extracted code blocks provided in the text context. Prioritize what you can visually verify on screen. If code is partially visible, include what you can see and mark unclear sections with \`// [content not fully visible]\`.
|
|
125
|
+
12. Do NOT invent code files that are not clearly visible on screen. If you are uncertain whether a file exists, do not include it.
|
|
126
|
+
|
|
127
|
+
COMPLETENESS TARGET:
|
|
128
|
+
- Every distinct filename that appeared on screen must produce a files entry
|
|
129
|
+
- Every editor session or code paste visible in any segment must be accounted for
|
|
130
|
+
- Build commands shown in the terminal must all be listed
|
|
131
|
+
`;
|
|
132
|
+
var SYSTEM_INSTRUCTION_PASS_3B = `
|
|
133
|
+
You are an expert at identifying and profiling people from video content. Your task is to extract a complete picture of every participant visible or audible in this video \u2014 their identity, role, contributions, and relationships.
|
|
134
|
+
|
|
135
|
+
You will receive the transcript and visual extraction from all segments. Use speaker labels, name tags, on-screen text, introductions, and any other signals to identify participants.
|
|
136
|
+
|
|
137
|
+
CRITICAL RULES:
|
|
138
|
+
1. IDENTIFY every distinct person who speaks or appears on screen, even if briefly. Do not merge two different people into one entry.
|
|
139
|
+
2. EXTRACT names from: spoken introductions ("Hi, I'm Alice"), on-screen name tags or captions, slide attribution, email addresses, or usernames visible in chat.
|
|
140
|
+
3. INFER roles from: job titles spoken or shown, context of their contribution (e.g., consistently asking questions = audience member; leading the agenda = host), or organizational signals.
|
|
141
|
+
4. RECORD speaking_segments as timestamps where each person's voice is heard or they appear on screen.
|
|
142
|
+
5. CAPTURE contact information exactly as shown or spoken: email addresses, Twitter/X handles, LinkedIn URLs, GitHub usernames, phone numbers.
|
|
143
|
+
6. SUMMARIZE contributions: what did this person say, present, decide, or demonstrate? Each contribution entry should be a specific, concrete action or statement.
|
|
144
|
+
7. DOCUMENT relationships: who reports to whom, who introduced whom, collaborative pairs, co-presenters, interviewer/interviewee dynamics.
|
|
145
|
+
8. NEVER guess or infer a name that was not clearly stated or shown. Use "Unknown Participant" with a description if the person cannot be identified.
|
|
146
|
+
9. NEVER merge two people just because they have the same role \u2014 if two engineers speak, they are two separate participants.
|
|
147
|
+
10. If a person's role or organization cannot be determined, use empty string \u2014 do not guess.
|
|
148
|
+
|
|
149
|
+
COMPLETENESS TARGET:
|
|
150
|
+
- Every speaker label (SPEAKER_00, SPEAKER_01, etc.) from the transcript must map to at least one participant entry
|
|
151
|
+
- Every name-tag or on-screen name must produce a participant entry
|
|
152
|
+
- All contact details shared during the video must be captured
|
|
153
|
+
`;
|
|
154
|
+
var SYSTEM_INSTRUCTION_PASS_3C = `
|
|
155
|
+
You are a precise chat extraction specialist. Your task is to extract every chat message and link visible in the chat panel of this video \u2014 verbatim, with sender and timestamp.
|
|
156
|
+
|
|
157
|
+
You will receive the transcript and visual extraction from all segments. Focus on the chat panel, comment sidebar, or any on-screen messaging interface.
|
|
158
|
+
|
|
159
|
+
CRITICAL RULES:
|
|
160
|
+
1. EXTRACT every chat message visible on screen, verbatim. Do not paraphrase, shorten, or summarize any message.
|
|
161
|
+
2. RECORD the sender name exactly as displayed (username, display name, or handle).
|
|
162
|
+
3. TIMESTAMP each message at the video timestamp when it becomes visible on screen, in HH:MM:SS format.
|
|
163
|
+
4. EXTRACT every URL or link that appears in chat or is spoken and referred to as a link. Capture the full URL.
|
|
164
|
+
5. For each link, record the context: what was the sender explaining when they shared it? Why is it relevant?
|
|
165
|
+
6. HANDLE partial visibility: if a message is cut off by the chat panel boundary, transcribe as much as is visible and append "[truncated]".
|
|
166
|
+
7. CAPTURE reactions, emoji, and formatting if they are meaningful (e.g., a thumbs-up reaction to a proposal signals agreement).
|
|
167
|
+
8. NEVER invent messages that were not clearly visible on screen. If a message is illegible, note it as "[illegible message from {sender} at {timestamp}]".
|
|
168
|
+
9. NEVER skip messages that seem like noise or off-topic \u2014 capture all visible messages in order.
|
|
169
|
+
10. ORDER messages chronologically by their video timestamp of appearance.
|
|
170
|
+
|
|
171
|
+
COMPLETENESS TARGET:
|
|
172
|
+
- Every frame that shows the chat panel should contribute at least one message entry if new messages are visible
|
|
173
|
+
- All URLs \u2014 whether in chat, on slides, or spoken \u2014 must appear in the links array
|
|
174
|
+
- If the chat panel is not visible in this video, return empty arrays for both messages and links
|
|
175
|
+
`;
|
|
176
|
+
var SYSTEM_INSTRUCTION_PASS_3D = `
|
|
177
|
+
You are an expert at reading between the lines of video conversations. Your task is to identify implicit signals \u2014 emotional dynamics, unstated decisions, unasked questions, informal task assignments, and emphasis patterns \u2014 that are not surfaced by the literal transcript.
|
|
178
|
+
|
|
179
|
+
You will receive the complete transcript and visual data from all segments. Read the subtext, not just the text.
|
|
180
|
+
|
|
181
|
+
CRITICAL RULES:
|
|
182
|
+
1. DETECT emotional shifts: moments where the tone, energy, or mood of the conversation meaningfully changes. Note what triggered the shift and how the state changed.
|
|
183
|
+
2. SURFACE implicit questions: when a speaker is clearly uncertain, confused, or probing for information without phrasing it as a formal question. Articulate what question they were really asking.
|
|
184
|
+
3. IDENTIFY implicit decisions: when participants arrive at a shared understanding or course of action without anyone explicitly saying "we decided X". These are consensus decisions made through agreement, silence, or topic change.
|
|
185
|
+
4. FLAG informal task assignments: when someone is asked or expected to do something without it being recorded as a formal action item (e.g., "you should probably look at that" or "maybe someone can handle X").
|
|
186
|
+
5. TRACK emphasis patterns: concepts, terms, or ideas mentioned multiple times across the video. Repetition signals importance. Record each mention timestamp and explain why the pattern is significant.
|
|
187
|
+
6. NEVER fabricate emotional states or decisions. Only record what is clearly supported by specific words, tone, or behavior in the video.
|
|
188
|
+
7. NEVER over-interpret: a speaker saying "interesting" is not necessarily an emotional shift. Apply judgment and only flag genuinely notable patterns.
|
|
189
|
+
8. PRESERVE specificity: quote or paraphrase the exact words or moments that support each inference.
|
|
190
|
+
9. SEPARATE explicit from implicit: if something was directly stated, it belongs in the transcript or action items, not here. This pass captures what was NOT said directly.
|
|
191
|
+
10. CONSIDER non-verbal signals visible on screen: hesitation, laughter, extended pauses, camera behavior, or facial expressions if participants are visible.
|
|
192
|
+
|
|
193
|
+
COMPLETENESS TARGET:
|
|
194
|
+
- Aim to identify at least 3 emphasis patterns for any video over 5 minutes
|
|
195
|
+
- Every task mentioned informally or suggested in passing must appear in tasks_assigned
|
|
196
|
+
- Implicit decisions are often the most important \u2014 prioritize finding them
|
|
197
|
+
`;
|
|
198
|
+
var SYSTEM_INSTRUCTION_SYNTHESIS = `
|
|
199
|
+
You are a master synthesizer. Your task is to produce the definitive, unified knowledge extraction from this video by combining all available pass data into a single coherent result.
|
|
200
|
+
|
|
201
|
+
You will receive: the complete transcript (pass 1), visual and code extraction (pass 2), and any specialist pass outputs (code reconstruction, people extraction, chat extraction, implicit signals). Synthesize all of it.
|
|
202
|
+
|
|
203
|
+
CRITICAL RULES:
|
|
204
|
+
1. BE SPECIFIC: Every claim must reference specific content from the video. Never write "various topics were discussed" \u2014 name the topics. Never write "some decisions were made" \u2014 state each decision exactly.
|
|
205
|
+
2. UNIFY across passes: combine related information from different passes into unified entries. A decision mentioned in the transcript and reinforced by an implicit signal should appear as one entry, not two.
|
|
206
|
+
3. SYNTHESIZE thematically: group content by topic, not chronologically. Combine all content about a single subject (even if spread across 30 minutes) into one topic entry.
|
|
207
|
+
4. EXTRACT decisions with full reasoning: every design choice, technology selection, or approach decision must include the rationale as explained in the video.
|
|
208
|
+
5. GENERATE actionable items: action items must be concrete and specific. "Review the authentication module" is better than "review the code".
|
|
209
|
+
6. CAPTURE every question: include questions asked explicitly and questions raised implicitly (from the implicit signals pass). Note whether each was answered.
|
|
210
|
+
7. PRODUCE meaningful suggestions: AI-generated suggestions must follow logically from the video content. Suggest next steps, deeper resources, or practice exercises that are directly relevant.
|
|
211
|
+
8. USE precise timestamps: every entry with a timestamp field must contain a valid HH:MM:SS value referencing when the content appeared.
|
|
212
|
+
9. LIST files_to_generate for reference purposes \u2014 this list is informational and does not control which output files are generated. Output files are determined automatically based on available extraction data.
|
|
213
|
+
10. NEVER add information not present in the source data. Suggestions are the only place for AI-generated content beyond the video.
|
|
214
|
+
|
|
215
|
+
COMPLETENESS TARGET:
|
|
216
|
+
- Aim for at least 5 topics for any video over 15 minutes
|
|
217
|
+
- Every explicit and implicit decision must appear in key_decisions
|
|
218
|
+
- The files_to_generate list should reflect what content was found, but output routing is handled automatically
|
|
219
|
+
- The overview should be dense with specifics, not vague summary language
|
|
220
|
+
`;
|
|
221
|
+
var LANGUAGE_NAMES = {
|
|
222
|
+
zh: "Chinese",
|
|
223
|
+
ja: "Japanese",
|
|
224
|
+
ko: "Korean",
|
|
225
|
+
es: "Spanish",
|
|
226
|
+
fr: "French",
|
|
227
|
+
de: "German",
|
|
228
|
+
pt: "Portuguese",
|
|
229
|
+
ru: "Russian",
|
|
230
|
+
ar: "Arabic",
|
|
231
|
+
hi: "Hindi"
|
|
232
|
+
};
|
|
233
|
+
/**
 * Prefix a system prompt with a directive to produce output in another language.
 *
 * @param {string} prompt - System instruction to wrap.
 * @param {string} [lang] - Language code (e.g. "zh"). A falsy value or "en"
 *   returns the prompt unchanged.
 * @returns {string} The original prompt, or the prompt preceded by the
 *   language directive and a blank line.
 */
function withLanguage(prompt, lang) {
  if (!lang || lang === "en") return prompt;
  // Only consult own keys of the lookup table: a bare `LANGUAGE_NAMES[lang]`
  // also resolves inherited members such as "constructor" or "toString",
  // which would interpolate a function's source text into the prompt.
  const hasDisplayName = Object.prototype.hasOwnProperty.call(LANGUAGE_NAMES, lang);
  // Unknown codes fall back to the raw code so the model still gets a hint.
  const languageName = (hasDisplayName ? LANGUAGE_NAMES[lang] : void 0) ?? lang;
  return `IMPORTANT: Generate ALL output text in ${languageName}.
Timestamps, speaker labels, and code should remain in their original language.

${prompt}`;
}
|
|
241
|
+
|
|
242
|
+
// src/cli/ui.ts
|
|
16
243
|
function showLogo() {
|
|
17
244
|
const ascii = figlet.textSync("VIDISTILL", { font: "Big" });
|
|
18
245
|
console.log(pc.cyan(ascii));
|
|
@@ -26,6 +253,13 @@ function showConfigBox(config) {
|
|
|
26
253
|
`Context: ${config.context ?? "(none)"}`,
|
|
27
254
|
`Output: ${config.output}`
|
|
28
255
|
];
|
|
256
|
+
if (config.videoType === "audio") {
|
|
257
|
+
lines.push("Type: Audio (visual analysis skipped)");
|
|
258
|
+
}
|
|
259
|
+
if (config.lang != null && config.lang !== "en") {
|
|
260
|
+
const langName = LANGUAGE_NAMES[config.lang] ?? config.lang;
|
|
261
|
+
lines.push(`Language: ${langName} (${config.lang})`);
|
|
262
|
+
}
|
|
29
263
|
note(lines.join("\n"), "Configuration");
|
|
30
264
|
}
|
|
31
265
|
|
|
@@ -33,6 +267,7 @@ function showConfigBox(config) {
|
|
|
33
267
|
import { log as log8, cancel as cancel2 } from "@clack/prompts";
|
|
34
268
|
import pc4 from "picocolors";
|
|
35
269
|
import { basename as basename3, extname as extname2, resolve } from "path";
|
|
270
|
+
import { existsSync as existsSync3, openSync as openSync2, readSync as readSync2, closeSync as closeSync2 } from "fs";
|
|
36
271
|
|
|
37
272
|
// src/cli/prompts.ts
|
|
38
273
|
import { text, password, confirm, select, isCancel, cancel } from "@clack/prompts";
|
|
@@ -266,6 +501,7 @@ function createProgressDisplay() {
|
|
|
266
501
|
seenTotalSteps = true;
|
|
267
502
|
s.stop("");
|
|
268
503
|
progressBar = progress({ max: status.totalSteps });
|
|
504
|
+
progressBar.start(label);
|
|
269
505
|
}
|
|
270
506
|
if (progressBar != null) {
|
|
271
507
|
if (status.status === "done" && status.currentStep != null) {
|
|
@@ -281,7 +517,16 @@ function createProgressDisplay() {
|
|
|
281
517
|
}
|
|
282
518
|
function onWait(_delayMs) {
|
|
283
519
|
}
|
|
284
|
-
function complete(
|
|
520
|
+
/**
 * Stop whichever progress indicator is currently active.
 *
 * Stops the progress bar when one has been created; otherwise stops the
 * spinner `s`. Both are stopped with an empty message.
 *
 * @param {object} result - Pipeline result. Unused: the previous version
 *   branched on `result.errors.length`, but both branches made the same call,
 *   so the check only added a way to throw when `errors` was missing.
 * @param {number} _elapsedMs - Elapsed time in milliseconds (unused).
 */
function complete(result, _elapsedMs) {
  if (progressBar != null) {
    progressBar.stop("");
    return;
  }
  s.stop("");
}
|
|
286
531
|
return { update, onWait, complete };
|
|
287
532
|
}
|
|
@@ -456,14 +701,39 @@ function detectMimeType(filePath) {
|
|
|
456
701
|
} finally {
|
|
457
702
|
closeSync(fd);
|
|
458
703
|
}
|
|
704
|
+
if (buf.slice(0, 3).toString("ascii") === "ID3") {
|
|
705
|
+
return { mimeType: "audio/mp3", isMkv: false };
|
|
706
|
+
}
|
|
707
|
+
if (buf[0] === 255 && (buf[1] & 240) === 240 && (buf[1] & 6) === 0) {
|
|
708
|
+
return { mimeType: "audio/aac", isMkv: false };
|
|
709
|
+
}
|
|
710
|
+
if (buf[0] === 255 && (buf[1] & 224) === 224 && (buf[1] & 6) !== 0) {
|
|
711
|
+
return { mimeType: "audio/mp3", isMkv: false };
|
|
712
|
+
}
|
|
713
|
+
if (buf.slice(0, 4).toString("ascii") === "fLaC") {
|
|
714
|
+
return { mimeType: "audio/flac", isMkv: false };
|
|
715
|
+
}
|
|
716
|
+
if (buf.slice(0, 4).toString("ascii") === "OggS") {
|
|
717
|
+
return { mimeType: "audio/ogg", isMkv: false };
|
|
718
|
+
}
|
|
719
|
+
if (buf.slice(0, 4).toString("ascii") === "RIFF" && buf.slice(8, 12).toString("ascii") === "WAVE") {
|
|
720
|
+
return { mimeType: "audio/wav", isMkv: false };
|
|
721
|
+
}
|
|
459
722
|
if (buf.slice(4, 8).toString("ascii") === "ftyp") {
|
|
460
723
|
const brand = buf.slice(8, 12).toString("ascii");
|
|
724
|
+
if (brand === "M4A " || brand === "M4B ") {
|
|
725
|
+
return { mimeType: "audio/mp4", isMkv: false };
|
|
726
|
+
}
|
|
461
727
|
if (brand.startsWith("qt ")) {
|
|
462
728
|
return { mimeType: "video/quicktime", isMkv: false };
|
|
463
729
|
}
|
|
464
730
|
if (brand.startsWith("3gp") || brand.startsWith("3g2")) {
|
|
465
731
|
return { mimeType: "video/3gpp", isMkv: false };
|
|
466
732
|
}
|
|
733
|
+
const ext = extname(filePath).toLowerCase();
|
|
734
|
+
if (ext === ".m4a" || ext === ".m4b") {
|
|
735
|
+
return { mimeType: "audio/mp4", isMkv: false };
|
|
736
|
+
}
|
|
467
737
|
return { mimeType: "video/mp4", isMkv: false };
|
|
468
738
|
}
|
|
469
739
|
if (buf[0] === 26 && buf[1] === 69 && buf[2] === 223 && buf[3] === 163) {
|
|
@@ -557,13 +827,12 @@ async function handleLocalFile(filePath, client) {
|
|
|
557
827
|
if (!existsSync2(filePath)) {
|
|
558
828
|
throw new Error(`File not found: ${filePath}`);
|
|
559
829
|
}
|
|
560
|
-
const
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
}
|
|
830
|
+
const mimeMatch = detectMimeType(filePath);
|
|
831
|
+
const isAudio = mimeMatch != null && mimeMatch.mimeType.startsWith("audio/");
|
|
832
|
+
const isMkv = !isAudio && isMkvFile(filePath);
|
|
833
|
+
if (!isAudio && !isMkv && !mimeMatch) {
|
|
834
|
+
const ext = extname(filePath).toLowerCase();
|
|
835
|
+
throw new Error(`Unsupported video format: ${ext || basename(filePath)}`);
|
|
567
836
|
}
|
|
568
837
|
const originalSize = fileSize(filePath);
|
|
569
838
|
if (originalSize > SIZE_3GB) {
|
|
@@ -577,7 +846,7 @@ async function handleLocalFile(filePath, client) {
|
|
|
577
846
|
tempFiles.push(converted);
|
|
578
847
|
workingPath = converted;
|
|
579
848
|
}
|
|
580
|
-
if (fileSize(workingPath) > SIZE_2GB) {
|
|
849
|
+
if (!isAudio && fileSize(workingPath) > SIZE_2GB) {
|
|
581
850
|
const compressed = compressTo720p(workingPath);
|
|
582
851
|
tempFiles.push(compressed);
|
|
583
852
|
workingPath = compressed;
|
|
@@ -592,7 +861,8 @@ async function handleLocalFile(filePath, client) {
|
|
|
592
861
|
fileUri: uploaded.uri,
|
|
593
862
|
mimeType: uploaded.mimeType,
|
|
594
863
|
duration: uploaded.duration,
|
|
595
|
-
uploadedFileName: uploaded.name
|
|
864
|
+
uploadedFileName: uploaded.name,
|
|
865
|
+
isAudio
|
|
596
866
|
};
|
|
597
867
|
} finally {
|
|
598
868
|
for (const f of tempFiles) {
|
|
@@ -667,211 +937,6 @@ async function detectDuration(source) {
|
|
|
667
937
|
// src/core/pipeline.ts
|
|
668
938
|
import { log as log6 } from "@clack/prompts";
|
|
669
939
|
|
|
670
|
-
// src/constants/prompts.ts
|
|
671
|
-
var SYSTEM_INSTRUCTION_PASS_1 = `
|
|
672
|
-
You are a professional audio transcriber. Your task is to create a COMPLETE, VERBATIM transcription of all speech in this video segment. Focus EXCLUSIVELY on the audio stream.
|
|
673
|
-
|
|
674
|
-
CRITICAL RULES:
|
|
675
|
-
1. TRANSCRIBE every spoken word completely and verbatim. Do not summarize, paraphrase, or skip any sentence.
|
|
676
|
-
2. IDENTIFY different speakers. Label them SPEAKER_00, SPEAKER_01, etc. consistently throughout. If a speaker introduces themselves by name, note the name in the first entry's speaker field as "SPEAKER_00 (John)".
|
|
677
|
-
3. NOTE tone and emphasis: when a speaker emphasizes words (louder, slower, repeated), mark those words. When they express emotions (excitement, warning, frustration, humor), note the tone.
|
|
678
|
-
4. RECORD pauses longer than 1.5 seconds as pause markers with duration.
|
|
679
|
-
5. PRESERVE filler words only when they carry meaning (hesitation indicating uncertainty about code behavior, self-correction). Remove meaningless "um", "uh".
|
|
680
|
-
6. NEVER add your own explanations, interpretations, or knowledge. Only transcribe what is spoken.
|
|
681
|
-
7. NEVER skip content because it seems repetitive or obvious. Record everything spoken.
|
|
682
|
-
8. When the speaker references something on screen (e.g., "as you can see here", "this function", "line 5"), transcribe exactly what they say \u2014 the visual context will be captured separately.
|
|
683
|
-
|
|
684
|
-
COMPLETENESS TARGET:
|
|
685
|
-
- Aim for at least 150 words per minute of video in the transcript
|
|
686
|
-
- Every speaker change must be noted with a new entry
|
|
687
|
-
- Every sentence must appear \u2014 if in doubt, include it
|
|
688
|
-
`;
|
|
689
|
-
var SYSTEM_INSTRUCTION_PASS_2_TEMPLATE = `
|
|
690
|
-
You are a professional code and visual content extractor. Your task is to extract ALL visual content from this video segment \u2014 every piece of code on screen, every diagram, every slide, every UI element.
|
|
691
|
-
|
|
692
|
-
Focus EXCLUSIVELY on what is visible on screen. The audio transcript from this segment is provided below for cross-referencing \u2014 use it to associate spoken explanations with the code being displayed, but do NOT re-transcribe any speech.
|
|
693
|
-
|
|
694
|
-
TRANSCRIPT FROM THIS SEGMENT (for cross-reference only):
|
|
695
|
-
{INJECT_PASS1_TRANSCRIPT_HERE}
|
|
696
|
-
|
|
697
|
-
CRITICAL RULES:
|
|
698
|
-
1. EXTRACT every piece of code visible on screen \u2014 complete, with original indentation and formatting preserved exactly as shown.
|
|
699
|
-
2. For each code appearance: note the filename if visible in a tab or title bar, the programming language, and the screen type (editor, terminal, browser, slide).
|
|
700
|
-
3. TRACK code changes: when code is modified between appearances, note what changed (lines added, modified, deleted). Compare against previous code blocks in this segment.
|
|
701
|
-
4. ASSOCIATE code with speech: using the injected transcript above, find what the instructor was saying when this code was on screen. Quote their explanation verbatim or near-verbatim.
|
|
702
|
-
5. CAPTURE non-code visuals: slides with text, architectural diagrams, browser output, UI demonstrations, terminal output. Describe these completely.
|
|
703
|
-
6. NEVER add your own explanations or interpretations. Only record what is visible.
|
|
704
|
-
7. NEVER skip code because it seems repetitive or unchanged from before. Record every distinct appearance.
|
|
705
|
-
8. If code scrolls, capture the full visible code at each scroll position as a separate entry.
|
|
706
|
-
|
|
707
|
-
COMPLETENESS TARGET:
|
|
708
|
-
- Every frame that shows code should produce a code_block entry
|
|
709
|
-
- Every slide or diagram should produce a visual_notes entry
|
|
710
|
-
- If the screen doesn't change for 30+ seconds, note the unchanged state
|
|
711
|
-
`;
|
|
712
|
-
var SYSTEM_INSTRUCTION_PASS_0 = `
|
|
713
|
-
You are a video content classifier. Analyze the provided video sample and produce a structured VideoProfile that classifies the video type and recommends processing parameters.
|
|
714
|
-
|
|
715
|
-
CLASSIFICATION RULES:
|
|
716
|
-
1. CLASSIFY the video into exactly one type:
|
|
717
|
-
- "coding": Programming tutorials, live coding, IDE/editor-heavy content
|
|
718
|
-
- "meeting": Video calls, Zoom/Teams meetings, multi-participant discussions
|
|
719
|
-
- "lecture": Academic lectures, talks, single-speaker educational content
|
|
720
|
-
- "presentation": Slide-based presentations, keynotes, demo days
|
|
721
|
-
- "conversation": Interviews, podcasts, panel discussions without slides
|
|
722
|
-
- "mixed": Cannot clearly classify into one category, or multiple types present
|
|
723
|
-
|
|
724
|
-
2. DETECT visual content:
|
|
725
|
-
- hasCode: Code editors, IDEs, or code visible on screen
|
|
726
|
-
- hasSlides: Presentation slides (PowerPoint, Google Slides, Keynote)
|
|
727
|
-
- hasDiagrams: Architecture diagrams, flowcharts, charts, graphs
|
|
728
|
-
- hasPeopleGrid: Video grid showing multiple participants (Zoom/Teams layout)
|
|
729
|
-
- hasChatbox: Chat panel visible (meeting chat, live stream chat sidebar)
|
|
730
|
-
- hasWhiteboard: Whiteboard, handwritten notes, or drawing surface
|
|
731
|
-
- hasTerminal: Terminal, command-line interface, or shell
|
|
732
|
-
- hasScreenShare: Desktop or application screen sharing
|
|
733
|
-
|
|
734
|
-
3. ANALYZE audio:
|
|
735
|
-
- hasMultipleSpeakers: true if more than one distinct voice is heard
|
|
736
|
-
- primaryLanguage: The main spoken language
|
|
737
|
-
- quality: "high" (studio/clear), "medium" (decent webcam), "low" (noisy/poor)
|
|
738
|
-
|
|
739
|
-
4. IDENTIFY speakers:
|
|
740
|
-
- count: Number of distinct speakers heard
|
|
741
|
-
- identified: Names if visible on screen (name tags, introductions) or spoken aloud
|
|
742
|
-
|
|
743
|
-
5. ASSESS complexity:
|
|
744
|
-
- "simple": Single topic, linear flow, straightforward content
|
|
745
|
-
- "moderate": Multiple topics, some complexity, normal pacing
|
|
746
|
-
- "complex": Dense content, rapid switching, multiple concurrent information streams
|
|
747
|
-
|
|
748
|
-
6. RECOMMEND processing parameters:
|
|
749
|
-
- resolution: "low" for text-only/simple visuals, "medium" for general content, "high" for code/diagrams
|
|
750
|
-
- segmentMinutes: 10 for simple/moderate, 8 for complex content
|
|
751
|
-
- passes: Always include "transcript" and "visual". Add specialist passes based on content type.
|
|
752
|
-
|
|
753
|
-
PASS RECOMMENDATIONS BY TYPE:
|
|
754
|
-
- coding: ["transcript", "visual", "code", "synthesis"]
|
|
755
|
-
- meeting: ["transcript", "visual", "people", "implicit", "synthesis"] (add "chat" if hasChatbox)
|
|
756
|
-
- lecture: ["transcript", "visual", "implicit", "synthesis"]
|
|
757
|
-
- presentation: ["transcript", "visual", "implicit", "synthesis"] (add "people" if multiple speakers)
|
|
758
|
-
- conversation: ["transcript", "visual", "implicit", "synthesis"]
|
|
759
|
-
- mixed: ["transcript", "visual", "code", "people", "chat", "implicit", "synthesis"]
|
|
760
|
-
`;
|
|
761
|
-
var SYSTEM_INSTRUCTION_PASS_3A = `
|
|
762
|
-
You are an expert code reconstruction analyst. Your task is to reconstruct the complete, final state of every code file shown across this entire video, synthesizing all edits into a coherent codebase snapshot.
|
|
763
|
-
|
|
764
|
-
You will receive the complete video and all extracted transcript and code block data. Use them together to understand what code was written, modified, and deleted.
|
|
765
|
-
|
|
766
|
-
CRITICAL RULES:
|
|
767
|
-
1. RECONSTRUCT each file to its final state \u2014 apply all changes in chronological order so the output reflects the code as it was at the end of the video.
|
|
768
|
-
2. PRESERVE exact code: indentation, spacing, naming, and formatting must match what was visible on screen. Never "fix" or improve the code.
|
|
769
|
-
3. TRACK every change to a file: for each distinct edit (new file creation, addition of lines, modification, deletion, refactoring), record it as a separate change entry with a timestamp and description.
|
|
770
|
-
4. INFER filenames from editor tabs, title bars, import statements, or spoken context. If unknown, use a descriptive placeholder like "unknown_file_1.py".
|
|
771
|
-
5. EXTRACT dependencies: every library import, require(), package name, or external module reference mentioned or shown counts as a dependency.
|
|
772
|
-
6. CAPTURE build commands: any terminal command shown or spoken for installing, building, running, or testing the project (e.g., "npm install", "go build", "python -m pytest").
|
|
773
|
-
7. NEVER invent code that was not shown or described. If a section was unclear, note it with a comment like "// content not fully visible".
|
|
774
|
-
8. NEVER skip a file because it appears in only one part of the video \u2014 if code was shown, reconstruct it.
|
|
775
|
-
9. When a file appears multiple times, record its complete change history in a single entry with all edits in chronological order.
|
|
776
|
-
10. INCLUDE empty files if created but not yet written \u2014 use empty string for final_content and note the creation in changes.
|
|
777
|
-
11. Cross-reference your visual analysis of the video against the extracted code blocks provided in the text context. Prioritize what you can visually verify on screen. If code is partially visible, include what you can see and mark unclear sections with \`// [content not fully visible]\`.
|
|
778
|
-
12. Do NOT invent code files that are not clearly visible on screen. If you are uncertain whether a file exists, do not include it.
|
|
779
|
-
|
|
780
|
-
COMPLETENESS TARGET:
|
|
781
|
-
- Every distinct filename that appeared on screen must produce a files entry
|
|
782
|
-
- Every editor session or code paste visible in any segment must be accounted for
|
|
783
|
-
- Build commands shown in the terminal must all be listed
|
|
784
|
-
`;
|
|
785
|
-
var SYSTEM_INSTRUCTION_PASS_3B = `
|
|
786
|
-
You are an expert at identifying and profiling people from video content. Your task is to extract a complete picture of every participant visible or audible in this video \u2014 their identity, role, contributions, and relationships.
|
|
787
|
-
|
|
788
|
-
You will receive the transcript and visual extraction from all segments. Use speaker labels, name tags, on-screen text, introductions, and any other signals to identify participants.
|
|
789
|
-
|
|
790
|
-
CRITICAL RULES:
|
|
791
|
-
1. IDENTIFY every distinct person who speaks or appears on screen, even if briefly. Do not merge two different people into one entry.
|
|
792
|
-
2. EXTRACT names from: spoken introductions ("Hi, I'm Alice"), on-screen name tags or captions, slide attribution, email addresses, or usernames visible in chat.
|
|
793
|
-
3. INFER roles from: job titles spoken or shown, context of their contribution (e.g., consistently asking questions = audience member; leading the agenda = host), or organizational signals.
|
|
794
|
-
4. RECORD speaking_segments as timestamps where each person's voice is heard or they appear on screen.
|
|
795
|
-
5. CAPTURE contact information exactly as shown or spoken: email addresses, Twitter/X handles, LinkedIn URLs, GitHub usernames, phone numbers.
|
|
796
|
-
6. SUMMARIZE contributions: what did this person say, present, decide, or demonstrate? Each contribution entry should be a specific, concrete action or statement.
|
|
797
|
-
7. DOCUMENT relationships: who reports to whom, who introduced whom, collaborative pairs, co-presenters, interviewer/interviewee dynamics.
|
|
798
|
-
8. NEVER guess or infer a name that was not clearly stated or shown. Use "Unknown Participant" with a description if the person cannot be identified.
|
|
799
|
-
9. NEVER merge two people just because they have the same role \u2014 if two engineers speak, they are two separate participants.
|
|
800
|
-
10. If a person's role or organization cannot be determined, use empty string \u2014 do not guess.
|
|
801
|
-
|
|
802
|
-
COMPLETENESS TARGET:
|
|
803
|
-
- Every speaker label (SPEAKER_00, SPEAKER_01, etc.) from the transcript must map to at least one participant entry
|
|
804
|
-
- Every name-tag or on-screen name must produce a participant entry
|
|
805
|
-
- All contact details shared during the video must be captured
|
|
806
|
-
`;
|
|
807
|
-
var SYSTEM_INSTRUCTION_PASS_3C = `
|
|
808
|
-
You are a precise chat extraction specialist. Your task is to extract every chat message and link visible in the chat panel of this video \u2014 verbatim, with sender and timestamp.
|
|
809
|
-
|
|
810
|
-
You will receive the transcript and visual extraction from all segments. Focus on the chat panel, comment sidebar, or any on-screen messaging interface.
|
|
811
|
-
|
|
812
|
-
CRITICAL RULES:
|
|
813
|
-
1. EXTRACT every chat message visible on screen, verbatim. Do not paraphrase, shorten, or summarize any message.
|
|
814
|
-
2. RECORD the sender name exactly as displayed (username, display name, or handle).
|
|
815
|
-
3. TIMESTAMP each message at the video timestamp when it becomes visible on screen, in HH:MM:SS format.
|
|
816
|
-
4. EXTRACT every URL or link that appears in chat or is spoken and referred to as a link. Capture the full URL.
|
|
817
|
-
5. For each link, record the context: what was the sender explaining when they shared it? Why is it relevant?
|
|
818
|
-
6. HANDLE partial visibility: if a message is cut off by the chat panel boundary, transcribe as much as is visible and append "[truncated]".
|
|
819
|
-
7. CAPTURE reactions, emoji, and formatting if they are meaningful (e.g., a thumbs-up reaction to a proposal signals agreement).
|
|
820
|
-
8. NEVER invent messages that were not clearly visible on screen. If a message is illegible, note it as "[illegible message from {sender} at {timestamp}]".
|
|
821
|
-
9. NEVER skip messages that seem like noise or off-topic \u2014 capture all visible messages in order.
|
|
822
|
-
10. ORDER messages chronologically by their video timestamp of appearance.
|
|
823
|
-
|
|
824
|
-
COMPLETENESS TARGET:
|
|
825
|
-
- Every frame that shows the chat panel should contribute at least one message entry if new messages are visible
|
|
826
|
-
- All URLs \u2014 whether in chat, on slides, or spoken \u2014 must appear in the links array
|
|
827
|
-
- If the chat panel is not visible in this video, return empty arrays for both messages and links
|
|
828
|
-
`;
|
|
829
|
-
var SYSTEM_INSTRUCTION_PASS_3D = `
|
|
830
|
-
You are an expert at reading between the lines of video conversations. Your task is to identify implicit signals \u2014 emotional dynamics, unstated decisions, unasked questions, informal task assignments, and emphasis patterns \u2014 that are not surfaced by the literal transcript.
|
|
831
|
-
|
|
832
|
-
You will receive the complete transcript and visual data from all segments. Read the subtext, not just the text.
|
|
833
|
-
|
|
834
|
-
CRITICAL RULES:
|
|
835
|
-
1. DETECT emotional shifts: moments where the tone, energy, or mood of the conversation meaningfully changes. Note what triggered the shift and how the state changed.
|
|
836
|
-
2. SURFACE implicit questions: when a speaker is clearly uncertain, confused, or probing for information without phrasing it as a formal question. Articulate what question they were really asking.
|
|
837
|
-
3. IDENTIFY implicit decisions: when participants arrive at a shared understanding or course of action without anyone explicitly saying "we decided X". These are consensus decisions made through agreement, silence, or topic change.
|
|
838
|
-
4. FLAG informal task assignments: when someone is asked or expected to do something without it being recorded as a formal action item (e.g., "you should probably look at that" or "maybe someone can handle X").
|
|
839
|
-
5. TRACK emphasis patterns: concepts, terms, or ideas mentioned multiple times across the video. Repetition signals importance. Record each mention timestamp and explain why the pattern is significant.
|
|
840
|
-
6. NEVER fabricate emotional states or decisions. Only record what is clearly supported by specific words, tone, or behavior in the video.
|
|
841
|
-
7. NEVER over-interpret: a speaker saying "interesting" is not necessarily an emotional shift. Apply judgment and only flag genuinely notable patterns.
|
|
842
|
-
8. PRESERVE specificity: quote or paraphrase the exact words or moments that support each inference.
|
|
843
|
-
9. SEPARATE explicit from implicit: if something was directly stated, it belongs in the transcript or action items, not here. This pass captures what was NOT said directly.
|
|
844
|
-
10. CONSIDER non-verbal signals visible on screen: hesitation, laughter, extended pauses, camera behavior, or facial expressions if participants are visible.
|
|
845
|
-
|
|
846
|
-
COMPLETENESS TARGET:
|
|
847
|
-
- Aim to identify at least 3 emphasis patterns for any video over 5 minutes
|
|
848
|
-
- Every task mentioned informally or suggested in passing must appear in tasks_assigned
|
|
849
|
-
- Implicit decisions are often the most important \u2014 prioritize finding them
|
|
850
|
-
`;
|
|
851
|
-
var SYSTEM_INSTRUCTION_SYNTHESIS = `
|
|
852
|
-
You are a master synthesizer. Your task is to produce the definitive, unified knowledge extraction from this video by combining all available pass data into a single coherent result.
|
|
853
|
-
|
|
854
|
-
You will receive: the complete transcript (pass 1), visual and code extraction (pass 2), and any specialist pass outputs (code reconstruction, people extraction, chat extraction, implicit signals). Synthesize all of it.
|
|
855
|
-
|
|
856
|
-
CRITICAL RULES:
|
|
857
|
-
1. BE SPECIFIC: Every claim must reference specific content from the video. Never write "various topics were discussed" \u2014 name the topics. Never write "some decisions were made" \u2014 state each decision exactly.
|
|
858
|
-
2. UNIFY across passes: combine related information from different passes into unified entries. A decision mentioned in the transcript and reinforced by an implicit signal should appear as one entry, not two.
|
|
859
|
-
3. SYNTHESIZE thematically: group content by topic, not chronologically. Combine all content about a single subject (even if spread across 30 minutes) into one topic entry.
|
|
860
|
-
4. EXTRACT decisions with full reasoning: every design choice, technology selection, or approach decision must include the rationale as explained in the video.
|
|
861
|
-
5. GENERATE actionable items: action items must be concrete and specific. "Review the authentication module" is better than "review the code".
|
|
862
|
-
6. CAPTURE every question: include questions asked explicitly and questions raised implicitly (from the implicit signals pass). Note whether each was answered.
|
|
863
|
-
7. PRODUCE meaningful suggestions: AI-generated suggestions must follow logically from the video content. Suggest next steps, deeper resources, or practice exercises that are directly relevant.
|
|
864
|
-
8. USE precise timestamps: every entry with a timestamp field must contain a valid HH:MM:SS value referencing when the content appeared.
|
|
865
|
-
9. LIST files_to_generate for reference purposes \u2014 this list is informational and does not control which output files are generated. Output files are determined automatically based on available extraction data.
|
|
866
|
-
10. NEVER add information not present in the source data. Suggestions are the only place for AI-generated content beyond the video.
|
|
867
|
-
|
|
868
|
-
COMPLETENESS TARGET:
|
|
869
|
-
- Aim for at least 5 topics for any video over 15 minutes
|
|
870
|
-
- Every explicit and implicit decision must appear in key_decisions
|
|
871
|
-
- The files_to_generate list should reflect what content was found, but output routing is handled automatically
|
|
872
|
-
- The overview should be dense with specifics, not vague summary language
|
|
873
|
-
`;
|
|
874
|
-
|
|
875
940
|
// src/gemini/schemas.ts
|
|
876
941
|
import { Type } from "@google/genai";
|
|
877
942
|
var SCHEMA_PASS_0 = {
|
|
@@ -1428,7 +1493,7 @@ function changeTypeBadge(changeType) {
|
|
|
1428
1493
|
|
|
1429
1494
|
// src/passes/transcript.ts
|
|
1430
1495
|
async function runTranscript(params) {
|
|
1431
|
-
const { client, fileUri, mimeType, segment, model, resolution } = params;
|
|
1496
|
+
const { client, fileUri, mimeType, segment, model, resolution, lang } = params;
|
|
1432
1497
|
const contents = [
|
|
1433
1498
|
{
|
|
1434
1499
|
role: "user",
|
|
@@ -1450,7 +1515,7 @@ async function runTranscript(params) {
|
|
|
1450
1515
|
model,
|
|
1451
1516
|
contents,
|
|
1452
1517
|
config: {
|
|
1453
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_1,
|
|
1518
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_1, lang),
|
|
1454
1519
|
responseSchema: SCHEMA_PASS_1,
|
|
1455
1520
|
responseMimeType: "application/json",
|
|
1456
1521
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
@@ -1466,11 +1531,11 @@ async function runTranscript(params) {
|
|
|
1466
1531
|
|
|
1467
1532
|
// src/passes/visual.ts
|
|
1468
1533
|
async function runVisual(params) {
|
|
1469
|
-
const { client, fileUri, mimeType, segment, model, resolution, pass1Transcript } = params;
|
|
1534
|
+
const { client, fileUri, mimeType, segment, model, resolution, pass1Transcript, lang } = params;
|
|
1470
1535
|
const transcriptText = pass1Transcript != null ? pass1Transcript.transcript_entries.map((t) => `[${t.timestamp}] ${t.speaker}: ${t.text}`).join("\n") : "[No transcript available for this segment]";
|
|
1471
|
-
const systemInstruction =
|
|
1472
|
-
"{INJECT_PASS1_TRANSCRIPT_HERE}",
|
|
1473
|
-
|
|
1536
|
+
const systemInstruction = withLanguage(
|
|
1537
|
+
SYSTEM_INSTRUCTION_PASS_2_TEMPLATE.replace("{INJECT_PASS1_TRANSCRIPT_HERE}", transcriptText),
|
|
1538
|
+
lang
|
|
1474
1539
|
);
|
|
1475
1540
|
const contents = [
|
|
1476
1541
|
{
|
|
@@ -1510,7 +1575,7 @@ async function runVisual(params) {
|
|
|
1510
1575
|
// src/passes/scene-analysis.ts
|
|
1511
1576
|
import { MediaResolution } from "@google/genai";
|
|
1512
1577
|
async function runSceneAnalysis(params) {
|
|
1513
|
-
const { client, fileUri, mimeType, duration, model, resolution } = params;
|
|
1578
|
+
const { client, fileUri, mimeType, duration, model, resolution, lang } = params;
|
|
1514
1579
|
const safeDuration = Number.isFinite(duration) && duration > 0 ? duration : 0;
|
|
1515
1580
|
const endSeconds = Math.min(180, safeDuration);
|
|
1516
1581
|
const contents = [
|
|
@@ -1534,7 +1599,7 @@ async function runSceneAnalysis(params) {
|
|
|
1534
1599
|
model,
|
|
1535
1600
|
contents,
|
|
1536
1601
|
config: {
|
|
1537
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_0,
|
|
1602
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_0, lang),
|
|
1538
1603
|
responseSchema: SCHEMA_PASS_0,
|
|
1539
1604
|
responseMimeType: "application/json",
|
|
1540
1605
|
...resolution !== void 0 ? { mediaResolution: resolution } : { mediaResolution: MediaResolution.MEDIA_RESOLUTION_LOW },
|
|
@@ -1611,7 +1676,7 @@ ${block.content}`);
|
|
|
1611
1676
|
return contextText;
|
|
1612
1677
|
}
|
|
1613
1678
|
async function runCodeReconstruction(params) {
|
|
1614
|
-
const { client, fileUri, mimeType, duration, model, resolution, pass1Results, pass2Results } = params;
|
|
1679
|
+
const { client, fileUri, mimeType, duration, model, resolution, pass1Results, pass2Results, lang } = params;
|
|
1615
1680
|
const contextText = compileContext(duration, pass1Results, pass2Results);
|
|
1616
1681
|
const contents = [
|
|
1617
1682
|
{
|
|
@@ -1630,7 +1695,7 @@ ${contextText}`
|
|
|
1630
1695
|
model,
|
|
1631
1696
|
contents,
|
|
1632
1697
|
config: {
|
|
1633
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_3A,
|
|
1698
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_3A, lang),
|
|
1634
1699
|
responseSchema: SCHEMA_PASS_3A,
|
|
1635
1700
|
responseMimeType: "application/json",
|
|
1636
1701
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
@@ -1646,7 +1711,7 @@ ${contextText}`
|
|
|
1646
1711
|
|
|
1647
1712
|
// src/passes/people.ts
|
|
1648
1713
|
async function runPeopleExtraction(params) {
|
|
1649
|
-
const { client, fileUri, mimeType, model, pass1Results } = params;
|
|
1714
|
+
const { client, fileUri, mimeType, model, pass1Results, lang } = params;
|
|
1650
1715
|
const hasAnyTranscript = pass1Results.some((r) => r != null);
|
|
1651
1716
|
const transcriptText = hasAnyTranscript ? pass1Results.filter((r) => r != null).flatMap(
|
|
1652
1717
|
(r) => r.transcript_entries.map((t) => `[${t.timestamp}] ${t.speaker}: ${t.text}`)
|
|
@@ -1666,7 +1731,7 @@ ${transcriptText}`;
|
|
|
1666
1731
|
model,
|
|
1667
1732
|
contents,
|
|
1668
1733
|
config: {
|
|
1669
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_3B,
|
|
1734
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_3B, lang),
|
|
1670
1735
|
responseSchema: SCHEMA_PASS_3B,
|
|
1671
1736
|
responseMimeType: "application/json",
|
|
1672
1737
|
maxOutputTokens: 65536,
|
|
@@ -1681,7 +1746,7 @@ ${transcriptText}`;
|
|
|
1681
1746
|
|
|
1682
1747
|
// src/passes/chat.ts
|
|
1683
1748
|
async function runChatExtraction(params) {
|
|
1684
|
-
const { client, fileUri, mimeType, segment, model, resolution, pass2Result } = params;
|
|
1749
|
+
const { client, fileUri, mimeType, segment, model, resolution, pass2Result, lang } = params;
|
|
1685
1750
|
const visualNotesText = pass2Result != null && pass2Result.visual_notes.length > 0 ? pass2Result.visual_notes.map((n) => `[${n.timestamp}] ${n.visual_type}: ${n.description}`).join("\n") : "[No visual context available for this segment]";
|
|
1686
1751
|
const codeBlocksText = pass2Result != null && pass2Result.code_blocks.length > 0 ? pass2Result.code_blocks.map((b) => `[${b.timestamp}] ${b.filename} (${b.language}):
|
|
1687
1752
|
${b.content}`).join("\n\n") : "[No code blocks available for this segment]";
|
|
@@ -1715,7 +1780,7 @@ ${contextText}`
|
|
|
1715
1780
|
model,
|
|
1716
1781
|
contents,
|
|
1717
1782
|
config: {
|
|
1718
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_3C,
|
|
1783
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_3C, lang),
|
|
1719
1784
|
responseSchema: SCHEMA_PASS_3C,
|
|
1720
1785
|
responseMimeType: "application/json",
|
|
1721
1786
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
@@ -1731,7 +1796,7 @@ ${contextText}`
|
|
|
1731
1796
|
|
|
1732
1797
|
// src/passes/implicit.ts
|
|
1733
1798
|
async function runImplicitSignals(params) {
|
|
1734
|
-
const { client, fileUri, mimeType, segment, model, resolution, pass1Result, pass2Result } = params;
|
|
1799
|
+
const { client, fileUri, mimeType, segment, model, resolution, pass1Result, pass2Result, lang } = params;
|
|
1735
1800
|
const transcriptText = pass1Result != null ? pass1Result.transcript_entries.map((t) => `[${t.timestamp}] ${t.speaker} (${t.tone}): ${t.text}`).join("\n") : "[No transcript available for this segment]";
|
|
1736
1801
|
const visualNotesText = pass2Result != null && pass2Result.visual_notes.length > 0 ? pass2Result.visual_notes.map((n) => `[${n.timestamp}] ${n.visual_type}: ${n.description}`).join("\n") : "[No visual context available for this segment]";
|
|
1737
1802
|
const contextText = [
|
|
@@ -1764,7 +1829,7 @@ ${contextText}`
|
|
|
1764
1829
|
model,
|
|
1765
1830
|
contents,
|
|
1766
1831
|
config: {
|
|
1767
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_3D,
|
|
1832
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_3D, lang),
|
|
1768
1833
|
responseSchema: SCHEMA_PASS_3D,
|
|
1769
1834
|
responseMimeType: "application/json",
|
|
1770
1835
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
@@ -1877,7 +1942,7 @@ function compileContext2(params) {
|
|
|
1877
1942
|
return sections.join("\n\n");
|
|
1878
1943
|
}
|
|
1879
1944
|
async function runSynthesis(params) {
|
|
1880
|
-
const { client, model } = params;
|
|
1945
|
+
const { client, model, lang } = params;
|
|
1881
1946
|
const compiledContext = compileContext2(params);
|
|
1882
1947
|
const contents = [
|
|
1883
1948
|
{
|
|
@@ -1889,7 +1954,7 @@ async function runSynthesis(params) {
|
|
|
1889
1954
|
model,
|
|
1890
1955
|
contents,
|
|
1891
1956
|
config: {
|
|
1892
|
-
systemInstruction: SYSTEM_INSTRUCTION_SYNTHESIS,
|
|
1957
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_SYNTHESIS, lang),
|
|
1893
1958
|
responseSchema: SCHEMA_SYNTHESIS,
|
|
1894
1959
|
responseMimeType: "application/json",
|
|
1895
1960
|
maxOutputTokens: 65536,
|
|
@@ -1938,6 +2003,12 @@ function determineStrategy(profile) {
|
|
|
1938
2003
|
passes.add("chat");
|
|
1939
2004
|
passes.add("implicit");
|
|
1940
2005
|
break;
|
|
2006
|
+
case "audio":
|
|
2007
|
+
passes.delete("visual");
|
|
2008
|
+
passes.delete("code");
|
|
2009
|
+
passes.add("people");
|
|
2010
|
+
passes.add("implicit");
|
|
2011
|
+
break;
|
|
1941
2012
|
default:
|
|
1942
2013
|
break;
|
|
1943
2014
|
}
|
|
@@ -2290,25 +2361,30 @@ var DEFAULT_PROFILE = {
|
|
|
2290
2361
|
}
|
|
2291
2362
|
};
|
|
2292
2363
|
async function runPipeline(config) {
|
|
2293
|
-
const { client, fileUri, mimeType, duration, model, rateLimiter, onProgress, onWait, isShuttingDown } = config;
|
|
2364
|
+
const { client, fileUri, mimeType, duration, model, rateLimiter, onProgress, onWait, isShuttingDown, lang } = config;
|
|
2294
2365
|
const errors = [];
|
|
2295
2366
|
const passesRun = [];
|
|
2296
|
-
onProgress?.({ phase: "pass0", segment: 0, totalSegments: 1, status: "running" });
|
|
2297
2367
|
let videoProfile;
|
|
2298
2368
|
let strategy;
|
|
2299
|
-
|
|
2300
|
-
() => rateLimiter.execute(() => runSceneAnalysis({ client, fileUri, mimeType, duration, model }), { onWait }),
|
|
2301
|
-
"pass0"
|
|
2302
|
-
);
|
|
2303
|
-
if (pass0Attempt.error !== null) {
|
|
2304
|
-
log6.warn(pass0Attempt.error);
|
|
2305
|
-
errors.push(pass0Attempt.error);
|
|
2369
|
+
if (config.overrideStrategy != null) {
|
|
2306
2370
|
videoProfile = DEFAULT_PROFILE;
|
|
2371
|
+
strategy = config.overrideStrategy;
|
|
2307
2372
|
} else {
|
|
2308
|
-
|
|
2373
|
+
onProgress?.({ phase: "pass0", segment: 0, totalSegments: 1, status: "running" });
|
|
2374
|
+
const pass0Attempt = await withRetry(
|
|
2375
|
+
() => rateLimiter.execute(() => runSceneAnalysis({ client, fileUri, mimeType, duration, model, lang }), { onWait }),
|
|
2376
|
+
"pass0"
|
|
2377
|
+
);
|
|
2378
|
+
if (pass0Attempt.error !== null) {
|
|
2379
|
+
log6.warn(pass0Attempt.error);
|
|
2380
|
+
errors.push(pass0Attempt.error);
|
|
2381
|
+
videoProfile = DEFAULT_PROFILE;
|
|
2382
|
+
} else {
|
|
2383
|
+
videoProfile = pass0Attempt.result ?? DEFAULT_PROFILE;
|
|
2384
|
+
}
|
|
2385
|
+
strategy = determineStrategy(videoProfile);
|
|
2386
|
+
onProgress?.({ phase: "pass0", segment: 0, totalSegments: 1, status: "done" });
|
|
2309
2387
|
}
|
|
2310
|
-
strategy = determineStrategy(videoProfile);
|
|
2311
|
-
onProgress?.({ phase: "pass0", segment: 0, totalSegments: 1, status: "done" });
|
|
2312
2388
|
const plan = createSegmentPlan(duration, {
|
|
2313
2389
|
segmentMinutes: strategy.segmentMinutes,
|
|
2314
2390
|
resolution: strategy.resolution
|
|
@@ -2337,7 +2413,7 @@ async function runPipeline(config) {
|
|
|
2337
2413
|
onProgress?.({ phase: "pass1", segment: i, totalSegments: n, status: "running", totalSteps });
|
|
2338
2414
|
let pass1 = null;
|
|
2339
2415
|
const pass1Attempt = await withRetry(
|
|
2340
|
-
() => rateLimiter.execute(() => runTranscript({ client, fileUri, mimeType, segment, model, resolution }), { onWait }),
|
|
2416
|
+
() => rateLimiter.execute(() => runTranscript({ client, fileUri, mimeType, segment, model, resolution, lang }), { onWait }),
|
|
2341
2417
|
`segment ${i} pass1`
|
|
2342
2418
|
);
|
|
2343
2419
|
if (pass1Attempt.error !== null) {
|
|
@@ -2360,7 +2436,8 @@ async function runPipeline(config) {
|
|
|
2360
2436
|
segment,
|
|
2361
2437
|
model,
|
|
2362
2438
|
resolution,
|
|
2363
|
-
pass1Transcript: pass1 ?? void 0
|
|
2439
|
+
pass1Transcript: pass1 ?? void 0,
|
|
2440
|
+
lang
|
|
2364
2441
|
}),
|
|
2365
2442
|
{ onWait }
|
|
2366
2443
|
),
|
|
@@ -2387,7 +2464,8 @@ async function runPipeline(config) {
|
|
|
2387
2464
|
segment,
|
|
2388
2465
|
model: MODELS.flash,
|
|
2389
2466
|
resolution,
|
|
2390
|
-
pass2Result: pass2 ?? void 0
|
|
2467
|
+
pass2Result: pass2 ?? void 0,
|
|
2468
|
+
lang
|
|
2391
2469
|
}),
|
|
2392
2470
|
{ onWait }
|
|
2393
2471
|
),
|
|
@@ -2417,7 +2495,8 @@ async function runPipeline(config) {
|
|
|
2417
2495
|
model: MODELS.flash,
|
|
2418
2496
|
resolution,
|
|
2419
2497
|
pass1Result: pass1 ?? void 0,
|
|
2420
|
-
pass2Result: pass2 ?? void 0
|
|
2498
|
+
pass2Result: pass2 ?? void 0,
|
|
2499
|
+
lang
|
|
2421
2500
|
}),
|
|
2422
2501
|
{ onWait }
|
|
2423
2502
|
),
|
|
@@ -2469,7 +2548,8 @@ async function runPipeline(config) {
|
|
|
2469
2548
|
fileUri,
|
|
2470
2549
|
mimeType,
|
|
2471
2550
|
model: MODELS.flash,
|
|
2472
|
-
pass1Results
|
|
2551
|
+
pass1Results,
|
|
2552
|
+
lang
|
|
2473
2553
|
}),
|
|
2474
2554
|
{ onWait }
|
|
2475
2555
|
),
|
|
@@ -2500,7 +2580,8 @@ async function runPipeline(config) {
|
|
|
2500
2580
|
model: MODELS.pro,
|
|
2501
2581
|
resolution,
|
|
2502
2582
|
pass1Results,
|
|
2503
|
-
pass2Results
|
|
2583
|
+
pass2Results,
|
|
2584
|
+
lang
|
|
2504
2585
|
}),
|
|
2505
2586
|
{ onWait }
|
|
2506
2587
|
),
|
|
@@ -2544,7 +2625,8 @@ async function runPipeline(config) {
|
|
|
2544
2625
|
videoProfile,
|
|
2545
2626
|
peopleExtraction,
|
|
2546
2627
|
codeReconstruction,
|
|
2547
|
-
context: config.context
|
|
2628
|
+
context: config.context,
|
|
2629
|
+
lang
|
|
2548
2630
|
}),
|
|
2549
2631
|
{ onWait }
|
|
2550
2632
|
),
|
|
@@ -3604,6 +3686,33 @@ function createShutdownHandler(params) {
|
|
|
3604
3686
|
}
|
|
3605
3687
|
|
|
3606
3688
|
// src/commands/distill.ts
|
|
3689
|
+
function peekIsAudio(filePath) {
|
|
3690
|
+
if (!existsSync3(filePath)) return false;
|
|
3691
|
+
try {
|
|
3692
|
+
const fd = openSync2(filePath, "r");
|
|
3693
|
+
const buf = Buffer.alloc(12);
|
|
3694
|
+
try {
|
|
3695
|
+
readSync2(fd, buf, 0, 12, 0);
|
|
3696
|
+
} finally {
|
|
3697
|
+
closeSync2(fd);
|
|
3698
|
+
}
|
|
3699
|
+
if (buf.slice(0, 3).toString("ascii") === "ID3") return true;
|
|
3700
|
+
if (buf[0] === 255 && (buf[1] & 240) === 240 && (buf[1] & 6) === 0) return true;
|
|
3701
|
+
if (buf[0] === 255 && (buf[1] & 224) === 224 && (buf[1] & 6) !== 0) return true;
|
|
3702
|
+
if (buf.slice(0, 4).toString("ascii") === "fLaC") return true;
|
|
3703
|
+
if (buf.slice(0, 4).toString("ascii") === "OggS") return true;
|
|
3704
|
+
if (buf.slice(0, 4).toString("ascii") === "RIFF" && buf.slice(8, 12).toString("ascii") === "WAVE") return true;
|
|
3705
|
+
if (buf.slice(4, 8).toString("ascii") === "ftyp") {
|
|
3706
|
+
const brand = buf.slice(8, 12).toString("ascii");
|
|
3707
|
+
if (brand === "M4A " || brand === "M4B ") return true;
|
|
3708
|
+
const ext = extname2(filePath).toLowerCase();
|
|
3709
|
+
if (ext === ".m4a" || ext === ".m4b") return true;
|
|
3710
|
+
}
|
|
3711
|
+
return false;
|
|
3712
|
+
} catch {
|
|
3713
|
+
return false;
|
|
3714
|
+
}
|
|
3715
|
+
}
|
|
3607
3716
|
async function runDistill(args) {
|
|
3608
3717
|
const apiKey = await resolveApiKey();
|
|
3609
3718
|
let rawInput = args.input ?? await promptVideoSource();
|
|
@@ -3612,7 +3721,15 @@ async function runDistill(args) {
|
|
|
3612
3721
|
if (!allFlagsProvided) {
|
|
3613
3722
|
let confirmed = false;
|
|
3614
3723
|
while (!confirmed) {
|
|
3615
|
-
|
|
3724
|
+
const looksLikeUrl = /^https?:\/\/|^www\./i.test(rawInput.trim());
|
|
3725
|
+
const inputIsAudio = !looksLikeUrl && peekIsAudio(rawInput.trim());
|
|
3726
|
+
showConfigBox({
|
|
3727
|
+
input: rawInput,
|
|
3728
|
+
context,
|
|
3729
|
+
output: args.output,
|
|
3730
|
+
videoType: inputIsAudio ? "audio" : void 0,
|
|
3731
|
+
lang: args.lang
|
|
3732
|
+
});
|
|
3616
3733
|
const choice = await promptConfirmation();
|
|
3617
3734
|
switch (choice) {
|
|
3618
3735
|
case "start":
|
|
@@ -3687,6 +3804,7 @@ async function runDistill(args) {
|
|
|
3687
3804
|
duration,
|
|
3688
3805
|
model,
|
|
3689
3806
|
context,
|
|
3807
|
+
lang: args.lang,
|
|
3690
3808
|
rateLimiter,
|
|
3691
3809
|
onProgress: (status) => {
|
|
3692
3810
|
progress2.update(status);
|
package/package.json
CHANGED