vidistill 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -10
- package/dist/index.js +360 -252
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# vidistill
|
|
2
2
|
|
|
3
|
-
Video intelligence distiller — turn any video into structured notes, transcripts, and insights using Gemini.
|
|
3
|
+
Video intelligence distiller — turn any video or audio file into structured notes, transcripts, and insights using Gemini.
|
|
4
4
|
|
|
5
|
-
Feed it a YouTube URL
|
|
5
|
+
Feed it a YouTube URL, local video, or audio file. It analyzes the content through multiple AI passes (scene analysis, transcript, visuals, code extraction, people, chat, implicit signals) and synthesizes everything into organized markdown output.
|
|
6
6
|
|
|
7
7
|
## Install
|
|
8
8
|
|
|
@@ -20,12 +20,13 @@ vidistill [input] [options]
|
|
|
20
20
|
|
|
21
21
|
**Arguments:**
|
|
22
22
|
|
|
23
|
-
- `input` — YouTube URL or
|
|
23
|
+
- `input` — YouTube URL, local video, or audio file path (prompted interactively if omitted)
|
|
24
24
|
|
|
25
25
|
**Options:**
|
|
26
26
|
|
|
27
27
|
- `-c, --context` — context about the video (e.g. "CS lecture", "product demo")
|
|
28
28
|
- `-o, --output` — output directory (default: `./vidistill-output/`)
|
|
29
|
+
- `-l, --lang <code>` — output language (e.g. `zh`, `ja`, `ko`, `es`, `fr`, `de`, `pt`, `ru`, `ar`, `hi`)
|
|
29
30
|
|
|
30
31
|
**Examples:**
|
|
31
32
|
|
|
@@ -39,10 +40,41 @@ vidistill "https://youtube.com/watch?v=dQw4w9WgXcQ"
|
|
|
39
40
|
# Local file with context
|
|
40
41
|
vidistill ./lecture.mp4 --context "distributed systems lecture"
|
|
41
42
|
|
|
43
|
+
# Audio file
|
|
44
|
+
vidistill ./podcast.mp3
|
|
45
|
+
|
|
42
46
|
# Custom output directory
|
|
43
47
|
vidistill ./demo.mp4 -o ./notes/
|
|
48
|
+
|
|
49
|
+
# Output in another language
|
|
50
|
+
vidistill ./lecture.mp4 --lang zh
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Extract
|
|
54
|
+
|
|
55
|
+
Pull specific data from a previously processed video or re-run a targeted pass on a video or audio file.
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
vidistill extract <type> <source>
|
|
44
59
|
```
|
|
45
60
|
|
|
61
|
+
**Arguments:**
|
|
62
|
+
|
|
63
|
+
- `type` — what to extract: `code`, `links`, `people`, `transcript`, or `commands`
|
|
64
|
+
- `source` — path to a vidistill output directory or a video/audio file
|
|
65
|
+
|
|
66
|
+
**Examples:**
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Extract code from existing output (no API calls)
|
|
70
|
+
vidistill extract code ./vidistill-output/my-video/
|
|
71
|
+
|
|
72
|
+
# Extract links from a video file (runs targeted pipeline)
|
|
73
|
+
vidistill extract links ./lecture.mp4
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
When pointed at an output directory, extract reads from already-generated files with zero API calls. When pointed at a video or audio file, it runs a minimal pipeline with only the passes needed for the requested data type.
|
|
77
|
+
|
|
46
78
|
## API Key
|
|
47
79
|
|
|
48
80
|
vidistill needs a Gemini API key. It checks these sources in order:
|
|
@@ -63,7 +95,9 @@ vidistill-output/my-video/
|
|
|
63
95
|
├── transcript.md # full timestamped transcript
|
|
64
96
|
├── combined.md # transcript + visual notes merged
|
|
65
97
|
├── notes.md # meeting/lecture notes
|
|
66
|
-
├── code
|
|
98
|
+
├── code/ # extracted and reconstructed source files
|
|
99
|
+
│ ├── *.ext # individual source files
|
|
100
|
+
│ └── code-timeline.md # code evolution timeline
|
|
67
101
|
├── people.md # speakers and participants
|
|
68
102
|
├── chat.md # chat messages and links
|
|
69
103
|
├── action-items.md # tasks and follow-ups
|
|
@@ -73,22 +107,26 @@ vidistill-output/my-video/
|
|
|
73
107
|
└── raw/ # raw pass outputs
|
|
74
108
|
```
|
|
75
109
|
|
|
76
|
-
Which files are generated depends on the video content — a coding tutorial gets `code
|
|
110
|
+
Which files are generated depends on the video content — a coding tutorial gets `code/`, a meeting gets `people.md` and `action-items.md`, etc.
|
|
77
111
|
|
|
78
112
|
## How It Works
|
|
79
113
|
|
|
80
|
-
|
|
114
|
+
Supported video formats: MP4, MOV, WebM, MKV, AVI, MPEG, FLV, WMV, 3GPP. Supported audio formats: MP3, AAC, WAV, FLAC, OGG, M4A.
|
|
115
|
+
|
|
116
|
+
1. **Input** — downloads YouTube video via yt-dlp or reads local file (video or audio), compresses video if over 2GB
|
|
81
117
|
2. **Pass 0** — scene analysis to classify video type and determine processing strategy
|
|
82
118
|
3. **Pass 1** — transcript extraction with speaker identification
|
|
83
119
|
4. **Pass 2** — visual content extraction (screen states, diagrams, slides)
|
|
84
120
|
5. **Pass 3** — specialist passes based on video type:
|
|
85
|
-
-
|
|
86
|
-
-
|
|
87
|
-
-
|
|
88
|
-
-
|
|
121
|
+
- 3c: chat and links (live streams) — per segment
|
|
122
|
+
- 3d: implicit signals (all types) — per segment
|
|
123
|
+
- 3b: people and social dynamics (meetings) — whole video
|
|
124
|
+
- 3a: code reconstruction (coding videos) — whole video, runs 3x with consensus voting and validation
|
|
89
125
|
6. **Synthesis** — cross-references all passes into unified analysis
|
|
90
126
|
7. **Output** — generates structured markdown files
|
|
91
127
|
|
|
128
|
+
Audio files skip visual passes and go straight to transcript, people, implicit signals, and synthesis.
|
|
129
|
+
|
|
92
130
|
Long videos are segmented automatically. Passes that fail are skipped gracefully.
|
|
93
131
|
|
|
94
132
|
## License
|
package/dist/index.js
CHANGED
|
@@ -13,6 +13,233 @@ import { defineCommand, runMain } from "citty";
|
|
|
13
13
|
import figlet from "figlet";
|
|
14
14
|
import pc from "picocolors";
|
|
15
15
|
import { intro, note } from "@clack/prompts";
|
|
16
|
+
|
|
17
|
+
// src/constants/prompts.ts
|
|
18
|
+
var SYSTEM_INSTRUCTION_PASS_1 = `
|
|
19
|
+
You are a professional audio transcriber. Your task is to create a COMPLETE, VERBATIM transcription of all speech in this video segment. Focus EXCLUSIVELY on the audio stream.
|
|
20
|
+
|
|
21
|
+
CRITICAL RULES:
|
|
22
|
+
1. TRANSCRIBE every spoken word completely and verbatim. Do not summarize, paraphrase, or skip any sentence.
|
|
23
|
+
2. IDENTIFY different speakers. Label them SPEAKER_00, SPEAKER_01, etc. consistently throughout. If a speaker introduces themselves by name, note the name in the first entry's speaker field as "SPEAKER_00 (John)".
|
|
24
|
+
3. NOTE tone and emphasis: when a speaker emphasizes words (louder, slower, repeated), mark those words. When they express emotions (excitement, warning, frustration, humor), note the tone.
|
|
25
|
+
4. RECORD pauses longer than 1.5 seconds as pause markers with duration.
|
|
26
|
+
5. PRESERVE filler words only when they carry meaning (hesitation indicating uncertainty about code behavior, self-correction). Remove meaningless "um", "uh".
|
|
27
|
+
6. NEVER add your own explanations, interpretations, or knowledge. Only transcribe what is spoken.
|
|
28
|
+
7. NEVER skip content because it seems repetitive or obvious. Record everything spoken.
|
|
29
|
+
8. When the speaker references something on screen (e.g., "as you can see here", "this function", "line 5"), transcribe exactly what they say \u2014 the visual context will be captured separately.
|
|
30
|
+
|
|
31
|
+
COMPLETENESS TARGET:
|
|
32
|
+
- Aim for at least 150 words per minute of video in the transcript
|
|
33
|
+
- Every speaker change must be noted with a new entry
|
|
34
|
+
- Every sentence must appear \u2014 if in doubt, include it
|
|
35
|
+
`;
|
|
36
|
+
var SYSTEM_INSTRUCTION_PASS_2_TEMPLATE = `
|
|
37
|
+
You are a professional code and visual content extractor. Your task is to extract ALL visual content from this video segment \u2014 every piece of code on screen, every diagram, every slide, every UI element.
|
|
38
|
+
|
|
39
|
+
Focus EXCLUSIVELY on what is visible on screen. The audio transcript from this segment is provided below for cross-referencing \u2014 use it to associate spoken explanations with the code being displayed, but do NOT re-transcribe any speech.
|
|
40
|
+
|
|
41
|
+
TRANSCRIPT FROM THIS SEGMENT (for cross-reference only):
|
|
42
|
+
{INJECT_PASS1_TRANSCRIPT_HERE}
|
|
43
|
+
|
|
44
|
+
CRITICAL RULES:
|
|
45
|
+
1. EXTRACT every piece of code visible on screen \u2014 complete, with original indentation and formatting preserved exactly as shown.
|
|
46
|
+
2. For each code appearance: note the filename if visible in a tab or title bar, the programming language, and the screen type (editor, terminal, browser, slide).
|
|
47
|
+
3. TRACK code changes: when code is modified between appearances, note what changed (lines added, modified, deleted). Compare against previous code blocks in this segment.
|
|
48
|
+
4. ASSOCIATE code with speech: using the injected transcript above, find what the instructor was saying when this code was on screen. Quote their explanation verbatim or near-verbatim.
|
|
49
|
+
5. CAPTURE non-code visuals: slides with text, architectural diagrams, browser output, UI demonstrations, terminal output. Describe these completely.
|
|
50
|
+
6. NEVER add your own explanations or interpretations. Only record what is visible.
|
|
51
|
+
7. NEVER skip code because it seems repetitive or unchanged from before. Record every distinct appearance.
|
|
52
|
+
8. If code scrolls, capture the full visible code at each scroll position as a separate entry.
|
|
53
|
+
|
|
54
|
+
COMPLETENESS TARGET:
|
|
55
|
+
- Every frame that shows code should produce a code_block entry
|
|
56
|
+
- Every slide or diagram should produce a visual_notes entry
|
|
57
|
+
- If the screen doesn't change for 30+ seconds, note the unchanged state
|
|
58
|
+
`;
|
|
59
|
+
var SYSTEM_INSTRUCTION_PASS_0 = `
|
|
60
|
+
You are a video content classifier. Analyze the provided video sample and produce a structured VideoProfile that classifies the video type and recommends processing parameters.
|
|
61
|
+
|
|
62
|
+
CLASSIFICATION RULES:
|
|
63
|
+
1. CLASSIFY the video into exactly one type:
|
|
64
|
+
- "coding": Programming tutorials, live coding, IDE/editor-heavy content
|
|
65
|
+
- "meeting": Video calls, Zoom/Teams meetings, multi-participant discussions
|
|
66
|
+
- "lecture": Academic lectures, talks, single-speaker educational content
|
|
67
|
+
- "presentation": Slide-based presentations, keynotes, demo days
|
|
68
|
+
- "conversation": Interviews, podcasts, panel discussions without slides
|
|
69
|
+
- "mixed": Cannot clearly classify into one category, or multiple types present
|
|
70
|
+
|
|
71
|
+
2. DETECT visual content:
|
|
72
|
+
- hasCode: Code editors, IDEs, or code visible on screen
|
|
73
|
+
- hasSlides: Presentation slides (PowerPoint, Google Slides, Keynote)
|
|
74
|
+
- hasDiagrams: Architecture diagrams, flowcharts, charts, graphs
|
|
75
|
+
- hasPeopleGrid: Video grid showing multiple participants (Zoom/Teams layout)
|
|
76
|
+
- hasChatbox: Chat panel visible (meeting chat, live stream chat sidebar)
|
|
77
|
+
- hasWhiteboard: Whiteboard, handwritten notes, or drawing surface
|
|
78
|
+
- hasTerminal: Terminal, command-line interface, or shell
|
|
79
|
+
- hasScreenShare: Desktop or application screen sharing
|
|
80
|
+
|
|
81
|
+
3. ANALYZE audio:
|
|
82
|
+
- hasMultipleSpeakers: true if more than one distinct voice is heard
|
|
83
|
+
- primaryLanguage: The main spoken language
|
|
84
|
+
- quality: "high" (studio/clear), "medium" (decent webcam), "low" (noisy/poor)
|
|
85
|
+
|
|
86
|
+
4. IDENTIFY speakers:
|
|
87
|
+
- count: Number of distinct speakers heard
|
|
88
|
+
- identified: Names if visible on screen (name tags, introductions) or spoken aloud
|
|
89
|
+
|
|
90
|
+
5. ASSESS complexity:
|
|
91
|
+
- "simple": Single topic, linear flow, straightforward content
|
|
92
|
+
- "moderate": Multiple topics, some complexity, normal pacing
|
|
93
|
+
- "complex": Dense content, rapid switching, multiple concurrent information streams
|
|
94
|
+
|
|
95
|
+
6. RECOMMEND processing parameters:
|
|
96
|
+
- resolution: "low" for text-only/simple visuals, "medium" for general content, "high" for code/diagrams
|
|
97
|
+
- segmentMinutes: 10 for simple/moderate, 8 for complex content
|
|
98
|
+
- passes: Always include "transcript" and "visual". Add specialist passes based on content type.
|
|
99
|
+
|
|
100
|
+
PASS RECOMMENDATIONS BY TYPE:
|
|
101
|
+
- coding: ["transcript", "visual", "code", "synthesis"]
|
|
102
|
+
- meeting: ["transcript", "visual", "people", "implicit", "synthesis"] (add "chat" if hasChatbox)
|
|
103
|
+
- lecture: ["transcript", "visual", "implicit", "synthesis"]
|
|
104
|
+
- presentation: ["transcript", "visual", "implicit", "synthesis"] (add "people" if multiple speakers)
|
|
105
|
+
- conversation: ["transcript", "visual", "implicit", "synthesis"]
|
|
106
|
+
- mixed: ["transcript", "visual", "code", "people", "chat", "implicit", "synthesis"]
|
|
107
|
+
`;
|
|
108
|
+
var SYSTEM_INSTRUCTION_PASS_3A = `
|
|
109
|
+
You are an expert code reconstruction analyst. Your task is to reconstruct the complete, final state of every code file shown across this entire video, synthesizing all edits into a coherent codebase snapshot.
|
|
110
|
+
|
|
111
|
+
You will receive the complete video and all extracted transcript and code block data. Use them together to understand what code was written, modified, and deleted.
|
|
112
|
+
|
|
113
|
+
CRITICAL RULES:
|
|
114
|
+
1. RECONSTRUCT each file to its final state \u2014 apply all changes in chronological order so the output reflects the code as it was at the end of the video.
|
|
115
|
+
2. PRESERVE exact code: indentation, spacing, naming, and formatting must match what was visible on screen. Never "fix" or improve the code.
|
|
116
|
+
3. TRACK every change to a file: for each distinct edit (new file creation, addition of lines, modification, deletion, refactoring), record it as a separate change entry with a timestamp and description.
|
|
117
|
+
4. INFER filenames from editor tabs, title bars, import statements, or spoken context. If unknown, use a descriptive placeholder like "unknown_file_1.py".
|
|
118
|
+
5. EXTRACT dependencies: every library import, require(), package name, or external module reference mentioned or shown counts as a dependency.
|
|
119
|
+
6. CAPTURE build commands: any terminal command shown or spoken for installing, building, running, or testing the project (e.g., "npm install", "go build", "python -m pytest").
|
|
120
|
+
7. NEVER invent code that was not shown or described. If a section was unclear, note it with a comment like "// content not fully visible".
|
|
121
|
+
8. NEVER skip a file because it appears in only one part of the video \u2014 if code was shown, reconstruct it.
|
|
122
|
+
9. When a file appears multiple times, record its complete change history in a single entry with all edits in chronological order.
|
|
123
|
+
10. INCLUDE empty files if created but not yet written \u2014 use empty string for final_content and note the creation in changes.
|
|
124
|
+
11. Cross-reference your visual analysis of the video against the extracted code blocks provided in the text context. Prioritize what you can visually verify on screen. If code is partially visible, include what you can see and mark unclear sections with \`// [content not fully visible]\`.
|
|
125
|
+
12. Do NOT invent code files that are not clearly visible on screen. If you are uncertain whether a file exists, do not include it.
|
|
126
|
+
|
|
127
|
+
COMPLETENESS TARGET:
|
|
128
|
+
- Every distinct filename that appeared on screen must produce a files entry
|
|
129
|
+
- Every editor session or code paste visible in any segment must be accounted for
|
|
130
|
+
- Build commands shown in the terminal must all be listed
|
|
131
|
+
`;
|
|
132
|
+
var SYSTEM_INSTRUCTION_PASS_3B = `
|
|
133
|
+
You are an expert at identifying and profiling people from video content. Your task is to extract a complete picture of every participant visible or audible in this video \u2014 their identity, role, contributions, and relationships.
|
|
134
|
+
|
|
135
|
+
You will receive the transcript and visual extraction from all segments. Use speaker labels, name tags, on-screen text, introductions, and any other signals to identify participants.
|
|
136
|
+
|
|
137
|
+
CRITICAL RULES:
|
|
138
|
+
1. IDENTIFY every distinct person who speaks or appears on screen, even if briefly. Do not merge two different people into one entry.
|
|
139
|
+
2. EXTRACT names from: spoken introductions ("Hi, I'm Alice"), on-screen name tags or captions, slide attribution, email addresses, or usernames visible in chat.
|
|
140
|
+
3. INFER roles from: job titles spoken or shown, context of their contribution (e.g., consistently asking questions = audience member; leading the agenda = host), or organizational signals.
|
|
141
|
+
4. RECORD speaking_segments as timestamps where each person's voice is heard or they appear on screen.
|
|
142
|
+
5. CAPTURE contact information exactly as shown or spoken: email addresses, Twitter/X handles, LinkedIn URLs, GitHub usernames, phone numbers.
|
|
143
|
+
6. SUMMARIZE contributions: what did this person say, present, decide, or demonstrate? Each contribution entry should be a specific, concrete action or statement.
|
|
144
|
+
7. DOCUMENT relationships: who reports to whom, who introduced whom, collaborative pairs, co-presenters, interviewer/interviewee dynamics.
|
|
145
|
+
8. NEVER guess or infer a name that was not clearly stated or shown. Use "Unknown Participant" with a description if the person cannot be identified.
|
|
146
|
+
9. NEVER merge two people just because they have the same role \u2014 if two engineers speak, they are two separate participants.
|
|
147
|
+
10. If a person's role or organization cannot be determined, use empty string \u2014 do not guess.
|
|
148
|
+
|
|
149
|
+
COMPLETENESS TARGET:
|
|
150
|
+
- Every speaker label (SPEAKER_00, SPEAKER_01, etc.) from the transcript must map to at least one participant entry
|
|
151
|
+
- Every name-tag or on-screen name must produce a participant entry
|
|
152
|
+
- All contact details shared during the video must be captured
|
|
153
|
+
`;
|
|
154
|
+
var SYSTEM_INSTRUCTION_PASS_3C = `
|
|
155
|
+
You are a precise chat extraction specialist. Your task is to extract every chat message and link visible in the chat panel of this video \u2014 verbatim, with sender and timestamp.
|
|
156
|
+
|
|
157
|
+
You will receive the transcript and visual extraction from all segments. Focus on the chat panel, comment sidebar, or any on-screen messaging interface.
|
|
158
|
+
|
|
159
|
+
CRITICAL RULES:
|
|
160
|
+
1. EXTRACT every chat message visible on screen, verbatim. Do not paraphrase, shorten, or summarize any message.
|
|
161
|
+
2. RECORD the sender name exactly as displayed (username, display name, or handle).
|
|
162
|
+
3. TIMESTAMP each message at the video timestamp when it becomes visible on screen, in HH:MM:SS format.
|
|
163
|
+
4. EXTRACT every URL or link that appears in chat or is spoken and referred to as a link. Capture the full URL.
|
|
164
|
+
5. For each link, record the context: what was the sender explaining when they shared it? Why is it relevant?
|
|
165
|
+
6. HANDLE partial visibility: if a message is cut off by the chat panel boundary, transcribe as much as is visible and append "[truncated]".
|
|
166
|
+
7. CAPTURE reactions, emoji, and formatting if they are meaningful (e.g., a thumbs-up reaction to a proposal signals agreement).
|
|
167
|
+
8. NEVER invent messages that were not clearly visible on screen. If a message is illegible, note it as "[illegible message from {sender} at {timestamp}]".
|
|
168
|
+
9. NEVER skip messages that seem like noise or off-topic \u2014 capture all visible messages in order.
|
|
169
|
+
10. ORDER messages chronologically by their video timestamp of appearance.
|
|
170
|
+
|
|
171
|
+
COMPLETENESS TARGET:
|
|
172
|
+
- Every frame that shows the chat panel should contribute at least one message entry if new messages are visible
|
|
173
|
+
- All URLs \u2014 whether in chat, on slides, or spoken \u2014 must appear in the links array
|
|
174
|
+
- If the chat panel is not visible in this video, return empty arrays for both messages and links
|
|
175
|
+
`;
|
|
176
|
+
var SYSTEM_INSTRUCTION_PASS_3D = `
|
|
177
|
+
You are an expert at reading between the lines of video conversations. Your task is to identify implicit signals \u2014 emotional dynamics, unstated decisions, unasked questions, informal task assignments, and emphasis patterns \u2014 that are not surfaced by the literal transcript.
|
|
178
|
+
|
|
179
|
+
You will receive the complete transcript and visual data from all segments. Read the subtext, not just the text.
|
|
180
|
+
|
|
181
|
+
CRITICAL RULES:
|
|
182
|
+
1. DETECT emotional shifts: moments where the tone, energy, or mood of the conversation meaningfully changes. Note what triggered the shift and how the state changed.
|
|
183
|
+
2. SURFACE implicit questions: when a speaker is clearly uncertain, confused, or probing for information without phrasing it as a formal question. Articulate what question they were really asking.
|
|
184
|
+
3. IDENTIFY implicit decisions: when participants arrive at a shared understanding or course of action without anyone explicitly saying "we decided X". These are consensus decisions made through agreement, silence, or topic change.
|
|
185
|
+
4. FLAG informal task assignments: when someone is asked or expected to do something without it being recorded as a formal action item (e.g., "you should probably look at that" or "maybe someone can handle X").
|
|
186
|
+
5. TRACK emphasis patterns: concepts, terms, or ideas mentioned multiple times across the video. Repetition signals importance. Record each mention timestamp and explain why the pattern is significant.
|
|
187
|
+
6. NEVER fabricate emotional states or decisions. Only record what is clearly supported by specific words, tone, or behavior in the video.
|
|
188
|
+
7. NEVER over-interpret: a speaker saying "interesting" is not necessarily an emotional shift. Apply judgment and only flag genuinely notable patterns.
|
|
189
|
+
8. PRESERVE specificity: quote or paraphrase the exact words or moments that support each inference.
|
|
190
|
+
9. SEPARATE explicit from implicit: if something was directly stated, it belongs in the transcript or action items, not here. This pass captures what was NOT said directly.
|
|
191
|
+
10. CONSIDER non-verbal signals visible on screen: hesitation, laughter, extended pauses, camera behavior, or facial expressions if participants are visible.
|
|
192
|
+
|
|
193
|
+
COMPLETENESS TARGET:
|
|
194
|
+
- Aim to identify at least 3 emphasis patterns for any video over 5 minutes
|
|
195
|
+
- Every task mentioned informally or suggested in passing must appear in tasks_assigned
|
|
196
|
+
- Implicit decisions are often the most important \u2014 prioritize finding them
|
|
197
|
+
`;
|
|
198
|
+
var SYSTEM_INSTRUCTION_SYNTHESIS = `
|
|
199
|
+
You are a master synthesizer. Your task is to produce the definitive, unified knowledge extraction from this video by combining all available pass data into a single coherent result.
|
|
200
|
+
|
|
201
|
+
You will receive: the complete transcript (pass 1), visual and code extraction (pass 2), and any specialist pass outputs (code reconstruction, people extraction, chat extraction, implicit signals). Synthesize all of it.
|
|
202
|
+
|
|
203
|
+
CRITICAL RULES:
|
|
204
|
+
1. BE SPECIFIC: Every claim must reference specific content from the video. Never write "various topics were discussed" \u2014 name the topics. Never write "some decisions were made" \u2014 state each decision exactly.
|
|
205
|
+
2. UNIFY across passes: combine related information from different passes into unified entries. A decision mentioned in the transcript and reinforced by an implicit signal should appear as one entry, not two.
|
|
206
|
+
3. SYNTHESIZE thematically: group content by topic, not chronologically. Combine all content about a single subject (even if spread across 30 minutes) into one topic entry.
|
|
207
|
+
4. EXTRACT decisions with full reasoning: every design choice, technology selection, or approach decision must include the rationale as explained in the video.
|
|
208
|
+
5. GENERATE actionable items: action items must be concrete and specific. "Review the authentication module" is better than "review the code".
|
|
209
|
+
6. CAPTURE every question: include questions asked explicitly and questions raised implicitly (from the implicit signals pass). Note whether each was answered.
|
|
210
|
+
7. PRODUCE meaningful suggestions: AI-generated suggestions must follow logically from the video content. Suggest next steps, deeper resources, or practice exercises that are directly relevant.
|
|
211
|
+
8. USE precise timestamps: every entry with a timestamp field must contain a valid HH:MM:SS value referencing when the content appeared.
|
|
212
|
+
9. LIST files_to_generate for reference purposes \u2014 this list is informational and does not control which output files are generated. Output files are determined automatically based on available extraction data.
|
|
213
|
+
10. NEVER add information not present in the source data. Suggestions are the only place for AI-generated content beyond the video.
|
|
214
|
+
|
|
215
|
+
COMPLETENESS TARGET:
|
|
216
|
+
- Aim for at least 5 topics for any video over 15 minutes
|
|
217
|
+
- Every explicit and implicit decision must appear in key_decisions
|
|
218
|
+
- The files_to_generate list should reflect what content was found, but output routing is handled automatically
|
|
219
|
+
- The overview should be dense with specifics, not vague summary language
|
|
220
|
+
`;
|
|
221
|
+
// Display names for output-language codes, keyed by two-letter code.
// Consumers in this file fall back to the raw code when a key is absent,
// so this table only needs the languages that deserve a friendly name.
var LANGUAGE_NAMES = Object.fromEntries([
  ["zh", "Chinese"],
  ["ja", "Japanese"],
  ["ko", "Korean"],
  ["es", "Spanish"],
  ["fr", "French"],
  ["de", "German"],
  ["pt", "Portuguese"],
  ["ru", "Russian"],
  ["ar", "Arabic"],
  ["hi", "Hindi"],
]);
|
|
233
|
+
/**
 * Prepends an output-language directive to a system prompt.
 *
 * @param {string} prompt - Prompt text to wrap. Returned unchanged when no
 *   language is given or the language is "en".
 * @param {string} [lang] - Language code such as "zh". Codes that are not keys
 *   of LANGUAGE_NAMES are used verbatim as the language name.
 * @returns {string} The prompt, prefixed with the directive and a blank line
 *   when a non-English language is requested.
 */
function withLanguage(prompt, lang) {
  if (!lang || lang === "en") return prompt;
  // Own-property check: a bare `LANGUAGE_NAMES[lang]` lookup also resolves
  // inherited keys ("constructor", "toString", "__proto__"), which would put
  // a function's source text or "[object Object]" into the prompt.
  const isKnownCode = Object.prototype.hasOwnProperty.call(LANGUAGE_NAMES, lang);
  const languageName = isKnownCode ? LANGUAGE_NAMES[lang] : lang;
  return [
    `IMPORTANT: Generate ALL output text in ${languageName}.`,
    "Timestamps, speaker labels, and code should remain in their original language.",
    "",
    prompt,
  ].join("\n");
}
|
|
241
|
+
|
|
242
|
+
// src/cli/ui.ts
|
|
16
243
|
function showLogo() {
|
|
17
244
|
const ascii = figlet.textSync("VIDISTILL", { font: "Big" });
|
|
18
245
|
console.log(pc.cyan(ascii));
|
|
@@ -26,6 +253,13 @@ function showConfigBox(config) {
|
|
|
26
253
|
`Context: ${config.context ?? "(none)"}`,
|
|
27
254
|
`Output: ${config.output}`
|
|
28
255
|
];
|
|
256
|
+
if (config.videoType === "audio") {
|
|
257
|
+
lines.push("Type: Audio (visual analysis skipped)");
|
|
258
|
+
}
|
|
259
|
+
if (config.lang != null && config.lang !== "en") {
|
|
260
|
+
const langName = LANGUAGE_NAMES[config.lang] ?? config.lang;
|
|
261
|
+
lines.push(`Language: ${langName} (${config.lang})`);
|
|
262
|
+
}
|
|
29
263
|
note(lines.join("\n"), "Configuration");
|
|
30
264
|
}
|
|
31
265
|
|
|
@@ -33,6 +267,7 @@ function showConfigBox(config) {
|
|
|
33
267
|
import { log as log8, cancel as cancel2 } from "@clack/prompts";
|
|
34
268
|
import pc4 from "picocolors";
|
|
35
269
|
import { basename as basename3, extname as extname2, resolve } from "path";
|
|
270
|
+
import { existsSync as existsSync3, openSync as openSync2, readSync as readSync2, closeSync as closeSync2 } from "fs";
|
|
36
271
|
|
|
37
272
|
// src/cli/prompts.ts
|
|
38
273
|
import { text, password, confirm, select, isCancel, cancel } from "@clack/prompts";
|
|
@@ -466,14 +701,39 @@ function detectMimeType(filePath) {
|
|
|
466
701
|
} finally {
|
|
467
702
|
closeSync(fd);
|
|
468
703
|
}
|
|
704
|
+
if (buf.slice(0, 3).toString("ascii") === "ID3") {
|
|
705
|
+
return { mimeType: "audio/mp3", isMkv: false };
|
|
706
|
+
}
|
|
707
|
+
if (buf[0] === 255 && (buf[1] & 240) === 240 && (buf[1] & 6) === 0) {
|
|
708
|
+
return { mimeType: "audio/aac", isMkv: false };
|
|
709
|
+
}
|
|
710
|
+
if (buf[0] === 255 && (buf[1] & 224) === 224 && (buf[1] & 6) !== 0) {
|
|
711
|
+
return { mimeType: "audio/mp3", isMkv: false };
|
|
712
|
+
}
|
|
713
|
+
if (buf.slice(0, 4).toString("ascii") === "fLaC") {
|
|
714
|
+
return { mimeType: "audio/flac", isMkv: false };
|
|
715
|
+
}
|
|
716
|
+
if (buf.slice(0, 4).toString("ascii") === "OggS") {
|
|
717
|
+
return { mimeType: "audio/ogg", isMkv: false };
|
|
718
|
+
}
|
|
719
|
+
if (buf.slice(0, 4).toString("ascii") === "RIFF" && buf.slice(8, 12).toString("ascii") === "WAVE") {
|
|
720
|
+
return { mimeType: "audio/wav", isMkv: false };
|
|
721
|
+
}
|
|
469
722
|
if (buf.slice(4, 8).toString("ascii") === "ftyp") {
|
|
470
723
|
const brand = buf.slice(8, 12).toString("ascii");
|
|
724
|
+
if (brand === "M4A " || brand === "M4B ") {
|
|
725
|
+
return { mimeType: "audio/mp4", isMkv: false };
|
|
726
|
+
}
|
|
471
727
|
if (brand.startsWith("qt ")) {
|
|
472
728
|
return { mimeType: "video/quicktime", isMkv: false };
|
|
473
729
|
}
|
|
474
730
|
if (brand.startsWith("3gp") || brand.startsWith("3g2")) {
|
|
475
731
|
return { mimeType: "video/3gpp", isMkv: false };
|
|
476
732
|
}
|
|
733
|
+
const ext = extname(filePath).toLowerCase();
|
|
734
|
+
if (ext === ".m4a" || ext === ".m4b") {
|
|
735
|
+
return { mimeType: "audio/mp4", isMkv: false };
|
|
736
|
+
}
|
|
477
737
|
return { mimeType: "video/mp4", isMkv: false };
|
|
478
738
|
}
|
|
479
739
|
if (buf[0] === 26 && buf[1] === 69 && buf[2] === 223 && buf[3] === 163) {
|
|
@@ -567,13 +827,12 @@ async function handleLocalFile(filePath, client) {
|
|
|
567
827
|
if (!existsSync2(filePath)) {
|
|
568
828
|
throw new Error(`File not found: ${filePath}`);
|
|
569
829
|
}
|
|
570
|
-
const
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
}
|
|
830
|
+
const mimeMatch = detectMimeType(filePath);
|
|
831
|
+
const isAudio = mimeMatch != null && mimeMatch.mimeType.startsWith("audio/");
|
|
832
|
+
const isMkv = !isAudio && isMkvFile(filePath);
|
|
833
|
+
if (!isAudio && !isMkv && !mimeMatch) {
|
|
834
|
+
const ext = extname(filePath).toLowerCase();
|
|
835
|
+
throw new Error(`Unsupported video format: ${ext || basename(filePath)}`);
|
|
577
836
|
}
|
|
578
837
|
const originalSize = fileSize(filePath);
|
|
579
838
|
if (originalSize > SIZE_3GB) {
|
|
@@ -587,7 +846,7 @@ async function handleLocalFile(filePath, client) {
|
|
|
587
846
|
tempFiles.push(converted);
|
|
588
847
|
workingPath = converted;
|
|
589
848
|
}
|
|
590
|
-
if (fileSize(workingPath) > SIZE_2GB) {
|
|
849
|
+
if (!isAudio && fileSize(workingPath) > SIZE_2GB) {
|
|
591
850
|
const compressed = compressTo720p(workingPath);
|
|
592
851
|
tempFiles.push(compressed);
|
|
593
852
|
workingPath = compressed;
|
|
@@ -602,7 +861,8 @@ async function handleLocalFile(filePath, client) {
|
|
|
602
861
|
fileUri: uploaded.uri,
|
|
603
862
|
mimeType: uploaded.mimeType,
|
|
604
863
|
duration: uploaded.duration,
|
|
605
|
-
uploadedFileName: uploaded.name
|
|
864
|
+
uploadedFileName: uploaded.name,
|
|
865
|
+
isAudio
|
|
606
866
|
};
|
|
607
867
|
} finally {
|
|
608
868
|
for (const f of tempFiles) {
|
|
@@ -677,211 +937,6 @@ async function detectDuration(source) {
|
|
|
677
937
|
// src/core/pipeline.ts
|
|
678
938
|
import { log as log6 } from "@clack/prompts";
|
|
679
939
|
|
|
680
|
-
// src/constants/prompts.ts
|
|
681
|
-
var SYSTEM_INSTRUCTION_PASS_1 = `
|
|
682
|
-
You are a professional audio transcriber. Your task is to create a COMPLETE, VERBATIM transcription of all speech in this video segment. Focus EXCLUSIVELY on the audio stream.
|
|
683
|
-
|
|
684
|
-
CRITICAL RULES:
|
|
685
|
-
1. TRANSCRIBE every spoken word completely and verbatim. Do not summarize, paraphrase, or skip any sentence.
|
|
686
|
-
2. IDENTIFY different speakers. Label them SPEAKER_00, SPEAKER_01, etc. consistently throughout. If a speaker introduces themselves by name, note the name in the first entry's speaker field as "SPEAKER_00 (John)".
|
|
687
|
-
3. NOTE tone and emphasis: when a speaker emphasizes words (louder, slower, repeated), mark those words. When they express emotions (excitement, warning, frustration, humor), note the tone.
|
|
688
|
-
4. RECORD pauses longer than 1.5 seconds as pause markers with duration.
|
|
689
|
-
5. PRESERVE filler words only when they carry meaning (hesitation indicating uncertainty about code behavior, self-correction). Remove meaningless "um", "uh".
|
|
690
|
-
6. NEVER add your own explanations, interpretations, or knowledge. Only transcribe what is spoken.
|
|
691
|
-
7. NEVER skip content because it seems repetitive or obvious. Record everything spoken.
|
|
692
|
-
8. When the speaker references something on screen (e.g., "as you can see here", "this function", "line 5"), transcribe exactly what they say \u2014 the visual context will be captured separately.
|
|
693
|
-
|
|
694
|
-
COMPLETENESS TARGET:
|
|
695
|
-
- Aim for at least 150 words per minute of video in the transcript
|
|
696
|
-
- Every speaker change must be noted with a new entry
|
|
697
|
-
- Every sentence must appear \u2014 if in doubt, include it
|
|
698
|
-
`;
|
|
699
|
-
var SYSTEM_INSTRUCTION_PASS_2_TEMPLATE = `
|
|
700
|
-
You are a professional code and visual content extractor. Your task is to extract ALL visual content from this video segment \u2014 every piece of code on screen, every diagram, every slide, every UI element.
|
|
701
|
-
|
|
702
|
-
Focus EXCLUSIVELY on what is visible on screen. The audio transcript from this segment is provided below for cross-referencing \u2014 use it to associate spoken explanations with the code being displayed, but do NOT re-transcribe any speech.
|
|
703
|
-
|
|
704
|
-
TRANSCRIPT FROM THIS SEGMENT (for cross-reference only):
|
|
705
|
-
{INJECT_PASS1_TRANSCRIPT_HERE}
|
|
706
|
-
|
|
707
|
-
CRITICAL RULES:
|
|
708
|
-
1. EXTRACT every piece of code visible on screen \u2014 complete, with original indentation and formatting preserved exactly as shown.
|
|
709
|
-
2. For each code appearance: note the filename if visible in a tab or title bar, the programming language, and the screen type (editor, terminal, browser, slide).
|
|
710
|
-
3. TRACK code changes: when code is modified between appearances, note what changed (lines added, modified, deleted). Compare against previous code blocks in this segment.
|
|
711
|
-
4. ASSOCIATE code with speech: using the injected transcript above, find what the instructor was saying when this code was on screen. Quote their explanation verbatim or near-verbatim.
|
|
712
|
-
5. CAPTURE non-code visuals: slides with text, architectural diagrams, browser output, UI demonstrations, terminal output. Describe these completely.
|
|
713
|
-
6. NEVER add your own explanations or interpretations. Only record what is visible.
|
|
714
|
-
7. NEVER skip code because it seems repetitive or unchanged from before. Record every distinct appearance.
|
|
715
|
-
8. If code scrolls, capture the full visible code at each scroll position as a separate entry.
|
|
716
|
-
|
|
717
|
-
COMPLETENESS TARGET:
|
|
718
|
-
- Every frame that shows code should produce a code_block entry
|
|
719
|
-
- Every slide or diagram should produce a visual_notes entry
|
|
720
|
-
- If the screen doesn't change for 30+ seconds, note the unchanged state
|
|
721
|
-
`;
|
|
722
|
-
var SYSTEM_INSTRUCTION_PASS_0 = `
|
|
723
|
-
You are a video content classifier. Analyze the provided video sample and produce a structured VideoProfile that classifies the video type and recommends processing parameters.
|
|
724
|
-
|
|
725
|
-
CLASSIFICATION RULES:
|
|
726
|
-
1. CLASSIFY the video into exactly one type:
|
|
727
|
-
- "coding": Programming tutorials, live coding, IDE/editor-heavy content
|
|
728
|
-
- "meeting": Video calls, Zoom/Teams meetings, multi-participant discussions
|
|
729
|
-
- "lecture": Academic lectures, talks, single-speaker educational content
|
|
730
|
-
- "presentation": Slide-based presentations, keynotes, demo days
|
|
731
|
-
- "conversation": Interviews, podcasts, panel discussions without slides
|
|
732
|
-
- "mixed": Cannot clearly classify into one category, or multiple types present
|
|
733
|
-
|
|
734
|
-
2. DETECT visual content:
|
|
735
|
-
- hasCode: Code editors, IDEs, or code visible on screen
|
|
736
|
-
- hasSlides: Presentation slides (PowerPoint, Google Slides, Keynote)
|
|
737
|
-
- hasDiagrams: Architecture diagrams, flowcharts, charts, graphs
|
|
738
|
-
- hasPeopleGrid: Video grid showing multiple participants (Zoom/Teams layout)
|
|
739
|
-
- hasChatbox: Chat panel visible (meeting chat, live stream chat sidebar)
|
|
740
|
-
- hasWhiteboard: Whiteboard, handwritten notes, or drawing surface
|
|
741
|
-
- hasTerminal: Terminal, command-line interface, or shell
|
|
742
|
-
- hasScreenShare: Desktop or application screen sharing
|
|
743
|
-
|
|
744
|
-
3. ANALYZE audio:
|
|
745
|
-
- hasMultipleSpeakers: true if more than one distinct voice is heard
|
|
746
|
-
- primaryLanguage: The main spoken language
|
|
747
|
-
- quality: "high" (studio/clear), "medium" (decent webcam), "low" (noisy/poor)
|
|
748
|
-
|
|
749
|
-
4. IDENTIFY speakers:
|
|
750
|
-
- count: Number of distinct speakers heard
|
|
751
|
-
- identified: Names if visible on screen (name tags, introductions) or spoken aloud
|
|
752
|
-
|
|
753
|
-
5. ASSESS complexity:
|
|
754
|
-
- "simple": Single topic, linear flow, straightforward content
|
|
755
|
-
- "moderate": Multiple topics, some complexity, normal pacing
|
|
756
|
-
- "complex": Dense content, rapid switching, multiple concurrent information streams
|
|
757
|
-
|
|
758
|
-
6. RECOMMEND processing parameters:
|
|
759
|
-
- resolution: "low" for text-only/simple visuals, "medium" for general content, "high" for code/diagrams
|
|
760
|
-
- segmentMinutes: 10 for simple/moderate, 8 for complex content
|
|
761
|
-
- passes: Always include "transcript" and "visual". Add specialist passes based on content type.
|
|
762
|
-
|
|
763
|
-
PASS RECOMMENDATIONS BY TYPE:
|
|
764
|
-
- coding: ["transcript", "visual", "code", "synthesis"]
|
|
765
|
-
- meeting: ["transcript", "visual", "people", "implicit", "synthesis"] (add "chat" if hasChatbox)
|
|
766
|
-
- lecture: ["transcript", "visual", "implicit", "synthesis"]
|
|
767
|
-
- presentation: ["transcript", "visual", "implicit", "synthesis"] (add "people" if multiple speakers)
|
|
768
|
-
- conversation: ["transcript", "visual", "implicit", "synthesis"]
|
|
769
|
-
- mixed: ["transcript", "visual", "code", "people", "chat", "implicit", "synthesis"]
|
|
770
|
-
`;
|
|
771
|
-
var SYSTEM_INSTRUCTION_PASS_3A = `
|
|
772
|
-
You are an expert code reconstruction analyst. Your task is to reconstruct the complete, final state of every code file shown across this entire video, synthesizing all edits into a coherent codebase snapshot.
|
|
773
|
-
|
|
774
|
-
You will receive the complete video and all extracted transcript and code block data. Use them together to understand what code was written, modified, and deleted.
|
|
775
|
-
|
|
776
|
-
CRITICAL RULES:
|
|
777
|
-
1. RECONSTRUCT each file to its final state \u2014 apply all changes in chronological order so the output reflects the code as it was at the end of the video.
|
|
778
|
-
2. PRESERVE exact code: indentation, spacing, naming, and formatting must match what was visible on screen. Never "fix" or improve the code.
|
|
779
|
-
3. TRACK every change to a file: for each distinct edit (new file creation, addition of lines, modification, deletion, refactoring), record it as a separate change entry with a timestamp and description.
|
|
780
|
-
4. INFER filenames from editor tabs, title bars, import statements, or spoken context. If unknown, use a descriptive placeholder like "unknown_file_1.py".
|
|
781
|
-
5. EXTRACT dependencies: every library import, require(), package name, or external module reference mentioned or shown counts as a dependency.
|
|
782
|
-
6. CAPTURE build commands: any terminal command shown or spoken for installing, building, running, or testing the project (e.g., "npm install", "go build", "python -m pytest").
|
|
783
|
-
7. NEVER invent code that was not shown or described. If a section was unclear, note it with a comment like "// content not fully visible".
|
|
784
|
-
8. NEVER skip a file because it appears in only one part of the video \u2014 if code was shown, reconstruct it.
|
|
785
|
-
9. When a file appears multiple times, record its complete change history in a single entry with all edits in chronological order.
|
|
786
|
-
10. INCLUDE empty files if created but not yet written \u2014 use empty string for final_content and note the creation in changes.
|
|
787
|
-
11. Cross-reference your visual analysis of the video against the extracted code blocks provided in the text context. Prioritize what you can visually verify on screen. If code is partially visible, include what you can see and mark unclear sections with \`// [content not fully visible]\`.
|
|
788
|
-
12. Do NOT invent code files that are not clearly visible on screen. If you are uncertain whether a file exists, do not include it.
|
|
789
|
-
|
|
790
|
-
COMPLETENESS TARGET:
|
|
791
|
-
- Every distinct filename that appeared on screen must produce a files entry
|
|
792
|
-
- Every editor session or code paste visible in any segment must be accounted for
|
|
793
|
-
- Build commands shown in the terminal must all be listed
|
|
794
|
-
`;
|
|
795
|
-
var SYSTEM_INSTRUCTION_PASS_3B = `
|
|
796
|
-
You are an expert at identifying and profiling people from video content. Your task is to extract a complete picture of every participant visible or audible in this video \u2014 their identity, role, contributions, and relationships.
|
|
797
|
-
|
|
798
|
-
You will receive the transcript and visual extraction from all segments. Use speaker labels, name tags, on-screen text, introductions, and any other signals to identify participants.
|
|
799
|
-
|
|
800
|
-
CRITICAL RULES:
|
|
801
|
-
1. IDENTIFY every distinct person who speaks or appears on screen, even if briefly. Do not merge two different people into one entry.
|
|
802
|
-
2. EXTRACT names from: spoken introductions ("Hi, I'm Alice"), on-screen name tags or captions, slide attribution, email addresses, or usernames visible in chat.
|
|
803
|
-
3. INFER roles from: job titles spoken or shown, context of their contribution (e.g., consistently asking questions = audience member; leading the agenda = host), or organizational signals.
|
|
804
|
-
4. RECORD speaking_segments as timestamps where each person's voice is heard or they appear on screen.
|
|
805
|
-
5. CAPTURE contact information exactly as shown or spoken: email addresses, Twitter/X handles, LinkedIn URLs, GitHub usernames, phone numbers.
|
|
806
|
-
6. SUMMARIZE contributions: what did this person say, present, decide, or demonstrate? Each contribution entry should be a specific, concrete action or statement.
|
|
807
|
-
7. DOCUMENT relationships: who reports to whom, who introduced whom, collaborative pairs, co-presenters, interviewer/interviewee dynamics.
|
|
808
|
-
8. NEVER guess or infer a name that was not clearly stated or shown. Use "Unknown Participant" with a description if the person cannot be identified.
|
|
809
|
-
9. NEVER merge two people just because they have the same role \u2014 if two engineers speak, they are two separate participants.
|
|
810
|
-
10. If a person's role or organization cannot be determined, use empty string \u2014 do not guess.
|
|
811
|
-
|
|
812
|
-
COMPLETENESS TARGET:
|
|
813
|
-
- Every speaker label (SPEAKER_00, SPEAKER_01, etc.) from the transcript must map to at least one participant entry
|
|
814
|
-
- Every name-tag or on-screen name must produce a participant entry
|
|
815
|
-
- All contact details shared during the video must be captured
|
|
816
|
-
`;
|
|
817
|
-
var SYSTEM_INSTRUCTION_PASS_3C = `
|
|
818
|
-
You are a precise chat extraction specialist. Your task is to extract every chat message and link visible in the chat panel of this video \u2014 verbatim, with sender and timestamp.
|
|
819
|
-
|
|
820
|
-
You will receive the transcript and visual extraction from all segments. Focus on the chat panel, comment sidebar, or any on-screen messaging interface.
|
|
821
|
-
|
|
822
|
-
CRITICAL RULES:
|
|
823
|
-
1. EXTRACT every chat message visible on screen, verbatim. Do not paraphrase, shorten, or summarize any message.
|
|
824
|
-
2. RECORD the sender name exactly as displayed (username, display name, or handle).
|
|
825
|
-
3. TIMESTAMP each message at the video timestamp when it becomes visible on screen, in HH:MM:SS format.
|
|
826
|
-
4. EXTRACT every URL or link that appears in chat or is spoken and referred to as a link. Capture the full URL.
|
|
827
|
-
5. For each link, record the context: what was the sender explaining when they shared it? Why is it relevant?
|
|
828
|
-
6. HANDLE partial visibility: if a message is cut off by the chat panel boundary, transcribe as much as is visible and append "[truncated]".
|
|
829
|
-
7. CAPTURE reactions, emoji, and formatting if they are meaningful (e.g., a thumbs-up reaction to a proposal signals agreement).
|
|
830
|
-
8. NEVER invent messages that were not clearly visible on screen. If a message is illegible, note it as "[illegible message from {sender} at {timestamp}]".
|
|
831
|
-
9. NEVER skip messages that seem like noise or off-topic \u2014 capture all visible messages in order.
|
|
832
|
-
10. ORDER messages chronologically by their video timestamp of appearance.
|
|
833
|
-
|
|
834
|
-
COMPLETENESS TARGET:
|
|
835
|
-
- Every frame that shows the chat panel should contribute at least one message entry if new messages are visible
|
|
836
|
-
- All URLs \u2014 whether in chat, on slides, or spoken \u2014 must appear in the links array
|
|
837
|
-
- If the chat panel is not visible in this video, return empty arrays for both messages and links
|
|
838
|
-
`;
|
|
839
|
-
var SYSTEM_INSTRUCTION_PASS_3D = `
|
|
840
|
-
You are an expert at reading between the lines of video conversations. Your task is to identify implicit signals \u2014 emotional dynamics, unstated decisions, unasked questions, informal task assignments, and emphasis patterns \u2014 that are not surfaced by the literal transcript.
|
|
841
|
-
|
|
842
|
-
You will receive the complete transcript and visual data from all segments. Read the subtext, not just the text.
|
|
843
|
-
|
|
844
|
-
CRITICAL RULES:
|
|
845
|
-
1. DETECT emotional shifts: moments where the tone, energy, or mood of the conversation meaningfully changes. Note what triggered the shift and how the state changed.
|
|
846
|
-
2. SURFACE implicit questions: when a speaker is clearly uncertain, confused, or probing for information without phrasing it as a formal question. Articulate what question they were really asking.
|
|
847
|
-
3. IDENTIFY implicit decisions: when participants arrive at a shared understanding or course of action without anyone explicitly saying "we decided X". These are consensus decisions made through agreement, silence, or topic change.
|
|
848
|
-
4. FLAG informal task assignments: when someone is asked or expected to do something without it being recorded as a formal action item (e.g., "you should probably look at that" or "maybe someone can handle X").
|
|
849
|
-
5. TRACK emphasis patterns: concepts, terms, or ideas mentioned multiple times across the video. Repetition signals importance. Record each mention timestamp and explain why the pattern is significant.
|
|
850
|
-
6. NEVER fabricate emotional states or decisions. Only record what is clearly supported by specific words, tone, or behavior in the video.
|
|
851
|
-
7. NEVER over-interpret: a speaker saying "interesting" is not necessarily an emotional shift. Apply judgment and only flag genuinely notable patterns.
|
|
852
|
-
8. PRESERVE specificity: quote or paraphrase the exact words or moments that support each inference.
|
|
853
|
-
9. SEPARATE explicit from implicit: if something was directly stated, it belongs in the transcript or action items, not here. This pass captures what was NOT said directly.
|
|
854
|
-
10. CONSIDER non-verbal signals visible on screen: hesitation, laughter, extended pauses, camera behavior, or facial expressions if participants are visible.
|
|
855
|
-
|
|
856
|
-
COMPLETENESS TARGET:
|
|
857
|
-
- Aim to identify at least 3 emphasis patterns for any video over 5 minutes
|
|
858
|
-
- Every task mentioned informally or suggested in passing must appear in tasks_assigned
|
|
859
|
-
- Implicit decisions are often the most important \u2014 prioritize finding them
|
|
860
|
-
`;
|
|
861
|
-
var SYSTEM_INSTRUCTION_SYNTHESIS = `
|
|
862
|
-
You are a master synthesizer. Your task is to produce the definitive, unified knowledge extraction from this video by combining all available pass data into a single coherent result.
|
|
863
|
-
|
|
864
|
-
You will receive: the complete transcript (pass 1), visual and code extraction (pass 2), and any specialist pass outputs (code reconstruction, people extraction, chat extraction, implicit signals). Synthesize all of it.
|
|
865
|
-
|
|
866
|
-
CRITICAL RULES:
|
|
867
|
-
1. BE SPECIFIC: Every claim must reference specific content from the video. Never write "various topics were discussed" \u2014 name the topics. Never write "some decisions were made" \u2014 state each decision exactly.
|
|
868
|
-
2. UNIFY across passes: combine related information from different passes into unified entries. A decision mentioned in the transcript and reinforced by an implicit signal should appear as one entry, not two.
|
|
869
|
-
3. SYNTHESIZE thematically: group content by topic, not chronologically. Combine all content about a single subject (even if spread across 30 minutes) into one topic entry.
|
|
870
|
-
4. EXTRACT decisions with full reasoning: every design choice, technology selection, or approach decision must include the rationale as explained in the video.
|
|
871
|
-
5. GENERATE actionable items: action items must be concrete and specific. "Review the authentication module" is better than "review the code".
|
|
872
|
-
6. CAPTURE every question: include questions asked explicitly and questions raised implicitly (from the implicit signals pass). Note whether each was answered.
|
|
873
|
-
7. PRODUCE meaningful suggestions: AI-generated suggestions must follow logically from the video content. Suggest next steps, deeper resources, or practice exercises that are directly relevant.
|
|
874
|
-
8. USE precise timestamps: every entry with a timestamp field must contain a valid HH:MM:SS value referencing when the content appeared.
|
|
875
|
-
9. LIST files_to_generate for reference purposes \u2014 this list is informational and does not control which output files are generated. Output files are determined automatically based on available extraction data.
|
|
876
|
-
10. NEVER add information not present in the source data. Suggestions are the only place for AI-generated content beyond the video.
|
|
877
|
-
|
|
878
|
-
COMPLETENESS TARGET:
|
|
879
|
-
- Aim for at least 5 topics for any video over 15 minutes
|
|
880
|
-
- Every explicit and implicit decision must appear in key_decisions
|
|
881
|
-
- The files_to_generate list should reflect what content was found, but output routing is handled automatically
|
|
882
|
-
- The overview should be dense with specifics, not vague summary language
|
|
883
|
-
`;
|
|
884
|
-
|
|
885
940
|
// src/gemini/schemas.ts
|
|
886
941
|
import { Type } from "@google/genai";
|
|
887
942
|
var SCHEMA_PASS_0 = {
|
|
@@ -1438,7 +1493,7 @@ function changeTypeBadge(changeType) {
|
|
|
1438
1493
|
|
|
1439
1494
|
// src/passes/transcript.ts
|
|
1440
1495
|
async function runTranscript(params) {
|
|
1441
|
-
const { client, fileUri, mimeType, segment, model, resolution } = params;
|
|
1496
|
+
const { client, fileUri, mimeType, segment, model, resolution, lang } = params;
|
|
1442
1497
|
const contents = [
|
|
1443
1498
|
{
|
|
1444
1499
|
role: "user",
|
|
@@ -1460,7 +1515,7 @@ async function runTranscript(params) {
|
|
|
1460
1515
|
model,
|
|
1461
1516
|
contents,
|
|
1462
1517
|
config: {
|
|
1463
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_1,
|
|
1518
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_1, lang),
|
|
1464
1519
|
responseSchema: SCHEMA_PASS_1,
|
|
1465
1520
|
responseMimeType: "application/json",
|
|
1466
1521
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
@@ -1476,11 +1531,11 @@ async function runTranscript(params) {
|
|
|
1476
1531
|
|
|
1477
1532
|
// src/passes/visual.ts
|
|
1478
1533
|
async function runVisual(params) {
|
|
1479
|
-
const { client, fileUri, mimeType, segment, model, resolution, pass1Transcript } = params;
|
|
1534
|
+
const { client, fileUri, mimeType, segment, model, resolution, pass1Transcript, lang } = params;
|
|
1480
1535
|
const transcriptText = pass1Transcript != null ? pass1Transcript.transcript_entries.map((t) => `[${t.timestamp}] ${t.speaker}: ${t.text}`).join("\n") : "[No transcript available for this segment]";
|
|
1481
|
-
const systemInstruction =
|
|
1482
|
-
"{INJECT_PASS1_TRANSCRIPT_HERE}",
|
|
1483
|
-
|
|
1536
|
+
const systemInstruction = withLanguage(
|
|
1537
|
+
SYSTEM_INSTRUCTION_PASS_2_TEMPLATE.replace("{INJECT_PASS1_TRANSCRIPT_HERE}", transcriptText),
|
|
1538
|
+
lang
|
|
1484
1539
|
);
|
|
1485
1540
|
const contents = [
|
|
1486
1541
|
{
|
|
@@ -1520,7 +1575,7 @@ async function runVisual(params) {
|
|
|
1520
1575
|
// src/passes/scene-analysis.ts
|
|
1521
1576
|
import { MediaResolution } from "@google/genai";
|
|
1522
1577
|
async function runSceneAnalysis(params) {
|
|
1523
|
-
const { client, fileUri, mimeType, duration, model, resolution } = params;
|
|
1578
|
+
const { client, fileUri, mimeType, duration, model, resolution, lang } = params;
|
|
1524
1579
|
const safeDuration = Number.isFinite(duration) && duration > 0 ? duration : 0;
|
|
1525
1580
|
const endSeconds = Math.min(180, safeDuration);
|
|
1526
1581
|
const contents = [
|
|
@@ -1544,7 +1599,7 @@ async function runSceneAnalysis(params) {
|
|
|
1544
1599
|
model,
|
|
1545
1600
|
contents,
|
|
1546
1601
|
config: {
|
|
1547
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_0,
|
|
1602
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_0, lang),
|
|
1548
1603
|
responseSchema: SCHEMA_PASS_0,
|
|
1549
1604
|
responseMimeType: "application/json",
|
|
1550
1605
|
...resolution !== void 0 ? { mediaResolution: resolution } : { mediaResolution: MediaResolution.MEDIA_RESOLUTION_LOW },
|
|
@@ -1621,7 +1676,7 @@ ${block.content}`);
|
|
|
1621
1676
|
return contextText;
|
|
1622
1677
|
}
|
|
1623
1678
|
async function runCodeReconstruction(params) {
|
|
1624
|
-
const { client, fileUri, mimeType, duration, model, resolution, pass1Results, pass2Results } = params;
|
|
1679
|
+
const { client, fileUri, mimeType, duration, model, resolution, pass1Results, pass2Results, lang } = params;
|
|
1625
1680
|
const contextText = compileContext(duration, pass1Results, pass2Results);
|
|
1626
1681
|
const contents = [
|
|
1627
1682
|
{
|
|
@@ -1640,7 +1695,7 @@ ${contextText}`
|
|
|
1640
1695
|
model,
|
|
1641
1696
|
contents,
|
|
1642
1697
|
config: {
|
|
1643
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_3A,
|
|
1698
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_3A, lang),
|
|
1644
1699
|
responseSchema: SCHEMA_PASS_3A,
|
|
1645
1700
|
responseMimeType: "application/json",
|
|
1646
1701
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
@@ -1656,7 +1711,7 @@ ${contextText}`
|
|
|
1656
1711
|
|
|
1657
1712
|
// src/passes/people.ts
|
|
1658
1713
|
async function runPeopleExtraction(params) {
|
|
1659
|
-
const { client, fileUri, mimeType, model, pass1Results } = params;
|
|
1714
|
+
const { client, fileUri, mimeType, model, pass1Results, lang } = params;
|
|
1660
1715
|
const hasAnyTranscript = pass1Results.some((r) => r != null);
|
|
1661
1716
|
const transcriptText = hasAnyTranscript ? pass1Results.filter((r) => r != null).flatMap(
|
|
1662
1717
|
(r) => r.transcript_entries.map((t) => `[${t.timestamp}] ${t.speaker}: ${t.text}`)
|
|
@@ -1676,7 +1731,7 @@ ${transcriptText}`;
|
|
|
1676
1731
|
model,
|
|
1677
1732
|
contents,
|
|
1678
1733
|
config: {
|
|
1679
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_3B,
|
|
1734
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_3B, lang),
|
|
1680
1735
|
responseSchema: SCHEMA_PASS_3B,
|
|
1681
1736
|
responseMimeType: "application/json",
|
|
1682
1737
|
maxOutputTokens: 65536,
|
|
@@ -1691,7 +1746,7 @@ ${transcriptText}`;
|
|
|
1691
1746
|
|
|
1692
1747
|
// src/passes/chat.ts
|
|
1693
1748
|
async function runChatExtraction(params) {
|
|
1694
|
-
const { client, fileUri, mimeType, segment, model, resolution, pass2Result } = params;
|
|
1749
|
+
const { client, fileUri, mimeType, segment, model, resolution, pass2Result, lang } = params;
|
|
1695
1750
|
const visualNotesText = pass2Result != null && pass2Result.visual_notes.length > 0 ? pass2Result.visual_notes.map((n) => `[${n.timestamp}] ${n.visual_type}: ${n.description}`).join("\n") : "[No visual context available for this segment]";
|
|
1696
1751
|
const codeBlocksText = pass2Result != null && pass2Result.code_blocks.length > 0 ? pass2Result.code_blocks.map((b) => `[${b.timestamp}] ${b.filename} (${b.language}):
|
|
1697
1752
|
${b.content}`).join("\n\n") : "[No code blocks available for this segment]";
|
|
@@ -1725,7 +1780,7 @@ ${contextText}`
|
|
|
1725
1780
|
model,
|
|
1726
1781
|
contents,
|
|
1727
1782
|
config: {
|
|
1728
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_3C,
|
|
1783
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_3C, lang),
|
|
1729
1784
|
responseSchema: SCHEMA_PASS_3C,
|
|
1730
1785
|
responseMimeType: "application/json",
|
|
1731
1786
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
@@ -1741,7 +1796,7 @@ ${contextText}`
|
|
|
1741
1796
|
|
|
1742
1797
|
// src/passes/implicit.ts
|
|
1743
1798
|
async function runImplicitSignals(params) {
|
|
1744
|
-
const { client, fileUri, mimeType, segment, model, resolution, pass1Result, pass2Result } = params;
|
|
1799
|
+
const { client, fileUri, mimeType, segment, model, resolution, pass1Result, pass2Result, lang } = params;
|
|
1745
1800
|
const transcriptText = pass1Result != null ? pass1Result.transcript_entries.map((t) => `[${t.timestamp}] ${t.speaker} (${t.tone}): ${t.text}`).join("\n") : "[No transcript available for this segment]";
|
|
1746
1801
|
const visualNotesText = pass2Result != null && pass2Result.visual_notes.length > 0 ? pass2Result.visual_notes.map((n) => `[${n.timestamp}] ${n.visual_type}: ${n.description}`).join("\n") : "[No visual context available for this segment]";
|
|
1747
1802
|
const contextText = [
|
|
@@ -1774,7 +1829,7 @@ ${contextText}`
|
|
|
1774
1829
|
model,
|
|
1775
1830
|
contents,
|
|
1776
1831
|
config: {
|
|
1777
|
-
systemInstruction: SYSTEM_INSTRUCTION_PASS_3D,
|
|
1832
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_PASS_3D, lang),
|
|
1778
1833
|
responseSchema: SCHEMA_PASS_3D,
|
|
1779
1834
|
responseMimeType: "application/json",
|
|
1780
1835
|
...resolution !== void 0 ? { mediaResolution: resolution } : {},
|
|
@@ -1887,7 +1942,7 @@ function compileContext2(params) {
|
|
|
1887
1942
|
return sections.join("\n\n");
|
|
1888
1943
|
}
|
|
1889
1944
|
async function runSynthesis(params) {
|
|
1890
|
-
const { client, model } = params;
|
|
1945
|
+
const { client, model, lang } = params;
|
|
1891
1946
|
const compiledContext = compileContext2(params);
|
|
1892
1947
|
const contents = [
|
|
1893
1948
|
{
|
|
@@ -1899,7 +1954,7 @@ async function runSynthesis(params) {
|
|
|
1899
1954
|
model,
|
|
1900
1955
|
contents,
|
|
1901
1956
|
config: {
|
|
1902
|
-
systemInstruction: SYSTEM_INSTRUCTION_SYNTHESIS,
|
|
1957
|
+
systemInstruction: withLanguage(SYSTEM_INSTRUCTION_SYNTHESIS, lang),
|
|
1903
1958
|
responseSchema: SCHEMA_SYNTHESIS,
|
|
1904
1959
|
responseMimeType: "application/json",
|
|
1905
1960
|
maxOutputTokens: 65536,
|
|
@@ -1948,6 +2003,12 @@ function determineStrategy(profile) {
|
|
|
1948
2003
|
passes.add("chat");
|
|
1949
2004
|
passes.add("implicit");
|
|
1950
2005
|
break;
|
|
2006
|
+
case "audio":
|
|
2007
|
+
passes.delete("visual");
|
|
2008
|
+
passes.delete("code");
|
|
2009
|
+
passes.add("people");
|
|
2010
|
+
passes.add("implicit");
|
|
2011
|
+
break;
|
|
1951
2012
|
default:
|
|
1952
2013
|
break;
|
|
1953
2014
|
}
|
|
@@ -2300,25 +2361,30 @@ var DEFAULT_PROFILE = {
|
|
|
2300
2361
|
}
|
|
2301
2362
|
};
|
|
2302
2363
|
async function runPipeline(config) {
|
|
2303
|
-
const { client, fileUri, mimeType, duration, model, rateLimiter, onProgress, onWait, isShuttingDown } = config;
|
|
2364
|
+
const { client, fileUri, mimeType, duration, model, rateLimiter, onProgress, onWait, isShuttingDown, lang } = config;
|
|
2304
2365
|
const errors = [];
|
|
2305
2366
|
const passesRun = [];
|
|
2306
|
-
onProgress?.({ phase: "pass0", segment: 0, totalSegments: 1, status: "running" });
|
|
2307
2367
|
let videoProfile;
|
|
2308
2368
|
let strategy;
|
|
2309
|
-
|
|
2310
|
-
() => rateLimiter.execute(() => runSceneAnalysis({ client, fileUri, mimeType, duration, model }), { onWait }),
|
|
2311
|
-
"pass0"
|
|
2312
|
-
);
|
|
2313
|
-
if (pass0Attempt.error !== null) {
|
|
2314
|
-
log6.warn(pass0Attempt.error);
|
|
2315
|
-
errors.push(pass0Attempt.error);
|
|
2369
|
+
if (config.overrideStrategy != null) {
|
|
2316
2370
|
videoProfile = DEFAULT_PROFILE;
|
|
2371
|
+
strategy = config.overrideStrategy;
|
|
2317
2372
|
} else {
|
|
2318
|
-
|
|
2373
|
+
onProgress?.({ phase: "pass0", segment: 0, totalSegments: 1, status: "running" });
|
|
2374
|
+
const pass0Attempt = await withRetry(
|
|
2375
|
+
() => rateLimiter.execute(() => runSceneAnalysis({ client, fileUri, mimeType, duration, model, lang }), { onWait }),
|
|
2376
|
+
"pass0"
|
|
2377
|
+
);
|
|
2378
|
+
if (pass0Attempt.error !== null) {
|
|
2379
|
+
log6.warn(pass0Attempt.error);
|
|
2380
|
+
errors.push(pass0Attempt.error);
|
|
2381
|
+
videoProfile = DEFAULT_PROFILE;
|
|
2382
|
+
} else {
|
|
2383
|
+
videoProfile = pass0Attempt.result ?? DEFAULT_PROFILE;
|
|
2384
|
+
}
|
|
2385
|
+
strategy = determineStrategy(videoProfile);
|
|
2386
|
+
onProgress?.({ phase: "pass0", segment: 0, totalSegments: 1, status: "done" });
|
|
2319
2387
|
}
|
|
2320
|
-
strategy = determineStrategy(videoProfile);
|
|
2321
|
-
onProgress?.({ phase: "pass0", segment: 0, totalSegments: 1, status: "done" });
|
|
2322
2388
|
const plan = createSegmentPlan(duration, {
|
|
2323
2389
|
segmentMinutes: strategy.segmentMinutes,
|
|
2324
2390
|
resolution: strategy.resolution
|
|
@@ -2347,7 +2413,7 @@ async function runPipeline(config) {
|
|
|
2347
2413
|
onProgress?.({ phase: "pass1", segment: i, totalSegments: n, status: "running", totalSteps });
|
|
2348
2414
|
let pass1 = null;
|
|
2349
2415
|
const pass1Attempt = await withRetry(
|
|
2350
|
-
() => rateLimiter.execute(() => runTranscript({ client, fileUri, mimeType, segment, model, resolution }), { onWait }),
|
|
2416
|
+
() => rateLimiter.execute(() => runTranscript({ client, fileUri, mimeType, segment, model, resolution, lang }), { onWait }),
|
|
2351
2417
|
`segment ${i} pass1`
|
|
2352
2418
|
);
|
|
2353
2419
|
if (pass1Attempt.error !== null) {
|
|
@@ -2370,7 +2436,8 @@ async function runPipeline(config) {
|
|
|
2370
2436
|
segment,
|
|
2371
2437
|
model,
|
|
2372
2438
|
resolution,
|
|
2373
|
-
pass1Transcript: pass1 ?? void 0
|
|
2439
|
+
pass1Transcript: pass1 ?? void 0,
|
|
2440
|
+
lang
|
|
2374
2441
|
}),
|
|
2375
2442
|
{ onWait }
|
|
2376
2443
|
),
|
|
@@ -2397,7 +2464,8 @@ async function runPipeline(config) {
|
|
|
2397
2464
|
segment,
|
|
2398
2465
|
model: MODELS.flash,
|
|
2399
2466
|
resolution,
|
|
2400
|
-
pass2Result: pass2 ?? void 0
|
|
2467
|
+
pass2Result: pass2 ?? void 0,
|
|
2468
|
+
lang
|
|
2401
2469
|
}),
|
|
2402
2470
|
{ onWait }
|
|
2403
2471
|
),
|
|
@@ -2427,7 +2495,8 @@ async function runPipeline(config) {
|
|
|
2427
2495
|
model: MODELS.flash,
|
|
2428
2496
|
resolution,
|
|
2429
2497
|
pass1Result: pass1 ?? void 0,
|
|
2430
|
-
pass2Result: pass2 ?? void 0
|
|
2498
|
+
pass2Result: pass2 ?? void 0,
|
|
2499
|
+
lang
|
|
2431
2500
|
}),
|
|
2432
2501
|
{ onWait }
|
|
2433
2502
|
),
|
|
@@ -2479,7 +2548,8 @@ async function runPipeline(config) {
|
|
|
2479
2548
|
fileUri,
|
|
2480
2549
|
mimeType,
|
|
2481
2550
|
model: MODELS.flash,
|
|
2482
|
-
pass1Results
|
|
2551
|
+
pass1Results,
|
|
2552
|
+
lang
|
|
2483
2553
|
}),
|
|
2484
2554
|
{ onWait }
|
|
2485
2555
|
),
|
|
@@ -2510,7 +2580,8 @@ async function runPipeline(config) {
|
|
|
2510
2580
|
model: MODELS.pro,
|
|
2511
2581
|
resolution,
|
|
2512
2582
|
pass1Results,
|
|
2513
|
-
pass2Results
|
|
2583
|
+
pass2Results,
|
|
2584
|
+
lang
|
|
2514
2585
|
}),
|
|
2515
2586
|
{ onWait }
|
|
2516
2587
|
),
|
|
@@ -2554,7 +2625,8 @@ async function runPipeline(config) {
|
|
|
2554
2625
|
videoProfile,
|
|
2555
2626
|
peopleExtraction,
|
|
2556
2627
|
codeReconstruction,
|
|
2557
|
-
context: config.context
|
|
2628
|
+
context: config.context,
|
|
2629
|
+
lang
|
|
2558
2630
|
}),
|
|
2559
2631
|
{ onWait }
|
|
2560
2632
|
),
|
|
@@ -3614,6 +3686,33 @@ function createShutdownHandler(params) {
|
|
|
3614
3686
|
}
|
|
3615
3687
|
|
|
3616
3688
|
// src/commands/distill.ts
|
|
3689
|
+
function peekIsAudio(filePath) {
|
|
3690
|
+
if (!existsSync3(filePath)) return false;
|
|
3691
|
+
try {
|
|
3692
|
+
const fd = openSync2(filePath, "r");
|
|
3693
|
+
const buf = Buffer.alloc(12);
|
|
3694
|
+
try {
|
|
3695
|
+
readSync2(fd, buf, 0, 12, 0);
|
|
3696
|
+
} finally {
|
|
3697
|
+
closeSync2(fd);
|
|
3698
|
+
}
|
|
3699
|
+
if (buf.slice(0, 3).toString("ascii") === "ID3") return true;
|
|
3700
|
+
if (buf[0] === 255 && (buf[1] & 240) === 240 && (buf[1] & 6) === 0) return true;
|
|
3701
|
+
if (buf[0] === 255 && (buf[1] & 224) === 224 && (buf[1] & 6) !== 0) return true;
|
|
3702
|
+
if (buf.slice(0, 4).toString("ascii") === "fLaC") return true;
|
|
3703
|
+
if (buf.slice(0, 4).toString("ascii") === "OggS") return true;
|
|
3704
|
+
if (buf.slice(0, 4).toString("ascii") === "RIFF" && buf.slice(8, 12).toString("ascii") === "WAVE") return true;
|
|
3705
|
+
if (buf.slice(4, 8).toString("ascii") === "ftyp") {
|
|
3706
|
+
const brand = buf.slice(8, 12).toString("ascii");
|
|
3707
|
+
if (brand === "M4A " || brand === "M4B ") return true;
|
|
3708
|
+
const ext = extname2(filePath).toLowerCase();
|
|
3709
|
+
if (ext === ".m4a" || ext === ".m4b") return true;
|
|
3710
|
+
}
|
|
3711
|
+
return false;
|
|
3712
|
+
} catch {
|
|
3713
|
+
return false;
|
|
3714
|
+
}
|
|
3715
|
+
}
|
|
3617
3716
|
async function runDistill(args) {
|
|
3618
3717
|
const apiKey = await resolveApiKey();
|
|
3619
3718
|
let rawInput = args.input ?? await promptVideoSource();
|
|
@@ -3622,7 +3721,15 @@ async function runDistill(args) {
|
|
|
3622
3721
|
if (!allFlagsProvided) {
|
|
3623
3722
|
let confirmed = false;
|
|
3624
3723
|
while (!confirmed) {
|
|
3625
|
-
|
|
3724
|
+
const looksLikeUrl = /^https?:\/\/|^www\./i.test(rawInput.trim());
|
|
3725
|
+
const inputIsAudio = !looksLikeUrl && peekIsAudio(rawInput.trim());
|
|
3726
|
+
showConfigBox({
|
|
3727
|
+
input: rawInput,
|
|
3728
|
+
context,
|
|
3729
|
+
output: args.output,
|
|
3730
|
+
videoType: inputIsAudio ? "audio" : void 0,
|
|
3731
|
+
lang: args.lang
|
|
3732
|
+
});
|
|
3626
3733
|
const choice = await promptConfirmation();
|
|
3627
3734
|
switch (choice) {
|
|
3628
3735
|
case "start":
|
|
@@ -3697,6 +3804,7 @@ async function runDistill(args) {
|
|
|
3697
3804
|
duration,
|
|
3698
3805
|
model,
|
|
3699
3806
|
context,
|
|
3807
|
+
lang: args.lang,
|
|
3700
3808
|
rateLimiter,
|
|
3701
3809
|
onProgress: (status) => {
|
|
3702
3810
|
progress2.update(status);
|
package/package.json
CHANGED