tuna-agent 0.1.144 → 0.1.146
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Analyze Video Handler
|
|
3
3
|
* Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
|
|
4
|
-
* extracts frames per segment, describes each
|
|
4
|
+
* extracts frames per segment, describes each scene via Gemini 3 Flash vision
|
|
5
|
+
* (per-scene; Phase-1 summary/style/cast still uses gpt-4o).
|
|
5
6
|
*
|
|
6
|
-
* Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
|
|
7
|
+
* Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
|
|
8
|
+
* (Whisper + correction + Phase-1) and GEMINI_API_KEY (per-scene vision) env vars.
|
|
7
9
|
*/
|
|
8
10
|
import { AgentWebSocketClient } from './ws-client.js';
|
|
9
11
|
export interface AnalyzeVideoResult {
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Analyze Video Handler
|
|
3
3
|
* Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
|
|
4
|
-
* extracts frames per segment, describes each
|
|
4
|
+
* extracts frames per segment, describes each scene via Gemini 3 Flash vision
|
|
5
|
+
* (per-scene; Phase-1 summary/style/cast still uses gpt-4o).
|
|
5
6
|
*
|
|
6
|
-
* Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
|
|
7
|
+
* Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
|
|
8
|
+
* (Whisper + correction + Phase-1) and GEMINI_API_KEY (per-scene vision) env vars.
|
|
7
9
|
*/
|
|
8
10
|
import { spawn } from 'child_process';
|
|
9
11
|
import { promises as fs } from 'fs';
|
|
@@ -11,6 +13,12 @@ import path from 'path';
|
|
|
11
13
|
import os from 'os';
|
|
12
14
|
import crypto from 'crypto';
|
|
13
15
|
const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
|
|
16
|
+
// Gemini 3 Flash powers per-scene visionDescribe: cheaper image tokens than
|
|
17
|
+
// gpt-4o-mini's ~33x multiplier and a stronger VLM. Comma/newline-separated
|
|
18
|
+
// list → rotate on 429 so a single free-tier key still completes (slower).
|
|
19
|
+
const GEMINI_KEYS = (process.env.GEMINI_API_KEY || '')
|
|
20
|
+
.split(/[,\n]+/).map(s => s.trim()).filter(Boolean);
|
|
21
|
+
const GEMINI_MODEL = process.env.GEMINI_MODEL || 'gemini-3-flash-preview';
|
|
14
22
|
const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
|
|
15
23
|
const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
|
|
16
24
|
const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
|
|
@@ -19,6 +27,8 @@ const RATES = {
|
|
|
19
27
|
whisperPerMin: 0.006,
|
|
20
28
|
'gpt-4o-mini': { in: 0.15, out: 0.60 },
|
|
21
29
|
'gpt-4o': { in: 2.50, out: 10.0 },
|
|
30
|
+
// Gemini 3 Flash preview: text+image input share one rate, output 6x.
|
|
31
|
+
'gemini-3-flash-preview': { in: 0.50, out: 3.0 },
|
|
22
32
|
};
|
|
23
33
|
// Per-run cost+token accumulator. Single-threaded JS → plain mutation is safe
|
|
24
34
|
// even across the parallel visionDescribe calls.
|
|
@@ -38,6 +48,17 @@ class CostTracker {
|
|
|
38
48
|
const cost = ((usage.prompt_tokens || 0) / 1e6) * r.in + ((usage.completion_tokens || 0) / 1e6) * r.out;
|
|
39
49
|
this.add(bucket, cost);
|
|
40
50
|
}
|
|
51
|
+
// Gemini reports usageMetadata.{promptTokenCount,candidatesTokenCount}
|
|
52
|
+
// instead of OpenAI's prompt_tokens/completion_tokens.
|
|
53
|
+
geminiVision(bucket, usage) {
|
|
54
|
+
if (!usage) {
|
|
55
|
+
this.add(bucket, 0);
|
|
56
|
+
return;
|
|
57
|
+
}
|
|
58
|
+
const r = RATES['gemini-3-flash-preview'];
|
|
59
|
+
const cost = ((usage.promptTokenCount || 0) / 1e6) * r.in + ((usage.candidatesTokenCount || 0) / 1e6) * r.out;
|
|
60
|
+
this.add(bucket, cost);
|
|
61
|
+
}
|
|
41
62
|
whisper(audioSec) { this.add('whisper', (audioSec / 60) * RATES.whisperPerMin); }
|
|
42
63
|
total() { return Object.values(this.breakdown).reduce((s, b) => s + b.cost, 0); }
|
|
43
64
|
}
|
|
@@ -150,8 +171,49 @@ ${rawText}`,
|
|
|
150
171
|
// turns") instead of guessing from a single frozen midpoint. The model is
|
|
151
172
|
// told the frames are chronological so it describes the action arc, not 3
|
|
152
173
|
// separate moments.
|
|
174
|
+
// One Gemini generateContent call with key rotation + exponential backoff on
|
|
175
|
+
// 429/5xx. A single free-tier key under the 5-way concurrent batch WILL
|
|
176
|
+
// rate-limit; retrying (slower) beats dropping the scene description.
|
|
177
|
+
async function geminiGenerate(parts, maxOutputTokens) {
|
|
178
|
+
if (!GEMINI_KEYS.length)
|
|
179
|
+
return { text: '' };
|
|
180
|
+
const body = JSON.stringify({
|
|
181
|
+
contents: [{ parts }],
|
|
182
|
+
generationConfig: { maxOutputTokens, temperature: 0.4 },
|
|
183
|
+
});
|
|
184
|
+
const MAX_ATTEMPTS = 6;
|
|
185
|
+
let keyIdx = 0;
|
|
186
|
+
let lastErr = '';
|
|
187
|
+
for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
|
|
188
|
+
const key = GEMINI_KEYS[keyIdx % GEMINI_KEYS.length];
|
|
189
|
+
try {
|
|
190
|
+
const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(GEMINI_MODEL)}:generateContent?key=${key}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body });
|
|
191
|
+
if (res.status === 429 || res.status >= 500) {
|
|
192
|
+
lastErr = `Gemini ${res.status}`;
|
|
193
|
+
keyIdx++; // rotate to the next key before backing off
|
|
194
|
+
const backoff = Math.min(60000, 1500 * 2 ** attempt) + Math.floor(Math.random() * 1000);
|
|
195
|
+
await new Promise(r => setTimeout(r, backoff));
|
|
196
|
+
continue;
|
|
197
|
+
}
|
|
198
|
+
if (!res.ok) {
|
|
199
|
+
lastErr = `Gemini ${res.status}: ${(await res.text()).slice(0, 200)}`;
|
|
200
|
+
break;
|
|
201
|
+
}
|
|
202
|
+
const data = await res.json();
|
|
203
|
+
const text = (data?.candidates?.[0]?.content?.parts || [])
|
|
204
|
+
.map(p => p.text || '').join('').trim();
|
|
205
|
+
return { text, usage: data?.usageMetadata };
|
|
206
|
+
}
|
|
207
|
+
catch (e) {
|
|
208
|
+
lastErr = e instanceof Error ? e.message : String(e);
|
|
209
|
+
await new Promise(r => setTimeout(r, Math.min(30000, 1000 * 2 ** attempt)));
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
console.warn(`[analyze_video] geminiGenerate failed after retries: ${lastErr}`);
|
|
213
|
+
return { text: '' };
|
|
214
|
+
}
|
|
153
215
|
async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost) {
|
|
154
|
-
if (!
|
|
216
|
+
if (!GEMINI_KEYS.length)
|
|
155
217
|
return '';
|
|
156
218
|
const frames = frameB64s.filter(Boolean);
|
|
157
219
|
if (frames.length === 0)
|
|
@@ -162,20 +224,7 @@ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost)
|
|
|
162
224
|
const seqNote = frames.length > 1
|
|
163
225
|
? ` The ${frames.length} images are CHRONOLOGICAL samples from the SAME scene (start → middle → end). Treat them as one continuous shot: describe the MOTION ARC across them (direction of movement, camera push/pan, what changes from first to last frame), not three separate moments.`
|
|
164
226
|
: '';
|
|
165
|
-
const
|
|
166
|
-
type: 'image_url',
|
|
167
|
-
image_url: { url: `data:image/jpeg;base64,${b64}` },
|
|
168
|
-
}));
|
|
169
|
-
const res = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
170
|
-
method: 'POST',
|
|
171
|
-
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
172
|
-
body: JSON.stringify({
|
|
173
|
-
model: 'gpt-4o-mini',
|
|
174
|
-
max_tokens: 350,
|
|
175
|
-
messages: [{
|
|
176
|
-
role: 'user',
|
|
177
|
-
content: [
|
|
178
|
-
{ type: 'text', text: `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
|
|
227
|
+
const promptText = `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
|
|
179
228
|
- Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
|
|
180
229
|
- Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
|
|
181
230
|
- Spatial positions: exact position of each character (left/right/above/below/center), distance between them
|
|
@@ -183,17 +232,14 @@ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost)
|
|
|
183
232
|
- Camera: angle, framing (close-up, wide, etc.), and any camera movement across the frames
|
|
184
233
|
- Action: the movement/action arc from first to last frame (direction, what changes)
|
|
185
234
|
|
|
186
|
-
Voiceover during this scene: "${voiceoverText || '(none)'}"
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
});
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
const data = await res.json();
|
|
195
|
-
cost?.chat('vision', 'gpt-4o-mini', data.usage);
|
|
196
|
-
return data.choices?.[0]?.message?.content?.trim() || '';
|
|
235
|
+
Voiceover during this scene: "${voiceoverText || '(none)'}"`;
|
|
236
|
+
const parts = [
|
|
237
|
+
{ text: promptText },
|
|
238
|
+
...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
|
|
239
|
+
];
|
|
240
|
+
const { text, usage } = await geminiGenerate(parts, 512);
|
|
241
|
+
cost?.geminiVision('vision', usage);
|
|
242
|
+
return text;
|
|
197
243
|
}
|
|
198
244
|
// Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
|
|
199
245
|
// frames sampled across the whole video + transcript that returns, together:
|
|
@@ -235,7 +281,8 @@ Return ONLY a JSON object, no markdown fences:
|
|
|
235
281
|
Rules:
|
|
236
282
|
- characters.name: a stable short uppercase label reused for this subject (e.g. "THE BISHOP", "U-94 SUBMARINE"). Max 4 words.
|
|
237
283
|
- Only RECURRING subjects worth a reference sheet. Skip one-off extras. Max 6.
|
|
238
|
-
- characters.description: ENGLISH only, factual, no camera/action words
|
|
284
|
+
- characters.description: ENGLISH only, factual, no camera/action words.
|
|
285
|
+
- DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`,
|
|
239
286
|
},
|
|
240
287
|
];
|
|
241
288
|
for (const b64 of frames) {
|
|
@@ -273,7 +320,7 @@ Rules:
|
|
|
273
320
|
`[AESTHETIC & STYLE]\n${styleLine}\n` +
|
|
274
321
|
`[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
|
|
275
322
|
`[CHARACTER CAST LIST]\n${castList}\n` +
|
|
276
|
-
`[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
|
|
323
|
+
`[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames. Each character has a completely distinct face, hairstyle, body type and age — no two characters look alike.`;
|
|
277
324
|
}
|
|
278
325
|
return { video_summary, video_style, master_cast_prompt, characters };
|
|
279
326
|
}
|