tuna-agent 0.1.144 → 0.1.145

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,11 @@
1
1
  /**
2
2
  * Analyze Video Handler
3
3
  * Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
4
- * extracts frames per segment, describes each frame via GPT-4o vision.
4
+ * extracts frames per segment, describes each scene via Gemini 3 Flash vision
5
+ * (per-scene; Phase-1 summary/style/cast still uses gpt-4o).
5
6
  *
6
- * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY env var.
7
+ * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
8
+ * (Whisper + correction + Phase-1) and GEMINI_API_KEY (per-scene vision) env vars.
7
9
  */
8
10
  import { AgentWebSocketClient } from './ws-client.js';
9
11
  export interface AnalyzeVideoResult {
@@ -1,9 +1,11 @@
1
1
  /**
2
2
  * Analyze Video Handler
3
3
  * Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
4
- * extracts frames per segment, describes each frame via GPT-4o vision.
4
+ * extracts frames per segment, describes each scene via Gemini 3 Flash vision
5
+ * (per-scene; Phase-1 summary/style/cast still uses gpt-4o).
5
6
  *
6
- * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY env var.
7
+ * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
8
+ * (Whisper + correction + Phase-1) and GEMINI_API_KEY (per-scene vision) env vars.
7
9
  */
8
10
  import { spawn } from 'child_process';
9
11
  import { promises as fs } from 'fs';
@@ -11,6 +13,12 @@ import path from 'path';
11
13
  import os from 'os';
12
14
  import crypto from 'crypto';
13
15
  const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
16
+ // Gemini 3 Flash powers per-scene visionDescribe: a stronger VLM with cheaper
17
+ // image tokens than gpt-4o-mini (~33x multiplier). GEMINI_API_KEY may hold a
18
+ // comma/newline-separated list of keys: calls rotate to the next key on 429/5xx; a single free-tier key still completes via retries (slower).
19
// API keys parsed from GEMINI_API_KEY: split on commas/newlines, trimmed, with
// empty entries dropped. Duplicates are removed (first occurrence wins) so that
// rotating to "the next key" never lands on the same key again.
const GEMINI_KEYS = [...new Set((process.env.GEMINI_API_KEY || '')
    .split(/[,\n]+/)
    .map(s => s.trim())
    .filter(Boolean))];
// Model id used in the generateContent URL; an unset, empty or whitespace-only
// GEMINI_MODEL falls back to the default preview model.
const GEMINI_MODEL = (process.env.GEMINI_MODEL || '').trim() || 'gemini-3-flash-preview';
14
22
  const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
15
23
  const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
16
24
  const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
@@ -19,6 +27,8 @@ const RATES = {
19
27
  whisperPerMin: 0.006,
20
28
  'gpt-4o-mini': { in: 0.15, out: 0.60 },
21
29
  'gpt-4o': { in: 2.50, out: 10.0 },
30
+ // Gemini 3 Flash preview: text+image input share one rate, output 6x.
31
+ 'gemini-3-flash-preview': { in: 0.50, out: 3.0 },
22
32
  };
23
33
  // Per-run cost+token accumulator. Single-threaded JS → plain mutation is safe
24
34
  // even across the parallel visionDescribe calls.
@@ -38,6 +48,17 @@ class CostTracker {
38
48
  const cost = ((usage.prompt_tokens || 0) / 1e6) * r.in + ((usage.completion_tokens || 0) / 1e6) * r.out;
39
49
  this.add(bucket, cost);
40
50
  }
51
+ // Gemini reports usageMetadata.{promptTokenCount,candidatesTokenCount}
52
+ // instead of OpenAI's prompt_tokens/completion_tokens.
53
+ geminiVision(bucket, usage) {
54
+ if (!usage) {
55
+ this.add(bucket, 0);
56
+ return;
57
+ }
58
+ const r = RATES['gemini-3-flash-preview'];
59
+ const cost = ((usage.promptTokenCount || 0) / 1e6) * r.in + ((usage.candidatesTokenCount || 0) / 1e6) * r.out;
60
+ this.add(bucket, cost);
61
+ }
41
62
  whisper(audioSec) { this.add('whisper', (audioSec / 60) * RATES.whisperPerMin); }
42
63
  total() { return Object.values(this.breakdown).reduce((s, b) => s + b.cost, 0); }
43
64
  }
@@ -150,8 +171,49 @@ ${rawText}`,
150
171
  // turns") instead of guessing from a single frozen midpoint. The model is
151
172
  // told the frames are chronological so it describes the action arc, not 3
152
173
  // separate moments.
174
+ // One Gemini generateContent call with key rotation + exponential backoff on
175
+ // 429/5xx. A single free-tier key under the 5-way concurrent batch WILL
176
+ // rate-limit; retrying (slower) beats dropping the scene description.
177
/**
 * Sends one `generateContent` request to the Gemini REST API, retrying on
 * rate limits, server errors and thrown fetch/parse errors.
 *
 * Retry policy (at most MAX_ATTEMPTS requests):
 *  - HTTP 429 or 5xx: rotate to the next entry of GEMINI_KEYS, wait with
 *    exponential backoff plus up to 1s of jitter, then retry.
 *  - any other non-2xx status: treated as permanent; stops immediately.
 *  - a thrown error (network failure, unparsable body): retry on the same key
 *    after exponential backoff.
 * No wait is performed after the final attempt, since nothing follows it.
 *
 * Never throws. On failure it logs a warning and resolves to `{ text: '' }`.
 *
 * @param {Array<object>} parts - content parts sent as `contents[0].parts`.
 * @param {number} maxOutputTokens - forwarded as generationConfig.maxOutputTokens.
 * @returns {Promise<{text: string, usage?: object}>} concatenated, trimmed text
 *   of the first candidate plus the response's `usageMetadata` (if any);
 *   `{ text: '' }` when no key is configured or every attempt failed.
 */
async function geminiGenerate(parts, maxOutputTokens) {
    if (!GEMINI_KEYS.length)
        return { text: '' };
    // The key is sent in the x-goog-api-key header rather than a `?key=` query
    // parameter so it cannot end up in URL logs, proxies or error messages.
    const url = `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(GEMINI_MODEL)}:generateContent`;
    const body = JSON.stringify({
        contents: [{ parts }],
        generationConfig: { maxOutputTokens, temperature: 0.4 },
    });
    const MAX_ATTEMPTS = 6;
    const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
    let keyIdx = 0;
    let lastErr = '';
    for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
        const isLastAttempt = attempt === MAX_ATTEMPTS - 1;
        const key = GEMINI_KEYS[keyIdx % GEMINI_KEYS.length];
        try {
            const res = await fetch(url, {
                method: 'POST',
                headers: { 'Content-Type': 'application/json', 'x-goog-api-key': key },
                body,
            });
            if (res.status === 429 || res.status >= 500) {
                lastErr = `Gemini ${res.status}`;
                keyIdx++; // rotate to the next key before backing off
                if (!isLastAttempt) {
                    await sleep(Math.min(60000, 1500 * 2 ** attempt) + Math.floor(Math.random() * 1000));
                }
                continue;
            }
            if (!res.ok) {
                // Non-retryable client error: keep a short excerpt for the log.
                lastErr = `Gemini ${res.status}: ${(await res.text()).slice(0, 200)}`;
                break;
            }
            const data = await res.json();
            const text = (data?.candidates?.[0]?.content?.parts || [])
                .map(p => p.text || '').join('').trim();
            return { text, usage: data?.usageMetadata };
        }
        catch (e) {
            lastErr = e instanceof Error ? e.message : String(e);
            if (!isLastAttempt) {
                await sleep(Math.min(30000, 1000 * 2 ** attempt));
            }
        }
    }
    console.warn(`[analyze_video] geminiGenerate failed after retries: ${lastErr}`);
    return { text: '' };
}
153
215
  async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost) {
154
- if (!OPENAI_KEY)
216
+ if (!GEMINI_KEYS.length)
155
217
  return '';
156
218
  const frames = frameB64s.filter(Boolean);
157
219
  if (frames.length === 0)
@@ -162,20 +224,7 @@ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost)
162
224
  const seqNote = frames.length > 1
163
225
  ? ` The ${frames.length} images are CHRONOLOGICAL samples from the SAME scene (start → middle → end). Treat them as one continuous shot: describe the MOTION ARC across them (direction of movement, camera push/pan, what changes from first to last frame), not three separate moments.`
164
226
  : '';
165
- const imageParts = frames.map(b64 => ({
166
- type: 'image_url',
167
- image_url: { url: `data:image/jpeg;base64,${b64}` },
168
- }));
169
- const res = await fetch('https://api.openai.com/v1/chat/completions', {
170
- method: 'POST',
171
- headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
172
- body: JSON.stringify({
173
- model: 'gpt-4o-mini',
174
- max_tokens: 350,
175
- messages: [{
176
- role: 'user',
177
- content: [
178
- { type: 'text', text: `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
227
+ const promptText = `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
179
228
  - Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
180
229
  - Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
181
230
  - Spatial positions: exact position of each character (left/right/above/below/center), distance between them
@@ -183,17 +232,14 @@ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost)
183
232
  - Camera: angle, framing (close-up, wide, etc.), and any camera movement across the frames
184
233
  - Action: the movement/action arc from first to last frame (direction, what changes)
185
234
 
186
- Voiceover during this scene: "${voiceoverText || '(none)'}"` },
187
- ...imageParts,
188
- ],
189
- }],
190
- }),
191
- });
192
- if (!res.ok)
193
- return '';
194
- const data = await res.json();
195
- cost?.chat('vision', 'gpt-4o-mini', data.usage);
196
- return data.choices?.[0]?.message?.content?.trim() || '';
235
+ Voiceover during this scene: "${voiceoverText || '(none)'}"`;
236
+ const parts = [
237
+ { text: promptText },
238
+ ...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
239
+ ];
240
+ const { text, usage } = await geminiGenerate(parts, 512);
241
+ cost?.geminiVision('vision', usage);
242
+ return text;
197
243
  }
198
244
  // Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
199
245
  // frames sampled across the whole video + transcript that returns, together:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.144",
3
+ "version": "0.1.145",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"