npm - tuna-agent - Versions diffs - 0.1.144 → 0.1.145 - Mend

tuna-agent 0.1.144 → 0.1.145

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/daemon/analyze-video-handler.d.ts +4 -2
package/dist/daemon/analyze-video-handler.js +74 -28
package/package.json +1 -1

package/dist/daemon/analyze-video-handler.d.ts CHANGED

@@ -1,9 +1,11 @@
 /**
  * Analyze Video Handler
  * Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
- * extracts frames per segment, describes each frame via GPT-4o vision.
+ * extracts frames per segment, describes each scene via Gemini 3 Flash vision
+ * (per-scene; Phase-1 summary/style/cast still uses gpt-4o).
  *
- * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY env var.
+ * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
+ * (Whisper + correction + Phase-1) and GEMINI_API_KEY (per-scene vision) env vars.
  */
 import { AgentWebSocketClient } from './ws-client.js';
 export interface AnalyzeVideoResult {

package/dist/daemon/analyze-video-handler.js CHANGED

@@ -1,9 +1,11 @@
 /**
  * Analyze Video Handler
  * Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
- * extracts frames per segment, describes each frame via GPT-4o vision.
+ * extracts frames per segment, describes each scene via Gemini 3 Flash vision
+ * (per-scene; Phase-1 summary/style/cast still uses gpt-4o).
  *
- * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY env var.
+ * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
+ * (Whisper + correction + Phase-1) and GEMINI_API_KEY (per-scene vision) env vars.
  */
 import { spawn } from 'child_process';
 import { promises as fs } from 'fs';
@@ -11,6 +13,12 @@ import path from 'path';
 import os from 'os';
 import crypto from 'crypto';
 const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
+// Gemini 3 Flash powers per-scene visionDescribe: cheaper image tokens than
+// gpt-4o-mini's ~33x multiplier and a stronger VLM. Comma/newline-separated
+// list → rotate on 429 so a single free-tier key still completes (slower).
+const GEMINI_KEYS = (process.env.GEMINI_API_KEY || '')
+    .split(/[,\n]+/).map(s => s.trim()).filter(Boolean);
+const GEMINI_MODEL = process.env.GEMINI_MODEL || 'gemini-3-flash-preview';
 const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
 const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
 const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
@@ -19,6 +27,8 @@ const RATES = {
     whisperPerMin: 0.006,
     'gpt-4o-mini': { in: 0.15, out: 0.60 },
     'gpt-4o': { in: 2.50, out: 10.0 },
+    // Gemini 3 Flash preview: text+image input share one rate, output 6x.
+    'gemini-3-flash-preview': { in: 0.50, out: 3.0 },
 };
 // Per-run cost+token accumulator. Single-threaded JS → plain mutation is safe
 // even across the parallel visionDescribe calls.
@@ -38,6 +48,17 @@ class CostTracker {
         const cost = ((usage.prompt_tokens || 0) / 1e6) * r.in + ((usage.completion_tokens || 0) / 1e6) * r.out;
         this.add(bucket, cost);
     }
+    // Gemini reports usageMetadata.{promptTokenCount,candidatesTokenCount}
+    // instead of OpenAI's prompt_tokens/completion_tokens.
+    geminiVision(bucket, usage) {
+        if (!usage) {
+            this.add(bucket, 0);
+            return;
+        }
+        const r = RATES['gemini-3-flash-preview'];
+        const cost = ((usage.promptTokenCount || 0) / 1e6) * r.in + ((usage.candidatesTokenCount || 0) / 1e6) * r.out;
+        this.add(bucket, cost);
+    }
     whisper(audioSec) { this.add('whisper', (audioSec / 60) * RATES.whisperPerMin); }
     total() { return Object.values(this.breakdown).reduce((s, b) => s + b.cost, 0); }
 }
@@ -150,8 +171,49 @@ ${rawText}`,
 // turns") instead of guessing from a single frozen midpoint. The model is
 // told the frames are chronological so it describes the action arc, not 3
 // separate moments.
+// One Gemini generateContent call with key rotation + exponential backoff on
+// 429/5xx. A single free-tier key under the 5-way concurrent batch WILL
+// rate-limit; retrying (slower) beats dropping the scene description.
+async function geminiGenerate(parts, maxOutputTokens) {
+    if (!GEMINI_KEYS.length)
+        return { text: '' };
+    const body = JSON.stringify({
+        contents: [{ parts }],
+        generationConfig: { maxOutputTokens, temperature: 0.4 },
+    });
+    const MAX_ATTEMPTS = 6;
+    let keyIdx = 0;
+    let lastErr = '';
+    for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
+        const key = GEMINI_KEYS[keyIdx % GEMINI_KEYS.length];
+        try {
+            const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(GEMINI_MODEL)}:generateContent?key=${key}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body });
+            if (res.status === 429 || res.status >= 500) {
+                lastErr = `Gemini ${res.status}`;
+                keyIdx++; // rotate to the next key before backing off
+                const backoff = Math.min(60000, 1500 * 2 ** attempt) + Math.floor(Math.random() * 1000);
+                await new Promise(r => setTimeout(r, backoff));
+                continue;
+            }
+            if (!res.ok) {
+                lastErr = `Gemini ${res.status}: ${(await res.text()).slice(0, 200)}`;
+                break;
+            }
+            const data = await res.json();
+            const text = (data?.candidates?.[0]?.content?.parts || [])
+                .map(p => p.text || '').join('').trim();
+            return { text, usage: data?.usageMetadata };
+        }
+        catch (e) {
+            lastErr = e instanceof Error ? e.message : String(e);
+            await new Promise(r => setTimeout(r, Math.min(30000, 1000 * 2 ** attempt)));
+        }
+    }
+    console.warn(`[analyze_video] geminiGenerate failed after retries: ${lastErr}`);
+    return { text: '' };
+}
 async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost) {
-    if (!OPENAI_KEY)
+    if (!GEMINI_KEYS.length)
         return '';
     const frames = frameB64s.filter(Boolean);
     if (frames.length === 0)
@@ -162,20 +224,7 @@ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost)
     const seqNote = frames.length > 1
         ? ` The ${frames.length} images are CHRONOLOGICAL samples from the SAME scene (start → middle → end). Treat them as one continuous shot: describe the MOTION ARC across them (direction of movement, camera push/pan, what changes from first to last frame), not three separate moments.`
         : '';
-    const imageParts = frames.map(b64 => ({
-        type: 'image_url',
-        image_url: { url: `data:image/jpeg;base64,${b64}` },
-    }));
-    const res = await fetch('https://api.openai.com/v1/chat/completions', {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
-        body: JSON.stringify({
-            model: 'gpt-4o-mini',
-            max_tokens: 350,
-            messages: [{
-                    role: 'user',
-                    content: [
-                        { type: 'text', text: `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
+    const promptText = `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
 - Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
 - Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
 - Spatial positions: exact position of each character (left/right/above/below/center), distance between them
@@ -183,17 +232,14 @@ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost)
 - Camera: angle, framing (close-up, wide, etc.), and any camera movement across the frames
 - Action: the movement/action arc from first to last frame (direction, what changes)
-Voiceover during this scene: "${voiceoverText || '(none)'}"` },
-                        ...imageParts,
-                    ],
-                }],
-        }),
-    });
-    if (!res.ok)
-        return '';
-    const data = await res.json();
-    cost?.chat('vision', 'gpt-4o-mini', data.usage);
-    return data.choices?.[0]?.message?.content?.trim() || '';
+Voiceover during this scene: "${voiceoverText || '(none)'}"`;
+    const parts = [
+        { text: promptText },
+        ...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
+    ];
+    const { text, usage } = await geminiGenerate(parts, 512);
+    cost?.geminiVision('vision', usage);
+    return text;
 }
 // Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
 // frames sampled across the whole video + transcript that returns, together:

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "tuna-agent",
-  "version": "0.1.144",
+  "version": "0.1.145",
   "description": "Tuna Agent - Run AI coding tasks on your machine",
   "bin": {
     "tuna-agent": "dist/cli/index.js"