npm - tuna-agent - Versions diffs - 0.1.148 → 0.1.149 - Mend

tuna-agent 0.1.148 → 0.1.149

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (2)

package/dist/daemon/analyze-video-handler.js +24 -27
package/package.json +1 -1

package/dist/daemon/analyze-video-handler.js CHANGED Viewed

@@ -29,6 +29,8 @@ const RATES = {
     'gpt-4o': { in: 2.50, out: 10.0 },
     // Gemini 3 Flash preview: text+image input share one rate, output 6x.
     'gemini-3-flash-preview': { in: 0.50, out: 3.0 },
+    // Gemini 2.5 Flash: cheaper image-heavy reads (used by Phase-1).
+    'gemini-2.5-flash': { in: 0.30, out: 2.50 },
 };
 // Per-run cost+token accumulator. Single-threaded JS → plain mutation is safe
 // even across the parallel visionDescribe calls.
@@ -50,12 +52,12 @@ class CostTracker {
     }
     // Gemini reports usageMetadata.{promptTokenCount,candidatesTokenCount}
     // instead of OpenAI's prompt_tokens/completion_tokens.
-    geminiVision(bucket, usage) {
+    geminiVision(bucket, usage, model = 'gemini-3-flash-preview') {
         if (!usage) {
             this.add(bucket, 0);
             return;
         }
-        const r = RATES['gemini-3-flash-preview'];
+        const r = RATES[model];
         const cost = ((usage.promptTokenCount || 0) / 1e6) * r.in + ((usage.candidatesTokenCount || 0) / 1e6) * r.out;
         this.add(bucket, cost);
     }
@@ -174,7 +176,7 @@ ${rawText}`,
 // One Gemini generateContent call with key rotation + exponential backoff on
 // 429/5xx. A single free-tier key under the 5-way concurrent batch WILL
 // rate-limit; retrying (slower) beats dropping the scene description.
-async function geminiGenerate(parts, maxOutputTokens) {
+async function geminiGenerate(parts, maxOutputTokens, model = GEMINI_MODEL) {
     if (!GEMINI_KEYS.length)
         return { text: '' };
     const body = JSON.stringify({
@@ -187,7 +189,7 @@ async function geminiGenerate(parts, maxOutputTokens) {
     for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
         const key = GEMINI_KEYS[keyIdx % GEMINI_KEYS.length];
         try {
-            const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(GEMINI_MODEL)}:generateContent?key=${key}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body });
+            const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(model)}:generateContent?key=${key}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body });
             if (res.status === 429 || res.status >= 500) {
                 lastErr = `Gemini ${res.status}`;
                 keyIdx++; // rotate to the next key before backing off
@@ -259,13 +261,10 @@ async function visionExtractPhase1(frames, transcript, cost) {
         master_cast_prompt: '',
         characters: [],
     };
-    if (!OPENAI_KEY || frames.length === 0)
+    if (!GEMINI_KEYS.length || frames.length === 0)
         return empty;
     try {
-        const content = [
-            {
-                type: 'text',
-                text: `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
+        const promptText = `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
 Transcript context: "${(transcript || '').slice(0, 4000)}"
@@ -282,22 +281,18 @@ Rules:
 - characters.name: a stable short uppercase label reused for this subject (e.g. "THE BISHOP", "U-94 SUBMARINE"). Max 4 words.
 - RECALL (CRITICAL): list EVERY distinct recurring subject SEPARATELY. If a family or group recurs, include EACH member as its own entry (e.g. adult man, adult woman, older boy, younger girl) — never merge them into one. Skip only true one-off background extras. Be COMPLETE: missing a recurring character is worse than one extra. Up to 8.
 - characters.description: ENGLISH only, factual, no camera/action words.
-- DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`,
-            },
+- DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`;
+        // Phase-1 on Gemini 2.5 Flash: image-heavy read is far cheaper than gpt-4o,
+        // and cast recall is backstopped by the post-Phase-2 reconcile pass, so a
+        // small frame sample suffices here.
+        const parts = [
+            { text: promptText },
+            ...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
         ];
-        for (const b64 of frames) {
-            content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
-        }
-        const res = await fetch('https://api.openai.com/v1/chat/completions', {
-            method: 'POST',
-            headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
-            body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
-        });
-        if (!res.ok)
+        const { text: rawTxt, usage } = await geminiGenerate(parts, 1600, 'gemini-2.5-flash');
+        if (!rawTxt)
             return empty;
-        const data = await res.json();
-        cost?.chat('phase1', 'gpt-4o', data.usage);
-        const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
+        cost?.geminiVision('phase1', usage, 'gemini-2.5-flash');
         let parsed = {};
         try {
             const m = rawTxt.match(/\{[\s\S]*\}/);
@@ -599,10 +594,12 @@ export async function analyzeVideo(url, onProgress) {
         // master cast + characters. Runs before per-scene describe so the cast
         // context keeps naming consistent across the whole timeline.
         progress('Đang phân tích tổng thể (summary + style + master cast)...');
-        // Sample up to 30 frames evenly across the whole video — denser sampling
-        // is critical for cast RECALL on sparse-narration (ASMR) videos where
-        // Phase-1 relies almost entirely on frames (matches AI_Video_Clone).
-        const p1SampleCount = Math.min(30, frameBuffers.length);
+        // Sample up to 10 frames evenly — enough for summary + style + a naming
+        // seed. Cast RECALL no longer depends on this sample: the post-Phase-2
+        // reconcile pass derives the definitive cast from every per-scene
+        // description, so a small sample keeps the (now Gemini 2.5 Flash) Phase-1
+        // call cheap.
+        const p1SampleCount = Math.min(10, frameBuffers.length);
         const p1Step = Math.max(1, Math.floor(frameBuffers.length / p1SampleCount));
         const p1Samples = frameBuffers
             .filter((_, i) => i % p1Step === 0)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "tuna-agent",
-  "version": "0.1.148",
+  "version": "0.1.149",
   "description": "Tuna Agent - Run AI coding tasks on your machine",
   "bin": {
     "tuna-agent": "dist/cli/index.js"