npm - tuna-agent - Versions diffs - 0.1.151 → 0.1.152 - Mend

tuna-agent 0.1.151 → 0.1.152

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2)

package/dist/daemon/analyze-video-handler.js +24 -16
package/package.json +1 -1

package/dist/daemon/analyze-video-handler.js CHANGED Viewed

@@ -269,7 +269,7 @@ async function visionExtractPhase1(frames, transcript, cost) {
         master_cast_prompt: '',
         characters: [],
     };
-    if (!GEMINI_KEYS.length || frames.length === 0)
+    if (!OPENAI_KEY || frames.length === 0)
         return empty;
     try {
         const promptText = `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
@@ -291,17 +291,25 @@ Rules:
 - NO GROUP ENTRIES (CRITICAL): NEVER output a collective/crowd label as a single entry — forbidden: "VILLAGERS", "LADIES GROUP", "KNITTING GROUP", "CROWD", "GROUP OF ...", any "*_GROUP". If 2+ similar secondary people RECUR across scenes, list them as SEPARATE numbered individuals (e.g. WOMAN_1, WOMAN_2, WOMAN_3), each with its OWN distinct face/hair/body/age. Only a truly anonymous one-off background that never recurs may be omitted entirely.
 - characters.description: ENGLISH only, factual, no camera/action words.
 - DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`;
-        // Phase-1 on Gemini 2.5 Flash: image-heavy read is far cheaper than gpt-4o,
-        // and cast recall is backstopped by the post-Phase-2 reconcile pass, so a
-        // small frame sample suffices here.
-        const parts = [
-            { text: promptText },
-            ...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
-        ];
-        const { text: rawTxt, usage } = await geminiGenerate(parts, 1600, 'gemini-2.5-flash');
+        // Phase-1 stays on gpt-4o (best cinematic summary + style — the narrative
+        // spine for the cloned script) with a dense 30-frame seed. Only 1
+        // call/video; final cast recall is double-covered by the reconcile pass.
+        const content = [{ type: 'text', text: promptText }];
+        for (const b64 of frames) {
+            content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
+        }
+        const res = await fetch('https://api.openai.com/v1/chat/completions', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
+            body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
+        });
+        if (!res.ok)
+            return empty;
+        const data = await res.json();
+        cost?.chat('phase1', 'gpt-4o', data.usage);
+        const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
         if (!rawTxt)
             return empty;
-        cost?.geminiVision('phase1', usage, 'gemini-2.5-flash');
         let parsed = {};
         try {
             const m = rawTxt.match(/\{[\s\S]*\}/);
@@ -604,12 +612,12 @@ export async function analyzeVideo(url, onProgress) {
         // master cast + characters. Runs before per-scene describe so the cast
         // context keeps naming consistent across the whole timeline.
         progress('Đang phân tích tổng thể (summary + style + master cast)...');
-        // Sample up to 10 frames evenly — enough for summary + style + a naming
-        // seed. Cast RECALL no longer depends on this sample: the post-Phase-2
-        // reconcile pass derives the definitive cast from every per-scene
-        // description, so a small sample keeps the (now Gemini 2.5 Flash) Phase-1
-        // call cheap.
-        const p1SampleCount = Math.min(10, frameBuffers.length);
+        // Sample up to 30 frames evenly (matches AI_Video_Clone). Final cast
+        // RECALL is owned by the post-Phase-2 reconcile pass (reads EVERY scene),
+        // but a dense 30-frame seed gives the per-scene pass a consistent naming
+        // vocabulary up-front → cleaner reconcile + safer on hard cases. Only 1
+        // call/video so the richer sample is worth it.
+        const p1SampleCount = Math.min(30, frameBuffers.length);
         const p1Step = Math.max(1, Math.floor(frameBuffers.length / p1SampleCount));
         const p1Samples = frameBuffers
             .filter((_, i) => i % p1Step === 0)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "tuna-agent",
-  "version": "0.1.151",
+  "version": "0.1.152",
   "description": "Tuna Agent - Run AI coding tasks on your machine",
   "bin": {
     "tuna-agent": "dist/cli/index.js"