npm - tuna-agent - Versions diffs - 0.1.134 → 0.1.135 - Mend

tuna-agent 0.1.134 → 0.1.135

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/dist/daemon/analyze-video-handler.js +58 -22
package/package.json +1 -1

package/dist/daemon/analyze-video-handler.js CHANGED Viewed

@@ -74,31 +74,46 @@ ${rawText}`,
         return rawText;
     }
 }
-async function visionDescribe(frameB64, voiceoverText, castContext = '') {
+// Accepts 1..N frames sampled across a scene (start → mid → end). Multiple
+// frames let the model observe MOTION direction ("walks left-to-right then
+// turns") instead of guessing from a single frozen midpoint. The model is
+// told the frames are chronological so it describes the action arc, not 3
+// separate moments.
+async function visionDescribe(frameB64s, voiceoverText, castContext = '') {
     if (!OPENAI_KEY)
         return '';
+    const frames = frameB64s.filter(Boolean);
+    if (frames.length === 0)
+        return '';
     const castBlock = castContext
         ? `\n\nKNOWN CHARACTER CAST (reuse these EXACT names when a subject appears — do NOT invent new labels for the same subject):\n${castContext}\n`
         : '';
+    const seqNote = frames.length > 1
+        ? ` The ${frames.length} images are CHRONOLOGICAL samples from the SAME scene (start → middle → end). Treat them as one continuous shot: describe the MOTION ARC across them (direction of movement, camera push/pan, what changes from first to last frame), not three separate moments.`
+        : '';
+    const imageParts = frames.map(b64 => ({
+        type: 'image_url',
+        image_url: { url: `data:image/jpeg;base64,${b64}` },
+    }));
     const res = await fetch('https://api.openai.com/v1/chat/completions', {
         method: 'POST',
         headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
         body: JSON.stringify({
             model: 'gpt-4o-mini',
-            max_tokens: 300,
+            max_tokens: 350,
             messages: [{
                     role: 'user',
                     content: [
-                        { type: 'text', text: `Describe this frame in detail (4-6 sentences, English).${castBlock} Include:
+                        { type: 'text', text: `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
 - Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
 - Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
 - Spatial positions: exact position of each character (left/right/above/below/center), distance between them
 - Environment: setting, lighting, color palette, atmosphere
-- Camera: angle, framing (close-up, wide, etc.)
-- Action: what is happening in this moment, movement direction
+- Camera: angle, framing (close-up, wide, etc.), and any camera movement across the frames
+- Action: the movement/action arc from first to last frame (direction, what changes)
-Voiceover at this moment: "${voiceoverText || '(none)'}"` },
-                        { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${frameB64}` } },
+Voiceover during this scene: "${voiceoverText || '(none)'}"` },
+                        ...imageParts,
                     ],
                 }],
         }),
@@ -298,26 +313,47 @@ export async function analyzeVideo(url, onProgress) {
         const finalSlots = sceneSlots.slice(0, MAX_SCENES);
         progress(`Đang cắt ${finalSlots.length} frames và phân tích...`);
         console.log('[analyze_video] Building', finalSlots.length, 'scenes (segments:', segments.length, ', duration:', durationSec, 's)');
-        // Step 1: Extract all frames sequentially (ffmpeg can't run in parallel on same file efficiently)
+        // Step 1: Extract frames sequentially. Per scene we grab 3 chronological
+        // frames — start → middle → end — so the vision model can read the motion
+        // arc (direction of movement, camera push) instead of guessing from a
+        // single frozen midpoint. The MIDDLE frame doubles as the UI thumbnail.
+        // Tiny scenes (<1.5s) collapse to just the midpoint (the 3 frames would
+        // be near-identical — no motion info, wasted tokens). Start/end are nudged
+        // inward by 15% of the span (max 0.3s) to dodge hard-cut / black transition frames.
         const frameBuffers = [];
         for (let i = 0; i < finalSlots.length; i++) {
             const slot = finalSlots[i];
-            const midpoint = (slot.start + slot.end) / 2;
-            const framePath = path.join(framesDir, `scene-${String(i).padStart(3, '0')}.jpg`);
-            try {
-                await run(FFMPEG, ['-y', '-ss', String(midpoint), '-i', videoPath, '-vframes', '1', '-vf', 'scale=640:-1', '-q:v', '5', framePath, '-loglevel', 'error']);
-                const buf = await fs.readFile(framePath);
-                frameBuffers.push({ idx: i, buf, slot });
+            const span = slot.end - slot.start;
+            const mid = (slot.start + slot.end) / 2;
+            const inset = Math.min(0.3, span * 0.15);
+            const stamps = span < 1.5
+                ? [mid]
+                : [slot.start + inset, mid, slot.end - inset];
+            const buffers = [];
+            let thumb = null;
+            for (let k = 0; k < stamps.length; k++) {
+                const framePath = path.join(framesDir, `scene-${String(i).padStart(3, '0')}-${k}.jpg`);
+                try {
+                    await run(FFMPEG, ['-y', '-ss', String(Math.max(0, stamps[k])), '-i', videoPath, '-vframes', '1', '-vf', 'scale=640:-1', '-q:v', '5', framePath, '-loglevel', 'error']);
+                    const buf = await fs.readFile(framePath);
+                    buffers.push(buf);
+                    // Middle frame = thumbnail (index 1 when 3 frames, index 0 when 1).
+                    if (k === Math.floor(stamps.length / 2))
+                        thumb = buf;
+                }
+                catch (err) {
+                    const msg = err instanceof Error ? err.message : String(err);
+                    console.warn('[analyze_video] Frame extract failed for scene', i, 'frame', k, msg);
+                }
             }
-            catch (err) {
-                const msg = err instanceof Error ? err.message : String(err);
-                console.warn('[analyze_video] Frame extract failed for scene', i, msg);
+            if (buffers.length) {
+                frameBuffers.push({ idx: i, frames: buffers, thumb: thumb || buffers[0], slot });
             }
         }
         // Step 2: Style + Master Cast FIRST (Phase 1), so per-scene describe can
         // reuse consistent character labels (the AI_Video_Clone lesson).
         progress('Đang phân tích video style...');
-        const styleSamples = frameBuffers.slice(0, 3).map(f => f.buf.toString('base64'));
+        const styleSamples = frameBuffers.slice(0, 3).map(f => f.thumb.toString('base64'));
         const video_style = await visionExtractStyle(styleSamples);
         console.log('[analyze_video] Video style:', video_style.substring(0, 100));
         progress('Đang trích xuất dàn nhân vật (Master Cast)...');
@@ -327,7 +363,7 @@ export async function analyzeVideo(url, onProgress) {
         const castSamples = frameBuffers
             .filter((_, i) => i % castStep === 0)
             .slice(0, castSampleCount)
-            .map(f => f.buf.toString('base64'));
+            .map(f => f.thumb.toString('base64'));
         const { master_cast_prompt, characters } = await visionExtractMasterCast(castSamples, transcript.text || '', video_style);
         const castContext = characters.length
             ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
@@ -341,14 +377,14 @@ export async function analyzeVideo(url, onProgress) {
         for (let b = 0; b < frameBuffers.length; b += BATCH_SIZE) {
             const batch = frameBuffers.slice(b, b + BATCH_SIZE);
             progress(`Đang phân tích scene ${b + 1}-${Math.min(b + BATCH_SIZE, frameBuffers.length)}/${frameBuffers.length}...`);
-            const results = await Promise.all(batch.map(async ({ idx, buf, slot }) => {
+            const results = await Promise.all(batch.map(async ({ idx, frames, thumb, slot }) => {
                 try {
-                    const visual_description = await visionDescribe(buf.toString('base64'), slot.voiceover, castContext);
+                    const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext);
                     return {
                         scene_number: idx + 1,
                         timestamp_start: Math.round(slot.start * 10) / 10,
                         timestamp_end: Math.round(slot.end * 10) / 10,
-                        thumbnail_base64: buf.toString('base64'),
+                        thumbnail_base64: thumb.toString('base64'),
                         voiceover: slot.voiceover,
                         visual_description,
                     };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "tuna-agent",
-  "version": "0.1.134",
+  "version": "0.1.135",
   "description": "Tuna Agent - Run AI coding tasks on your machine",
   "bin": {
     "tuna-agent": "dist/cli/index.js"