npm - tuna-agent - Versions diffs - 0.1.136 → 0.1.137 - Mend

tuna-agent 0.1.136 → 0.1.137

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2)

package/dist/daemon/analyze-video-handler.js +55 -37
package/package.json +1 -1

package/dist/daemon/analyze-video-handler.js CHANGED

@@ -269,56 +269,74 @@ export async function analyzeVideo(url, onProgress) {
         // 90s monologue becomes ~11 scenes instead of one giant clip. A hard
         // ceiling still bounds runaway vision cost on very long videos.
         const TARGET_SCENE_SEC = 8;
-        const HARD_CAP = 600; // ~80 min @ 8s — safety bound on vision API spend
-        const targetScenes = Math.max(1, Math.ceil(durationSec / TARGET_SCENE_SEC));
-        const MAX_SCENES = Math.min(targetScenes + 20, HARD_CAP);
-        // Split a [start,end] span into ≤TARGET_SCENE_SEC sub-slots, preserving
-        // the voiceover on the FIRST sub-slot (the rest are silent continuations
-        // of the same spoken line so lip-sync isn't duplicated downstream).
-        const pushSplit = (start, end, voiceover) => {
-            const span = end - start;
-            if (span <= TARGET_SCENE_SEC * 1.5) {
-                sceneSlots.push({ start, end, voiceover });
-                return;
-            }
-            const n = Math.ceil(span / TARGET_SCENE_SEC);
-            const step = span / n;
-            for (let k = 0; k < n; k++) {
-                sceneSlots.push({
-                    start: start + k * step,
-                    end: k === n - 1 ? end : start + (k + 1) * step,
-                    voiceover: k === 0 ? voiceover : '',
-                });
-            }
-        };
+        // Safety ceiling ONLY (≈80 min @ 8s). It must NOT be derived from
+        // ceil(duration/8): Whisper emits hundreds of 2-4s segments for a talky
+        // video, so a tighter cap + slice() silently dropped the back half of
+        // the video (13-min clip → 118 slots → only first 6:21 kept). The
+        // normalise pass below already collapses tiny segments into ~8s scenes,
+        // so the natural count ≈ ceil(duration/8) and this only guards runaway.
+        const HARD_CAP = 600;
+        const spans = [];
         if (segments.length > 0) {
-            if (segments[0].start > SILENCE_THRESHOLD) {
-                pushSplit(0, segments[0].start, '');
-            }
+            if (segments[0].start > SILENCE_THRESHOLD)
+                spans.push({ start: 0, end: segments[0].start, voiceover: '' });
             for (let i = 0; i < segments.length; i++) {
                 const seg = segments[i];
-                pushSplit(seg.start, seg.end, seg.text?.trim() || '');
+                spans.push({ start: seg.start, end: seg.end, voiceover: seg.text?.trim() || '' });
                 if (i < segments.length - 1) {
                     const gap = segments[i + 1].start - seg.end;
-                    if (gap > SILENCE_THRESHOLD) {
-                        pushSplit(seg.end, segments[i + 1].start, '');
-                    }
+                    if (gap > SILENCE_THRESHOLD)
+                        spans.push({ start: seg.end, end: segments[i + 1].start, voiceover: '' });
                 }
             }
             const lastEnd = segments[segments.length - 1].end;
-            if (durationSec - lastEnd > SILENCE_THRESHOLD) {
-                pushSplit(lastEnd, durationSec, '');
-            }
+            if (durationSec - lastEnd > SILENCE_THRESHOLD)
+                spans.push({ start: lastEnd, end: durationSec, voiceover: '' });
         }
         else {
-            // No transcript — split into scenes every 8s (Veo3 clip length)
             for (let t = 0; t < durationSec; t += TARGET_SCENE_SEC) {
-                sceneSlots.push({ start: t, end: Math.min(t + TARGET_SCENE_SEC, durationSec), voiceover: '' });
+                spans.push({ start: t, end: Math.min(t + TARGET_SCENE_SEC, durationSec), voiceover: '' });
+            }
+        }
+        // 2) Normalise every span to ~TARGET-second scenes covering the FULL
+        // timeline:
+        //   - long span (> 1.5×TARGET): split into ceil(span/TARGET) equal slots
+        //   - short spans: greedily MERGE consecutive ones until ≈TARGET so a
+        //     talky video becomes ~ceil(duration/8) Veo3-length scenes instead
+        //     of hundreds of 2s fragments — crucially WITHOUT dropping the tail.
+        for (let i = 0; i < spans.length;) {
+            const s = spans[i];
+            const span = s.end - s.start;
+            if (span > TARGET_SCENE_SEC * 1.5) {
+                const n = Math.ceil(span / TARGET_SCENE_SEC);
+                const step = span / n;
+                for (let k = 0; k < n; k++) {
+                    sceneSlots.push({
+                        start: s.start + k * step,
+                        end: k === n - 1 ? s.end : s.start + (k + 1) * step,
+                        voiceover: k === 0 ? s.voiceover : '',
+                    });
+                }
+                i++;
+            }
+            else {
+                let end = s.end;
+                const vo = s.voiceover ? [s.voiceover] : [];
+                let j = i + 1;
+                while (j < spans.length &&
+                    (end - s.start) < TARGET_SCENE_SEC &&
+                    (spans[j].end - s.start) <= TARGET_SCENE_SEC * 1.5) {
+                    end = spans[j].end;
+                    if (spans[j].voiceover)
+                        vo.push(spans[j].voiceover);
+                    j++;
+                }
+                sceneSlots.push({ start: s.start, end, voiceover: vo.join(' ') });
+                i = j;
             }
         }
-        // Duration-aware cap (was a flat 30 — that silently truncated any video
-        // longer than ~4 min). Re-number after slicing.
-        const finalSlots = sceneSlots.slice(0, MAX_SCENES);
+        // slice() now only ever trims pathological >80-min inputs.
+        const finalSlots = sceneSlots.slice(0, HARD_CAP);
         progress(`Đang cắt ${finalSlots.length} frames và phân tích...`);
         console.log('[analyze_video] Building', finalSlots.length, 'scenes (segments:', segments.length, ', duration:', durationSec, 's)');
         // Step 1: Extract frames sequentially. Per scene we grab 3 chronological

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "tuna-agent",
-  "version": "0.1.136",
+  "version": "0.1.137",
   "description": "Tuna Agent - Run AI coding tasks on your machine",
   "bin": {
     "tuna-agent": "dist/cli/index.js"