npm - tuna-agent - Versions diffs - 0.1.137 → 0.1.139 - Mend

tuna-agent 0.1.137 → 0.1.139

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/daemon/analyze-video-handler.d.ts +1 -0
package/dist/daemon/analyze-video-handler.js +129 -73
package/package.json +1 -1

package/dist/daemon/analyze-video-handler.d.ts CHANGED Viewed

@@ -11,6 +11,7 @@ export interface AnalyzeVideoResult {
     duration_sec: number;
     language: string;
     transcript: string;
+    summary: string;
     video_style: string;
     master_cast_prompt: string;
     characters: Array<{

package/dist/daemon/analyze-video-handler.js CHANGED Viewed

@@ -14,6 +14,49 @@ const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
 const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
 const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
 const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
+// Downloaded source videos are cached by URL hash so re-analyze doesn't
+// re-download (saves bandwidth + time on long clips). relabs01 shares disk
+// with Demucs + the local media server, so the cache is bounded: drop files
+// older than 7 days, then if the total still exceeds 15 GB evict oldest-first.
+const CACHE_DIR = path.join(os.homedir(), '.tuna-analyze-cache');
+const CACHE_MAX_AGE_MS = 7 * 24 * 3600 * 1000;
+const CACHE_MAX_BYTES = 15 * 1024 * 1024 * 1024;
+async function pruneVideoCache() { // Best-effort trim of CACHE_DIR: delete files older than CACHE_MAX_AGE_MS, then evict oldest-first until the total is within CACHE_MAX_BYTES. Every failure is swallowed, so the returned promise never rejects.
+    try {
+        await fs.mkdir(CACHE_DIR, { recursive: true }); // make sure the directory exists before listing it
+        const names = await fs.readdir(CACHE_DIR);
+        const now = Date.now(); // single timestamp used for every age comparison in this pass
+        const live = []; // files that survive the age check, as { p, size, mtime }; input to the size-based eviction below
+        for (const name of names) {
+            const p = path.join(CACHE_DIR, name);
+            try {
+                const st = await fs.stat(p);
+                if (!st.isFile()) // only regular files are counted or deleted; anything else is left alone
+                    continue;
+                if (now - st.mtimeMs > CACHE_MAX_AGE_MS) { // age is measured from the last-modified time
+                    await fs.rm(p, { force: true });
+                    continue;
+                }
+                live.push({ p, size: st.size, mtime: st.mtimeMs });
+            }
+            catch { /* stat/rm failed (e.g. a race with another run deleting the file) — skip this entry */ }
+        }
+        let total = live.reduce((s, f) => s + f.size, 0); // combined byte size of the surviving files
+        if (total > CACHE_MAX_BYTES) {
+            live.sort((a, b) => a.mtime - b.mtime); // ascending mtime, so the oldest files are evicted first
+            for (const f of live) {
+                if (total <= CACHE_MAX_BYTES) // stop as soon as the cache is back under the cap
+                    break;
+                try {
+                    await fs.rm(f.p, { force: true });
+                    total -= f.size; // only reached when rm did not throw
+                }
+                catch { /* could not remove this file — leave total unchanged and try the next-oldest */ }
+            }
+        }
+    }
+    catch { /* cache pruning is best-effort; never block analysis */ }
+}
 function run(cmd, args, opts = {}) {
     return new Promise((resolve, reject) => {
         const p = spawn(cmd, args, { stdio: ['ignore', 'pipe', 'pipe'], ...opts });
@@ -123,38 +166,47 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
     const data = await res.json();
     return data.choices?.[0]?.message?.content?.trim() || '';
 }
-// Phase 1 (borrowed from AI_Video_Clone): extract the recurring character
-// cast ONCE from frames sampled across the whole video + the transcript.
-// Returns a master-cast prompt block in the exact [AESTHETIC & STYLE] /
-// [CHARACTER CAST LIST] format that channel-manager's ScriptImporter parses,
-// plus a structured characters[] list. Doing this upfront (a) populates
-// idea.master_cast_prompt so FlowKit has a reference sheet to generate, and
-// (b) gives every per-scene describe call a consistent naming vocabulary so
-// scene 1 and scene 50 refer to "THE BISHOP" instead of "a man in a suit".
-async function visionExtractMasterCast(frames, transcript, videoStyle) {
-    const empty = { master_cast_prompt: '', characters: [] };
+// Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
+// frames sampled across the whole video + transcript that returns, together:
+//   - video_summary: a cinematic paragraph of the whole story (drives the
+//     downstream script-generation prompt — the tool's biggest edge)
+//   - video_style:   a rich 3-4 sentence aesthetic analysis (medium, palette,
+//     lighting, camera language) — replaces the old terse 1-2 sentence
+//     visionExtractStyle gpt-4o-mini call entirely
+//   - characters[]:  the recurring cast for the [CHARACTER CAST LIST] block
+// Folding all three into one call is cheaper than the previous two calls
+// (style + cast) AND uses gpt-4o for style (was gpt-4o-mini). The master-cast
+// prompt is assembled here in the exact format ScriptImporter parses.
+async function visionExtractPhase1(frames, transcript) {
+    const empty = {
+        video_summary: '',
+        video_style: '',
+        master_cast_prompt: '',
+        characters: [],
+    };
     if (!OPENAI_KEY || frames.length === 0)
         return empty;
     try {
         const content = [
             {
                 type: 'text',
-                text: `Act as a Master Film Director. These frames are sampled across an entire video. Identify EVERY recurring character/subject (people, anthropomorphic objects, animals, mascots).
+                text: `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
-Transcript context (may name characters): "${(transcript || '').slice(0, 1500)}"
+Transcript context: "${(transcript || '').slice(0, 4000)}"
 Return ONLY a JSON object, no markdown fences:
 {
+  "video_summary": "One detailed cinematic paragraph (5-8 sentences, English) telling the WHOLE story start to finish: setup, key beats, climax, resolution. This is the narrative spine — be specific about what happens.",
+  "video_style": "3-4 sentences (English): artistic medium (2D/3D/live-action/CGI), color palette, lighting, camera language, overall aesthetic vibe. Cinematic, specific.",
   "characters": [
     { "name": "SHORT_UPPERCASE_LABEL", "description": "one-line English visual description: age/build, face, hair, outfit, colors, distinguishing features" }
   ]
 }
 Rules:
-- name: a stable short uppercase label you will reuse for this subject (e.g. "THE BISHOP", "RED CAR", "NARRATOR DOG"). Max 4 words.
-- Only RECURRING subjects worth a reference sheet. Skip one-off background extras.
-- description: ENGLISH only, factual, no camera/action words.
-- Max 6 characters.`,
+- characters.name: a stable short uppercase label reused for this subject (e.g. "THE BISHOP", "U-94 SUBMARINE"). Max 4 words.
+- Only RECURRING subjects worth a reference sheet. Skip one-off extras. Max 6.
+- characters.description: ENGLISH only, factual, no camera/action words.`,
             },
         ];
         for (const b64 of frames) {
@@ -163,16 +215,16 @@ Rules:
         const res = await fetch('https://api.openai.com/v1/chat/completions', {
             method: 'POST',
             headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
-            body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1200, messages: [{ role: 'user', content }] }),
+            body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
         });
         if (!res.ok)
             return empty;
         const data = await res.json();
-        const raw = (data.choices?.[0]?.message?.content || '').trim();
+        const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
         let parsed = {};
         try {
-            const m = raw.match(/\{[\s\S]*\}/);
-            parsed = JSON.parse(m ? m[0] : raw);
+            const m = rawTxt.match(/\{[\s\S]*\}/);
+            parsed = JSON.parse(m ? m[0] : rawTxt);
         }
         catch {
             return empty;
@@ -181,57 +233,63 @@ Rules:
             .filter(c => c && c.name && c.description)
             .slice(0, 6)
             .map(c => ({ name: String(c.name).trim(), description: String(c.description).trim() }));
-        if (characters.length === 0)
-            return empty;
-        // Assemble the verbatim-style master cast block ScriptImporter expects.
-        const styleLine = (videoStyle || '').trim() || 'Keep the original video’s visual style, color grading, and lighting.';
-        const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
-        const master_cast_prompt = `[AESTHETIC & STYLE]\n${styleLine}\n` +
-            `[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
-            `[CHARACTER CAST LIST]\n${castList}\n` +
-            `[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
-        return { master_cast_prompt, characters };
-    }
-    catch {
-        return empty;
-    }
-}
-async function visionExtractStyle(frames) {
-    if (!OPENAI_KEY || frames.length === 0)
-        return '';
-    try {
-        const content = [
-            { type: 'text', text: 'Analyze these frames from a video and extract a concise visual style description (1-2 sentences). Focus on: animation style (cartoon, realistic, anime, etc.), color palette, lighting, character design approach (anthropomorphized objects, real people, etc.), and overall aesthetic.\n\nReturn ONLY the style description, nothing else.' },
-        ];
-        for (const b64 of frames) {
-            content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
+        const video_summary = (parsed.video_summary || '').trim();
+        const video_style = (parsed.video_style || '').trim();
+        let master_cast_prompt = '';
+        if (characters.length > 0) {
+            const styleLine = video_style || 'Keep the original video’s visual style, color grading, and lighting.';
+            const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
+            master_cast_prompt =
+                `[AESTHETIC & STYLE]\n${styleLine}\n` +
+                    `[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
+                    `[CHARACTER CAST LIST]\n${castList}\n` +
+                    `[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
         }
-        const res = await fetch('https://api.openai.com/v1/chat/completions', {
-            method: 'POST',
-            headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
-            body: JSON.stringify({ model: 'gpt-4o-mini', max_tokens: 200, messages: [{ role: 'user', content }] }),
-        });
-        if (!res.ok)
-            return '';
-        const data = await res.json();
-        return (data.choices?.[0]?.message?.content || '').trim().replace(/\*\*/g, '');
+        return { video_summary, video_style, master_cast_prompt, characters };
     }
     catch {
-        return '';
+        return empty;
     }
 }
 export async function analyzeVideo(url, onProgress) {
     const progress = onProgress || (() => { });
     const tmpDir = path.join(os.tmpdir(), 'tuna-analyze-' + crypto.randomBytes(6).toString('hex'));
     await fs.mkdir(tmpDir, { recursive: true });
-    const videoPath = path.join(tmpDir, 'video.mp4');
+    // Video lives in the persistent URL-keyed cache (NOT tmpDir) so re-analyze
+    // reuses it. Only audio/frames are per-run + cleaned up in `finally`.
+    const urlHash = crypto.createHash('sha1').update(url).digest('hex');
+    const videoPath = path.join(CACHE_DIR, `${urlHash}.mp4`);
     const audioPath = path.join(tmpDir, 'audio.mp3');
     const framesDir = path.join(tmpDir, 'frames');
     await fs.mkdir(framesDir, { recursive: true });
     try {
-        progress('Đang tải video...');
-        console.log('[analyze_video] Downloading:', url);
-        await run(YT_DLP, ['-f', 'best[height<=720]/best', '-o', videoPath, '--no-playlist', '--quiet', url]);
+        await pruneVideoCache();
+        const cached = await fs.stat(videoPath).then(st => st.isFile() && st.size > 0).catch(() => false);
+        if (cached) {
+            progress('Dùng video đã tải (cache)...');
+            console.log('[analyze_video] Cache HIT:', videoPath);
+            // Bump mtime so an actively re-analyzed video isn't evicted by age.
+            try {
+                const now = new Date();
+                await fs.utimes(videoPath, now, now);
+            }
+            catch { /* ignore */ }
+        }
+        else {
+            progress('Đang tải video...');
+            console.log('[analyze_video] Cache MISS, downloading:', url);
+            // Download to a temp name then atomically rename in, so a concurrent
+            // analyze of the same URL never reads a half-written file.
+            const dlTmp = path.join(CACHE_DIR, `${urlHash}.dl-${crypto.randomBytes(4).toString('hex')}.mp4`);
+            try {
+                await run(YT_DLP, ['-f', 'best[height<=720]/best', '-o', dlTmp, '--no-playlist', '--quiet', url]);
+                await fs.rename(dlTmp, videoPath);
+            }
+            catch (e) {
+                await fs.rm(dlTmp, { force: true }).catch(() => { });
+                throw e;
+            }
+        }
         // Grab the original video title (metadata only, no extra download) so the
         // clone idea gets a real name instead of "Clone: www.youtube.com".
         let source_title = '';
@@ -376,25 +434,22 @@ export async function analyzeVideo(url, onProgress) {
                 frameBuffers.push({ idx: i, frames: buffers, thumb: thumb || buffers[0], slot });
             }
         }
-        // Step 2: Style + Master Cast FIRST (Phase 1), so per-scene describe can
-        // reuse consistent character labels (the AI_Video_Clone lesson).
-        progress('Đang phân tích video style...');
-        const styleSamples = frameBuffers.slice(0, 3).map(f => f.thumb.toString('base64'));
-        const video_style = await visionExtractStyle(styleSamples);
-        console.log('[analyze_video] Video style:', video_style.substring(0, 100));
-        progress('Đang trích xuất dàn nhân vật (Master Cast)...');
-        // Sample up to 12 frames evenly across the whole video for cast detection.
-        const castSampleCount = Math.min(12, frameBuffers.length);
-        const castStep = Math.max(1, Math.floor(frameBuffers.length / castSampleCount));
-        const castSamples = frameBuffers
-            .filter((_, i) => i % castStep === 0)
-            .slice(0, castSampleCount)
+        // Step 2: Phase 1 — ONE gpt-4o call returning summary + rich style +
+        // master cast + characters. Runs before per-scene describe so the cast
+        // context keeps naming consistent across the whole timeline.
+        progress('Đang phân tích tổng thể (summary + style + master cast)...');
+        // Sample up to 12 frames evenly across the whole video.
+        const p1SampleCount = Math.min(12, frameBuffers.length);
+        const p1Step = Math.max(1, Math.floor(frameBuffers.length / p1SampleCount));
+        const p1Samples = frameBuffers
+            .filter((_, i) => i % p1Step === 0)
+            .slice(0, p1SampleCount)
             .map(f => f.thumb.toString('base64'));
-        const { master_cast_prompt, characters } = await visionExtractMasterCast(castSamples, transcript.text || '', video_style);
+        const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '');
         const castContext = characters.length
             ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
             : '';
-        console.log('[analyze_video] Master cast:', characters.map(c => c.name).join(', ') || '(none)');
+        console.log('[analyze_video] Phase1 — style:', video_style.slice(0, 80), '| cast:', characters.map(c => c.name).join(', ') || '(none)', '| summary:', video_summary.length, 'chars');
         // Step 3: Vision describe all frames in parallel (batch of 5), passing the
         // cast context so naming stays consistent across the whole timeline.
         progress(`Đang phân tích ${frameBuffers.length} scenes song song...`);
@@ -429,6 +484,7 @@ export async function analyzeVideo(url, onProgress) {
             duration_sec: Math.round(durationSec),
             language: transcript.language || 'unknown',
             transcript: transcript.text || '',
+            summary: video_summary,
             video_style,
             master_cast_prompt,
             characters,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "tuna-agent",
-  "version": "0.1.137",
+  "version": "0.1.139",
   "description": "Tuna Agent - Run AI coding tasks on your machine",
   "bin": {
     "tuna-agent": "dist/cli/index.js"