tuna-agent 0.1.137 → 0.1.139

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,7 @@ export interface AnalyzeVideoResult {
11
11
  duration_sec: number;
12
12
  language: string;
13
13
  transcript: string;
14
+ summary: string;
14
15
  video_style: string;
15
16
  master_cast_prompt: string;
16
17
  characters: Array<{
@@ -14,6 +14,49 @@ const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
14
14
  const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
15
15
  const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
16
16
  const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
17
// Downloaded source videos are cached by URL hash so re-analyze doesn't
// re-download (saves bandwidth + time on long clips). relabs01 shares disk
// with Demucs + the local media server, so the cache is bounded: drop files
// older than 7 days, then if the total still exceeds 15 GB evict oldest-first.
const CACHE_DIR = path.join(os.homedir(), '.tuna-analyze-cache');
const CACHE_MAX_AGE_MS = 7 * 24 * 3600 * 1000;
const CACHE_MAX_BYTES = 15 * 1024 * 1024 * 1024;
/**
 * Best-effort pruning of the video cache directory.
 *
 * Pass 1 removes every regular file whose mtime is more than `maxAgeMs`
 * before `now`. Pass 2 runs only if the remaining files still add up to more
 * than `maxBytes`: files are then removed oldest-mtime-first until the total
 * is at or below the limit. Sub-directories and other non-regular entries are
 * never touched and never counted.
 *
 * The directory is created (recursively) if it does not exist yet.
 *
 * @param options Optional overrides; calling with no argument uses the
 *   module-level cache constants.
 * @param options.dir Directory to prune. Defaults to `CACHE_DIR`.
 * @param options.maxAgeMs Maximum file age in milliseconds. Defaults to `CACHE_MAX_AGE_MS`.
 * @param options.maxBytes Size budget in bytes for the whole directory. Defaults to `CACHE_MAX_BYTES`.
 * @param options.now Reference time in epoch milliseconds. Defaults to `Date.now()`.
 * @returns A promise that resolves to `undefined`. It never rejects: any
 *   failure is logged as a warning so that pruning cannot block analysis.
 */
async function pruneVideoCache({ dir = CACHE_DIR, maxAgeMs = CACHE_MAX_AGE_MS, maxBytes = CACHE_MAX_BYTES, now = Date.now(), } = {}) {
    try {
        await fs.mkdir(dir, { recursive: true });
        const names = await fs.readdir(dir);
        // Stat every entry concurrently; an entry that vanished between readdir
        // and stat (another run deleted it) simply maps to null.
        const entries = await Promise.all(names.map(async (name) => {
            const file = path.join(dir, name);
            try {
                const st = await fs.stat(file);
                return st.isFile() ? { file, size: st.size, mtime: st.mtimeMs } : null;
            }
            catch {
                return null;
            }
        }));
        // Pass 1: age-based eviction. Expired files are not carried into the size pass.
        const survivors = [];
        for (const entry of entries) {
            if (!entry)
                continue;
            if (now - entry.mtime > maxAgeMs) {
                try {
                    await fs.rm(entry.file, { force: true });
                }
                catch { /* race with another run deleting it — ignore */ }
                continue;
            }
            survivors.push(entry);
        }
        // Pass 2: size-based eviction, oldest first, only while over budget.
        let total = survivors.reduce((sum, entry) => sum + entry.size, 0);
        if (total <= maxBytes)
            return;
        survivors.sort((a, b) => a.mtime - b.mtime);
        for (const entry of survivors) {
            if (total <= maxBytes)
                break;
            try {
                await fs.rm(entry.file, { force: true });
                total -= entry.size;
            }
            catch { /* ignore; the next-oldest file is tried instead */ }
        }
    }
    catch (err) {
        // Cache pruning is best-effort and must never block analysis, but a
        // failure here (e.g. the cache dir cannot be created) is worth surfacing.
        console.warn('[analyze_video] Cache prune skipped:', err instanceof Error ? err.message : err);
    }
}
17
60
  function run(cmd, args, opts = {}) {
18
61
  return new Promise((resolve, reject) => {
19
62
  const p = spawn(cmd, args, { stdio: ['ignore', 'pipe', 'pipe'], ...opts });
@@ -123,38 +166,47 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
123
166
  const data = await res.json();
124
167
  return data.choices?.[0]?.message?.content?.trim() || '';
125
168
  }
126
- // Phase 1 (borrowed from AI_Video_Clone): extract the recurring character
127
- // cast ONCE from frames sampled across the whole video + the transcript.
128
- // Returns a master-cast prompt block in the exact [AESTHETIC & STYLE] /
129
- // [CHARACTER CAST LIST] format that channel-manager's ScriptImporter parses,
130
- // plus a structured characters[] list. Doing this upfront (a) populates
131
- // idea.master_cast_prompt so FlowKit has a reference sheet to generate, and
132
- // (b) gives every per-scene describe call a consistent naming vocabulary so
133
- // scene 1 and scene 50 refer to "THE BISHOP" instead of "a man in a suit".
134
- async function visionExtractMasterCast(frames, transcript, videoStyle) {
135
- const empty = { master_cast_prompt: '', characters: [] };
169
+ // Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
170
+ // frames sampled across the whole video + transcript that returns, together:
171
+ // - video_summary: a cinematic paragraph of the whole story (drives the
172
+ // downstream script-generation prompt — the tool's biggest edge)
173
+ // - video_style: a rich 3-4 sentence aesthetic analysis (medium, palette,
174
+ // lighting, camera language) — replaces the old terse 1-2 sentence
175
+ // visionExtractStyle gpt-4o-mini call entirely
176
+ // - characters[]: the recurring cast for the [CHARACTER CAST LIST] block
177
+ // Folding all three into one call is cheaper than the previous two calls
178
+ // (style + cast) AND uses gpt-4o for style (was gpt-4o-mini). The master-cast
179
+ // prompt is assembled here in the exact format ScriptImporter parses.
180
+ async function visionExtractPhase1(frames, transcript) {
181
+ const empty = {
182
+ video_summary: '',
183
+ video_style: '',
184
+ master_cast_prompt: '',
185
+ characters: [],
186
+ };
136
187
  if (!OPENAI_KEY || frames.length === 0)
137
188
  return empty;
138
189
  try {
139
190
  const content = [
140
191
  {
141
192
  type: 'text',
142
- text: `Act as a Master Film Director. These frames are sampled across an entire video. Identify EVERY recurring character/subject (people, anthropomorphic objects, animals, mascots).
193
+ text: `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
143
194
 
144
- Transcript context (may name characters): "${(transcript || '').slice(0, 1500)}"
195
+ Transcript context: "${(transcript || '').slice(0, 4000)}"
145
196
 
146
197
  Return ONLY a JSON object, no markdown fences:
147
198
  {
199
+ "video_summary": "One detailed cinematic paragraph (5-8 sentences, English) telling the WHOLE story start to finish: setup, key beats, climax, resolution. This is the narrative spine — be specific about what happens.",
200
+ "video_style": "3-4 sentences (English): artistic medium (2D/3D/live-action/CGI), color palette, lighting, camera language, overall aesthetic vibe. Cinematic, specific.",
148
201
  "characters": [
149
202
  { "name": "SHORT_UPPERCASE_LABEL", "description": "one-line English visual description: age/build, face, hair, outfit, colors, distinguishing features" }
150
203
  ]
151
204
  }
152
205
 
153
206
  Rules:
154
- - name: a stable short uppercase label you will reuse for this subject (e.g. "THE BISHOP", "RED CAR", "NARRATOR DOG"). Max 4 words.
155
- - Only RECURRING subjects worth a reference sheet. Skip one-off background extras.
156
- - description: ENGLISH only, factual, no camera/action words.
157
- - Max 6 characters.`,
207
+ - characters.name: a stable short uppercase label reused for this subject (e.g. "THE BISHOP", "U-94 SUBMARINE"). Max 4 words.
208
+ - Only RECURRING subjects worth a reference sheet. Skip one-off extras. Max 6.
209
+ - characters.description: ENGLISH only, factual, no camera/action words.`,
158
210
  },
159
211
  ];
160
212
  for (const b64 of frames) {
@@ -163,16 +215,16 @@ Rules:
163
215
  const res = await fetch('https://api.openai.com/v1/chat/completions', {
164
216
  method: 'POST',
165
217
  headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
166
- body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1200, messages: [{ role: 'user', content }] }),
218
+ body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
167
219
  });
168
220
  if (!res.ok)
169
221
  return empty;
170
222
  const data = await res.json();
171
- const raw = (data.choices?.[0]?.message?.content || '').trim();
223
+ const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
172
224
  let parsed = {};
173
225
  try {
174
- const m = raw.match(/\{[\s\S]*\}/);
175
- parsed = JSON.parse(m ? m[0] : raw);
226
+ const m = rawTxt.match(/\{[\s\S]*\}/);
227
+ parsed = JSON.parse(m ? m[0] : rawTxt);
176
228
  }
177
229
  catch {
178
230
  return empty;
@@ -181,57 +233,63 @@ Rules:
181
233
  .filter(c => c && c.name && c.description)
182
234
  .slice(0, 6)
183
235
  .map(c => ({ name: String(c.name).trim(), description: String(c.description).trim() }));
184
- if (characters.length === 0)
185
- return empty;
186
- // Assemble the verbatim-style master cast block ScriptImporter expects.
187
- const styleLine = (videoStyle || '').trim() || 'Keep the original video’s visual style, color grading, and lighting.';
188
- const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
189
- const master_cast_prompt = `[AESTHETIC & STYLE]\n${styleLine}\n` +
190
- `[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
191
- `[CHARACTER CAST LIST]\n${castList}\n` +
192
- `[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
193
- return { master_cast_prompt, characters };
194
- }
195
- catch {
196
- return empty;
197
- }
198
- }
199
- async function visionExtractStyle(frames) {
200
- if (!OPENAI_KEY || frames.length === 0)
201
- return '';
202
- try {
203
- const content = [
204
- { type: 'text', text: 'Analyze these frames from a video and extract a concise visual style description (1-2 sentences). Focus on: animation style (cartoon, realistic, anime, etc.), color palette, lighting, character design approach (anthropomorphized objects, real people, etc.), and overall aesthetic.\n\nReturn ONLY the style description, nothing else.' },
205
- ];
206
- for (const b64 of frames) {
207
- content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
236
+ const video_summary = (parsed.video_summary || '').trim();
237
+ const video_style = (parsed.video_style || '').trim();
238
+ let master_cast_prompt = '';
239
+ if (characters.length > 0) {
240
+ const styleLine = video_style || 'Keep the original video’s visual style, color grading, and lighting.';
241
+ const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
242
+ master_cast_prompt =
243
+ `[AESTHETIC & STYLE]\n${styleLine}\n` +
244
+ `[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
245
+ `[CHARACTER CAST LIST]\n${castList}\n` +
246
+ `[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
208
247
  }
209
- const res = await fetch('https://api.openai.com/v1/chat/completions', {
210
- method: 'POST',
211
- headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
212
- body: JSON.stringify({ model: 'gpt-4o-mini', max_tokens: 200, messages: [{ role: 'user', content }] }),
213
- });
214
- if (!res.ok)
215
- return '';
216
- const data = await res.json();
217
- return (data.choices?.[0]?.message?.content || '').trim().replace(/\*\*/g, '');
248
+ return { video_summary, video_style, master_cast_prompt, characters };
218
249
  }
219
250
  catch {
220
- return '';
251
+ return empty;
221
252
  }
222
253
  }
223
254
  export async function analyzeVideo(url, onProgress) {
224
255
  const progress = onProgress || (() => { });
225
256
  const tmpDir = path.join(os.tmpdir(), 'tuna-analyze-' + crypto.randomBytes(6).toString('hex'));
226
257
  await fs.mkdir(tmpDir, { recursive: true });
227
- const videoPath = path.join(tmpDir, 'video.mp4');
258
+ // Video lives in the persistent URL-keyed cache (NOT tmpDir) so re-analyze
259
+ // reuses it. Only audio/frames are per-run + cleaned up in `finally`.
260
+ const urlHash = crypto.createHash('sha1').update(url).digest('hex');
261
+ const videoPath = path.join(CACHE_DIR, `${urlHash}.mp4`);
228
262
  const audioPath = path.join(tmpDir, 'audio.mp3');
229
263
  const framesDir = path.join(tmpDir, 'frames');
230
264
  await fs.mkdir(framesDir, { recursive: true });
231
265
  try {
232
- progress('Đang tải video...');
233
- console.log('[analyze_video] Downloading:', url);
234
- await run(YT_DLP, ['-f', 'best[height<=720]/best', '-o', videoPath, '--no-playlist', '--quiet', url]);
266
+ await pruneVideoCache();
267
+ const cached = await fs.stat(videoPath).then(st => st.isFile() && st.size > 0).catch(() => false);
268
+ if (cached) {
269
+ progress('Dùng video đã tải (cache)...');
270
+ console.log('[analyze_video] Cache HIT:', videoPath);
271
+ // Bump mtime so an actively re-analyzed video isn't evicted by age.
272
+ try {
273
+ const now = new Date();
274
+ await fs.utimes(videoPath, now, now);
275
+ }
276
+ catch { /* ignore */ }
277
+ }
278
+ else {
279
+ progress('Đang tải video...');
280
+ console.log('[analyze_video] Cache MISS, downloading:', url);
281
+ // Download to a temp name then atomically rename in, so a concurrent
282
+ // analyze of the same URL never reads a half-written file.
283
+ const dlTmp = path.join(CACHE_DIR, `${urlHash}.dl-${crypto.randomBytes(4).toString('hex')}.mp4`);
284
+ try {
285
+ await run(YT_DLP, ['-f', 'best[height<=720]/best', '-o', dlTmp, '--no-playlist', '--quiet', url]);
286
+ await fs.rename(dlTmp, videoPath);
287
+ }
288
+ catch (e) {
289
+ await fs.rm(dlTmp, { force: true }).catch(() => { });
290
+ throw e;
291
+ }
292
+ }
235
293
  // Grab the original video title (metadata only, no extra download) so the
236
294
  // clone idea gets a real name instead of "Clone: www.youtube.com".
237
295
  let source_title = '';
@@ -376,25 +434,22 @@ export async function analyzeVideo(url, onProgress) {
376
434
  frameBuffers.push({ idx: i, frames: buffers, thumb: thumb || buffers[0], slot });
377
435
  }
378
436
  }
379
- // Step 2: Style + Master Cast FIRST (Phase 1), so per-scene describe can
380
- // reuse consistent character labels (the AI_Video_Clone lesson).
381
- progress('Đang phân tích video style...');
382
- const styleSamples = frameBuffers.slice(0, 3).map(f => f.thumb.toString('base64'));
383
- const video_style = await visionExtractStyle(styleSamples);
384
- console.log('[analyze_video] Video style:', video_style.substring(0, 100));
385
- progress('Đang trích xuất dàn nhân vật (Master Cast)...');
386
- // Sample up to 12 frames evenly across the whole video for cast detection.
387
- const castSampleCount = Math.min(12, frameBuffers.length);
388
- const castStep = Math.max(1, Math.floor(frameBuffers.length / castSampleCount));
389
- const castSamples = frameBuffers
390
- .filter((_, i) => i % castStep === 0)
391
- .slice(0, castSampleCount)
437
+ // Step 2: Phase 1 — ONE gpt-4o call returning summary + rich style +
438
+ // master cast + characters. Runs before per-scene describe so the cast
439
+ // context keeps naming consistent across the whole timeline.
440
+ progress('Đang phân tích tổng thể (summary + style + master cast)...');
441
+ // Sample up to 12 frames evenly across the whole video.
442
+ const p1SampleCount = Math.min(12, frameBuffers.length);
443
+ const p1Step = Math.max(1, Math.floor(frameBuffers.length / p1SampleCount));
444
+ const p1Samples = frameBuffers
445
+ .filter((_, i) => i % p1Step === 0)
446
+ .slice(0, p1SampleCount)
392
447
  .map(f => f.thumb.toString('base64'));
393
- const { master_cast_prompt, characters } = await visionExtractMasterCast(castSamples, transcript.text || '', video_style);
448
+ const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '');
394
449
  const castContext = characters.length
395
450
  ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
396
451
  : '';
397
- console.log('[analyze_video] Master cast:', characters.map(c => c.name).join(', ') || '(none)');
452
+ console.log('[analyze_video] Phase1 — style:', video_style.slice(0, 80), '| cast:', characters.map(c => c.name).join(', ') || '(none)', '| summary:', video_summary.length, 'chars');
398
453
  // Step 3: Vision describe all frames in parallel (batch of 5), passing the
399
454
  // cast context so naming stays consistent across the whole timeline.
400
455
  progress(`Đang phân tích ${frameBuffers.length} scenes song song...`);
@@ -429,6 +484,7 @@ export async function analyzeVideo(url, onProgress) {
429
484
  duration_sec: Math.round(durationSec),
430
485
  language: transcript.language || 'unknown',
431
486
  transcript: transcript.text || '',
487
+ summary: video_summary,
432
488
  video_style,
433
489
  master_cast_prompt,
434
490
  characters,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.137",
3
+ "version": "0.1.139",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"