tuna-agent 0.1.134 → 0.1.136

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@
7
7
  */
8
8
  import { AgentWebSocketClient } from './ws-client.js';
9
9
  export interface AnalyzeVideoResult {
10
+ source_title: string;
10
11
  duration_sec: number;
11
12
  language: string;
12
13
  transcript: string;
@@ -74,31 +74,46 @@ ${rawText}`,
74
74
  return rawText;
75
75
  }
76
76
  }
77
- async function visionDescribe(frameB64, voiceoverText, castContext = '') {
77
+ // Accepts 1..N frames sampled across a scene (start, middle, end). Multiple
78
+ // frames let the model observe MOTION direction ("walks left-to-right then
79
+ // turns") instead of guessing from a single frozen midpoint. The model is
80
+ // told the frames are chronological so it describes the action arc, not 3
81
+ // separate moments.
82
+ async function visionDescribe(frameB64s, voiceoverText, castContext = '') {
78
83
  if (!OPENAI_KEY)
79
84
  return '';
85
+ const frames = frameB64s.filter(Boolean);
86
+ if (frames.length === 0)
87
+ return '';
80
88
  const castBlock = castContext
81
89
  ? `\n\nKNOWN CHARACTER CAST (reuse these EXACT names when a subject appears — do NOT invent new labels for the same subject):\n${castContext}\n`
82
90
  : '';
91
+ const seqNote = frames.length > 1
92
+ ? ` The ${frames.length} images are CHRONOLOGICAL samples from the SAME scene (start → middle → end). Treat them as one continuous shot: describe the MOTION ARC across them (direction of movement, camera push/pan, what changes from first to last frame), not three separate moments.`
93
+ : '';
94
+ const imageParts = frames.map(b64 => ({
95
+ type: 'image_url',
96
+ image_url: { url: `data:image/jpeg;base64,${b64}` },
97
+ }));
83
98
  const res = await fetch('https://api.openai.com/v1/chat/completions', {
84
99
  method: 'POST',
85
100
  headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
86
101
  body: JSON.stringify({
87
102
  model: 'gpt-4o-mini',
88
- max_tokens: 300,
103
+ max_tokens: 350,
89
104
  messages: [{
90
105
  role: 'user',
91
106
  content: [
92
- { type: 'text', text: `Describe this frame in detail (4-6 sentences, English).${castBlock} Include:
107
+ { type: 'text', text: `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
93
108
  - Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
94
109
  - Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
95
110
  - Spatial positions: exact position of each character (left/right/above/below/center), distance between them
96
111
  - Environment: setting, lighting, color palette, atmosphere
97
- - Camera: angle, framing (close-up, wide, etc.)
98
- - Action: what is happening in this moment, movement direction
112
+ - Camera: angle, framing (close-up, wide, etc.), and any camera movement across the frames
113
+ - Action: the movement/action arc from first to last frame (direction, what changes)
99
114
 
100
- Voiceover at this moment: "${voiceoverText || '(none)'}"` },
101
- { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${frameB64}` } },
115
+ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
116
+ ...imageParts,
102
117
  ],
103
118
  }],
104
119
  }),
@@ -217,6 +232,14 @@ export async function analyzeVideo(url, onProgress) {
217
232
  progress('Đang tải video...');
218
233
  console.log('[analyze_video] Downloading:', url);
219
234
  await run(YT_DLP, ['-f', 'best[height<=720]/best', '-o', videoPath, '--no-playlist', '--quiet', url]);
235
+ // Grab the original video title (metadata only, no extra download) so the
236
+ // clone idea gets a real name instead of "Clone: www.youtube.com".
237
+ let source_title = '';
238
+ try {
239
+ const t = await run(YT_DLP, ['--skip-download', '--no-warnings', '--no-playlist', '--print', '%(title)s', url]);
240
+ source_title = (t.out || '').trim().split('\n')[0].slice(0, 200);
241
+ }
242
+ catch { /* title is best-effort — analysis still proceeds without it */ }
220
243
  progress('Đang tách audio...');
221
244
  console.log('[analyze_video] Extracting audio');
222
245
  await run(FFMPEG, ['-y', '-i', videoPath, '-vn', '-ar', '16000', '-ac', '1', '-b:a', '64k', audioPath, '-loglevel', 'error']);
@@ -298,26 +321,47 @@ export async function analyzeVideo(url, onProgress) {
298
321
  const finalSlots = sceneSlots.slice(0, MAX_SCENES);
299
322
  progress(`Đang cắt ${finalSlots.length} frames và phân tích...`);
300
323
  console.log('[analyze_video] Building', finalSlots.length, 'scenes (segments:', segments.length, ', duration:', durationSec, 's)');
301
- // Step 1: Extract all frames sequentially (ffmpeg can't run in parallel on same file efficiently)
324
+ // Step 1: Extract frames sequentially. Per scene we grab 3 chronological
325
+ // frames — start → middle → end — so the vision model can read the motion
326
+ // arc (direction of movement, camera push) instead of guessing from a
327
+ // single frozen midpoint. The MIDDLE frame doubles as the UI thumbnail.
328
+ // Tiny scenes (<1.5s) collapse to just the midpoint (the 3 frames would
329
+ // be near-identical — no motion info, wasted tokens). Start/end are
330
+ // nudged inward by 15% of the scene length (capped at 0.3s) to dodge hard-cut / black transition frames.
302
331
  const frameBuffers = [];
303
332
  for (let i = 0; i < finalSlots.length; i++) {
304
333
  const slot = finalSlots[i];
305
- const midpoint = (slot.start + slot.end) / 2;
306
- const framePath = path.join(framesDir, `scene-${String(i).padStart(3, '0')}.jpg`);
307
- try {
308
- await run(FFMPEG, ['-y', '-ss', String(midpoint), '-i', videoPath, '-vframes', '1', '-vf', 'scale=640:-1', '-q:v', '5', framePath, '-loglevel', 'error']);
309
- const buf = await fs.readFile(framePath);
310
- frameBuffers.push({ idx: i, buf, slot });
334
+ const span = slot.end - slot.start;
335
+ const mid = (slot.start + slot.end) / 2;
336
+ const inset = Math.min(0.3, span * 0.15);
337
+ const stamps = span < 1.5
338
+ ? [mid]
339
+ : [slot.start + inset, mid, slot.end - inset];
340
+ const buffers = [];
341
+ let thumb = null;
342
+ for (let k = 0; k < stamps.length; k++) {
343
+ const framePath = path.join(framesDir, `scene-${String(i).padStart(3, '0')}-${k}.jpg`);
344
+ try {
345
+ await run(FFMPEG, ['-y', '-ss', String(Math.max(0, stamps[k])), '-i', videoPath, '-vframes', '1', '-vf', 'scale=640:-1', '-q:v', '5', framePath, '-loglevel', 'error']);
346
+ const buf = await fs.readFile(framePath);
347
+ buffers.push(buf);
348
+ // Middle frame = thumbnail (index 1 when 3 frames, index 0 when 1).
349
+ if (k === Math.floor(stamps.length / 2))
350
+ thumb = buf;
351
+ }
352
+ catch (err) {
353
+ const msg = err instanceof Error ? err.message : String(err);
354
+ console.warn('[analyze_video] Frame extract failed for scene', i, 'frame', k, msg);
355
+ }
311
356
  }
312
- catch (err) {
313
- const msg = err instanceof Error ? err.message : String(err);
314
- console.warn('[analyze_video] Frame extract failed for scene', i, msg);
357
+ if (buffers.length) {
358
+ frameBuffers.push({ idx: i, frames: buffers, thumb: thumb || buffers[0], slot });
315
359
  }
316
360
  }
317
361
  // Step 2: Style + Master Cast FIRST (Phase 1), so per-scene describe can
318
362
  // reuse consistent character labels (the AI_Video_Clone lesson).
319
363
  progress('Đang phân tích video style...');
320
- const styleSamples = frameBuffers.slice(0, 3).map(f => f.buf.toString('base64'));
364
+ const styleSamples = frameBuffers.slice(0, 3).map(f => f.thumb.toString('base64'));
321
365
  const video_style = await visionExtractStyle(styleSamples);
322
366
  console.log('[analyze_video] Video style:', video_style.substring(0, 100));
323
367
  progress('Đang trích xuất dàn nhân vật (Master Cast)...');
@@ -327,7 +371,7 @@ export async function analyzeVideo(url, onProgress) {
327
371
  const castSamples = frameBuffers
328
372
  .filter((_, i) => i % castStep === 0)
329
373
  .slice(0, castSampleCount)
330
- .map(f => f.buf.toString('base64'));
374
+ .map(f => f.thumb.toString('base64'));
331
375
  const { master_cast_prompt, characters } = await visionExtractMasterCast(castSamples, transcript.text || '', video_style);
332
376
  const castContext = characters.length
333
377
  ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
@@ -341,14 +385,14 @@ export async function analyzeVideo(url, onProgress) {
341
385
  for (let b = 0; b < frameBuffers.length; b += BATCH_SIZE) {
342
386
  const batch = frameBuffers.slice(b, b + BATCH_SIZE);
343
387
  progress(`Đang phân tích scene ${b + 1}-${Math.min(b + BATCH_SIZE, frameBuffers.length)}/${frameBuffers.length}...`);
344
- const results = await Promise.all(batch.map(async ({ idx, buf, slot }) => {
388
+ const results = await Promise.all(batch.map(async ({ idx, frames, thumb, slot }) => {
345
389
  try {
346
- const visual_description = await visionDescribe(buf.toString('base64'), slot.voiceover, castContext);
390
+ const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext);
347
391
  return {
348
392
  scene_number: idx + 1,
349
393
  timestamp_start: Math.round(slot.start * 10) / 10,
350
394
  timestamp_end: Math.round(slot.end * 10) / 10,
351
- thumbnail_base64: buf.toString('base64'),
395
+ thumbnail_base64: thumb.toString('base64'),
352
396
  voiceover: slot.voiceover,
353
397
  visual_description,
354
398
  };
@@ -363,6 +407,7 @@ export async function analyzeVideo(url, onProgress) {
363
407
  }
364
408
  const scenes = sceneResults.sort((a, b) => a.scene_number - b.scene_number);
365
409
  return {
410
+ source_title,
366
411
  duration_sec: Math.round(durationSec),
367
412
  language: transcript.language || 'unknown',
368
413
  transcript: transcript.text || '',
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.134",
3
+ "version": "0.1.136",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"