tuna-agent 0.1.134 → 0.1.135

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -74,31 +74,46 @@ ${rawText}`,
74
74
  return rawText;
75
75
  }
76
76
  }
77
- async function visionDescribe(frameB64, voiceoverText, castContext = '') {
77
+ // Accepts 1..N frames sampled across a scene (start, middle, end). Multiple
78
+ // frames let the model observe MOTION direction ("walks left-to-right then
79
+ // turns") instead of guessing from a single frozen midpoint. The model is
80
+ // told the frames are chronological so it describes the action arc, not 3
81
+ // separate moments.
82
+ async function visionDescribe(frameB64s, voiceoverText, castContext = '') {
78
83
  if (!OPENAI_KEY)
79
84
  return '';
85
+ const frames = frameB64s.filter(Boolean);
86
+ if (frames.length === 0)
87
+ return '';
80
88
  const castBlock = castContext
81
89
  ? `\n\nKNOWN CHARACTER CAST (reuse these EXACT names when a subject appears — do NOT invent new labels for the same subject):\n${castContext}\n`
82
90
  : '';
91
+ const seqNote = frames.length > 1
92
+ ? ` The ${frames.length} images are CHRONOLOGICAL samples from the SAME scene (start → middle → end). Treat them as one continuous shot: describe the MOTION ARC across them (direction of movement, camera push/pan, what changes from first to last frame), not three separate moments.`
93
+ : '';
94
+ const imageParts = frames.map(b64 => ({
95
+ type: 'image_url',
96
+ image_url: { url: `data:image/jpeg;base64,${b64}` },
97
+ }));
83
98
  const res = await fetch('https://api.openai.com/v1/chat/completions', {
84
99
  method: 'POST',
85
100
  headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
86
101
  body: JSON.stringify({
87
102
  model: 'gpt-4o-mini',
88
- max_tokens: 300,
103
+ max_tokens: 350,
89
104
  messages: [{
90
105
  role: 'user',
91
106
  content: [
92
- { type: 'text', text: `Describe this frame in detail (4-6 sentences, English).${castBlock} Include:
107
+ { type: 'text', text: `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
93
108
  - Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
94
109
  - Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
95
110
  - Spatial positions: exact position of each character (left/right/above/below/center), distance between them
96
111
  - Environment: setting, lighting, color palette, atmosphere
97
- - Camera: angle, framing (close-up, wide, etc.)
98
- - Action: what is happening in this moment, movement direction
112
+ - Camera: angle, framing (close-up, wide, etc.), and any camera movement across the frames
113
+ - Action: the movement/action arc from first to last frame (direction, what changes)
99
114
 
100
- Voiceover at this moment: "${voiceoverText || '(none)'}"` },
101
- { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${frameB64}` } },
115
+ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
116
+ ...imageParts,
102
117
  ],
103
118
  }],
104
119
  }),
@@ -298,26 +313,47 @@ export async function analyzeVideo(url, onProgress) {
298
313
  const finalSlots = sceneSlots.slice(0, MAX_SCENES);
299
314
  progress(`Đang cắt ${finalSlots.length} frames và phân tích...`);
300
315
  console.log('[analyze_video] Building', finalSlots.length, 'scenes (segments:', segments.length, ', duration:', durationSec, 's)');
301
- // Step 1: Extract all frames sequentially (ffmpeg can't run in parallel on same file efficiently)
316
+ // Step 1: Extract frames sequentially. Per scene we grab 3 chronological
317
+ // frames — start → middle → end — so the vision model can read the motion
318
+ // arc (direction of movement, camera push) instead of guessing from a
319
+ // single frozen midpoint. The MIDDLE frame doubles as the UI thumbnail.
320
+ // Tiny scenes (<1.5s) collapse to just the midpoint (the 3 frames would
321
+ // be near-identical — no motion info, wasted tokens). Start/end are
322
+ // nudged inward by 15% of the scene span (capped at 0.3s) to dodge hard-cut / black transition frames.
302
323
  const frameBuffers = [];
303
324
  for (let i = 0; i < finalSlots.length; i++) {
304
325
  const slot = finalSlots[i];
305
- const midpoint = (slot.start + slot.end) / 2;
306
- const framePath = path.join(framesDir, `scene-${String(i).padStart(3, '0')}.jpg`);
307
- try {
308
- await run(FFMPEG, ['-y', '-ss', String(midpoint), '-i', videoPath, '-vframes', '1', '-vf', 'scale=640:-1', '-q:v', '5', framePath, '-loglevel', 'error']);
309
- const buf = await fs.readFile(framePath);
310
- frameBuffers.push({ idx: i, buf, slot });
326
+ const span = slot.end - slot.start;
327
+ const mid = (slot.start + slot.end) / 2;
328
+ const inset = Math.min(0.3, span * 0.15);
329
+ const stamps = span < 1.5
330
+ ? [mid]
331
+ : [slot.start + inset, mid, slot.end - inset];
332
+ const buffers = [];
333
+ let thumb = null;
334
+ for (let k = 0; k < stamps.length; k++) {
335
+ const framePath = path.join(framesDir, `scene-${String(i).padStart(3, '0')}-${k}.jpg`);
336
+ try {
337
+ await run(FFMPEG, ['-y', '-ss', String(Math.max(0, stamps[k])), '-i', videoPath, '-vframes', '1', '-vf', 'scale=640:-1', '-q:v', '5', framePath, '-loglevel', 'error']);
338
+ const buf = await fs.readFile(framePath);
339
+ buffers.push(buf);
340
+ // Middle frame = thumbnail (index 1 when 3 frames, index 0 when 1).
341
+ if (k === Math.floor(stamps.length / 2))
342
+ thumb = buf;
343
+ }
344
+ catch (err) {
345
+ const msg = err instanceof Error ? err.message : String(err);
346
+ console.warn('[analyze_video] Frame extract failed for scene', i, 'frame', k, msg);
347
+ }
311
348
  }
312
- catch (err) {
313
- const msg = err instanceof Error ? err.message : String(err);
314
- console.warn('[analyze_video] Frame extract failed for scene', i, msg);
349
+ if (buffers.length) {
350
+ frameBuffers.push({ idx: i, frames: buffers, thumb: thumb || buffers[0], slot });
315
351
  }
316
352
  }
317
353
  // Step 2: Style + Master Cast FIRST (Phase 1), so per-scene describe can
318
354
  // reuse consistent character labels (the AI_Video_Clone lesson).
319
355
  progress('Đang phân tích video style...');
320
- const styleSamples = frameBuffers.slice(0, 3).map(f => f.buf.toString('base64'));
356
+ const styleSamples = frameBuffers.slice(0, 3).map(f => f.thumb.toString('base64'));
321
357
  const video_style = await visionExtractStyle(styleSamples);
322
358
  console.log('[analyze_video] Video style:', video_style.substring(0, 100));
323
359
  progress('Đang trích xuất dàn nhân vật (Master Cast)...');
@@ -327,7 +363,7 @@ export async function analyzeVideo(url, onProgress) {
327
363
  const castSamples = frameBuffers
328
364
  .filter((_, i) => i % castStep === 0)
329
365
  .slice(0, castSampleCount)
330
- .map(f => f.buf.toString('base64'));
366
+ .map(f => f.thumb.toString('base64'));
331
367
  const { master_cast_prompt, characters } = await visionExtractMasterCast(castSamples, transcript.text || '', video_style);
332
368
  const castContext = characters.length
333
369
  ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
@@ -341,14 +377,14 @@ export async function analyzeVideo(url, onProgress) {
341
377
  for (let b = 0; b < frameBuffers.length; b += BATCH_SIZE) {
342
378
  const batch = frameBuffers.slice(b, b + BATCH_SIZE);
343
379
  progress(`Đang phân tích scene ${b + 1}-${Math.min(b + BATCH_SIZE, frameBuffers.length)}/${frameBuffers.length}...`);
344
- const results = await Promise.all(batch.map(async ({ idx, buf, slot }) => {
380
+ const results = await Promise.all(batch.map(async ({ idx, frames, thumb, slot }) => {
345
381
  try {
346
- const visual_description = await visionDescribe(buf.toString('base64'), slot.voiceover, castContext);
382
+ const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext);
347
383
  return {
348
384
  scene_number: idx + 1,
349
385
  timestamp_start: Math.round(slot.start * 10) / 10,
350
386
  timestamp_end: Math.round(slot.end * 10) / 10,
351
- thumbnail_base64: buf.toString('base64'),
387
+ thumbnail_base64: thumb.toString('base64'),
352
388
  voiceover: slot.voiceover,
353
389
  visual_description,
354
390
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.134",
3
+ "version": "0.1.135",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"