tuna-agent 0.1.134 → 0.1.135
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -74,31 +74,46 @@ ${rawText}`,
|
|
|
74
74
|
return rawText;
|
|
75
75
|
}
|
|
76
76
|
}
|
|
77
|
-
|
|
77
|
+
// Accepts 1..N frames sampled across a scene (start → mid → end). Multiple
|
|
78
|
+
// frames let the model observe MOTION direction ("walks left-to-right then
|
|
79
|
+
// turns") instead of guessing from a single frozen midpoint. The model is
|
|
80
|
+
// told the frames are chronological so it describes the action arc, not 3
|
|
81
|
+
// separate moments.
|
|
82
|
+
async function visionDescribe(frameB64s, voiceoverText, castContext = '') {
|
|
78
83
|
if (!OPENAI_KEY)
|
|
79
84
|
return '';
|
|
85
|
+
const frames = frameB64s.filter(Boolean);
|
|
86
|
+
if (frames.length === 0)
|
|
87
|
+
return '';
|
|
80
88
|
const castBlock = castContext
|
|
81
89
|
? `\n\nKNOWN CHARACTER CAST (reuse these EXACT names when a subject appears — do NOT invent new labels for the same subject):\n${castContext}\n`
|
|
82
90
|
: '';
|
|
91
|
+
const seqNote = frames.length > 1
|
|
92
|
+
? ` The ${frames.length} images are CHRONOLOGICAL samples from the SAME scene (start → middle → end). Treat them as one continuous shot: describe the MOTION ARC across them (direction of movement, camera push/pan, what changes from first to last frame), not three separate moments.`
|
|
93
|
+
: '';
|
|
94
|
+
const imageParts = frames.map(b64 => ({
|
|
95
|
+
type: 'image_url',
|
|
96
|
+
image_url: { url: `data:image/jpeg;base64,${b64}` },
|
|
97
|
+
}));
|
|
83
98
|
const res = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
84
99
|
method: 'POST',
|
|
85
100
|
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
86
101
|
body: JSON.stringify({
|
|
87
102
|
model: 'gpt-4o-mini',
|
|
88
|
-
max_tokens:
|
|
103
|
+
max_tokens: 350,
|
|
89
104
|
messages: [{
|
|
90
105
|
role: 'user',
|
|
91
106
|
content: [
|
|
92
|
-
{ type: 'text', text: `Describe this
|
|
107
|
+
{ type: 'text', text: `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
|
|
93
108
|
- Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
|
|
94
109
|
- Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
|
|
95
110
|
- Spatial positions: exact position of each character (left/right/above/below/center), distance between them
|
|
96
111
|
- Environment: setting, lighting, color palette, atmosphere
|
|
97
|
-
- Camera: angle, framing (close-up, wide, etc.)
|
|
98
|
-
- Action:
|
|
112
|
+
- Camera: angle, framing (close-up, wide, etc.), and any camera movement across the frames
|
|
113
|
+
- Action: the movement/action arc from first to last frame (direction, what changes)
|
|
99
114
|
|
|
100
|
-
Voiceover
|
|
101
|
-
|
|
115
|
+
Voiceover during this scene: "${voiceoverText || '(none)'}"` },
|
|
116
|
+
...imageParts,
|
|
102
117
|
],
|
|
103
118
|
}],
|
|
104
119
|
}),
|
|
@@ -298,26 +313,47 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
298
313
|
const finalSlots = sceneSlots.slice(0, MAX_SCENES);
|
|
299
314
|
progress(`Đang cắt ${finalSlots.length} frames và phân tích...`);
|
|
300
315
|
console.log('[analyze_video] Building', finalSlots.length, 'scenes (segments:', segments.length, ', duration:', durationSec, 's)');
|
|
301
|
-
// Step 1: Extract
|
|
316
|
+
// Step 1: Extract frames sequentially. Per scene we grab 3 chronological
|
|
317
|
+
// frames — start → middle → end — so the vision model can read the motion
|
|
318
|
+
// arc (direction of movement, camera push) instead of guessing from a
|
|
319
|
+
// single frozen midpoint. The MIDDLE frame doubles as the UI thumbnail.
|
|
320
|
+
// Tiny scenes (<1.5s) collapse to just the midpoint (the 3 frames would
|
|
321
|
+
// be near-identical — no motion info, wasted tokens). Start/end are
|
|
322
|
+
// nudged inward by 15% of the scene span (capped at 0.3s) to dodge hard-cut / black transition frames.
|
|
302
323
|
const frameBuffers = [];
|
|
303
324
|
for (let i = 0; i < finalSlots.length; i++) {
|
|
304
325
|
const slot = finalSlots[i];
|
|
305
|
-
const
|
|
306
|
-
const
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
326
|
+
const span = slot.end - slot.start;
|
|
327
|
+
const mid = (slot.start + slot.end) / 2;
|
|
328
|
+
const inset = Math.min(0.3, span * 0.15);
|
|
329
|
+
const stamps = span < 1.5
|
|
330
|
+
? [mid]
|
|
331
|
+
: [slot.start + inset, mid, slot.end - inset];
|
|
332
|
+
const buffers = [];
|
|
333
|
+
let thumb = null;
|
|
334
|
+
for (let k = 0; k < stamps.length; k++) {
|
|
335
|
+
const framePath = path.join(framesDir, `scene-${String(i).padStart(3, '0')}-${k}.jpg`);
|
|
336
|
+
try {
|
|
337
|
+
await run(FFMPEG, ['-y', '-ss', String(Math.max(0, stamps[k])), '-i', videoPath, '-vframes', '1', '-vf', 'scale=640:-1', '-q:v', '5', framePath, '-loglevel', 'error']);
|
|
338
|
+
const buf = await fs.readFile(framePath);
|
|
339
|
+
buffers.push(buf);
|
|
340
|
+
// Middle frame = thumbnail (index 1 when 3 frames, index 0 when 1).
|
|
341
|
+
if (k === Math.floor(stamps.length / 2))
|
|
342
|
+
thumb = buf;
|
|
343
|
+
}
|
|
344
|
+
catch (err) {
|
|
345
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
346
|
+
console.warn('[analyze_video] Frame extract failed for scene', i, 'frame', k, msg);
|
|
347
|
+
}
|
|
311
348
|
}
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
console.warn('[analyze_video] Frame extract failed for scene', i, msg);
|
|
349
|
+
if (buffers.length) {
|
|
350
|
+
frameBuffers.push({ idx: i, frames: buffers, thumb: thumb || buffers[0], slot });
|
|
315
351
|
}
|
|
316
352
|
}
|
|
317
353
|
// Step 2: Style + Master Cast FIRST (Phase 1), so per-scene describe can
|
|
318
354
|
// reuse consistent character labels (the AI_Video_Clone lesson).
|
|
319
355
|
progress('Đang phân tích video style...');
|
|
320
|
-
const styleSamples = frameBuffers.slice(0, 3).map(f => f.
|
|
356
|
+
const styleSamples = frameBuffers.slice(0, 3).map(f => f.thumb.toString('base64'));
|
|
321
357
|
const video_style = await visionExtractStyle(styleSamples);
|
|
322
358
|
console.log('[analyze_video] Video style:', video_style.substring(0, 100));
|
|
323
359
|
progress('Đang trích xuất dàn nhân vật (Master Cast)...');
|
|
@@ -327,7 +363,7 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
327
363
|
const castSamples = frameBuffers
|
|
328
364
|
.filter((_, i) => i % castStep === 0)
|
|
329
365
|
.slice(0, castSampleCount)
|
|
330
|
-
.map(f => f.
|
|
366
|
+
.map(f => f.thumb.toString('base64'));
|
|
331
367
|
const { master_cast_prompt, characters } = await visionExtractMasterCast(castSamples, transcript.text || '', video_style);
|
|
332
368
|
const castContext = characters.length
|
|
333
369
|
? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
|
|
@@ -341,14 +377,14 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
341
377
|
for (let b = 0; b < frameBuffers.length; b += BATCH_SIZE) {
|
|
342
378
|
const batch = frameBuffers.slice(b, b + BATCH_SIZE);
|
|
343
379
|
progress(`Đang phân tích scene ${b + 1}-${Math.min(b + BATCH_SIZE, frameBuffers.length)}/${frameBuffers.length}...`);
|
|
344
|
-
const results = await Promise.all(batch.map(async ({ idx,
|
|
380
|
+
const results = await Promise.all(batch.map(async ({ idx, frames, thumb, slot }) => {
|
|
345
381
|
try {
|
|
346
|
-
const visual_description = await visionDescribe(
|
|
382
|
+
const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext);
|
|
347
383
|
return {
|
|
348
384
|
scene_number: idx + 1,
|
|
349
385
|
timestamp_start: Math.round(slot.start * 10) / 10,
|
|
350
386
|
timestamp_end: Math.round(slot.end * 10) / 10,
|
|
351
|
-
thumbnail_base64:
|
|
387
|
+
thumbnail_base64: thumb.toString('base64'),
|
|
352
388
|
voiceover: slot.voiceover,
|
|
353
389
|
visual_description,
|
|
354
390
|
};
|