tuna-agent 0.1.133 → 0.1.135

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -11,6 +11,11 @@ export interface AnalyzeVideoResult {
11
11
  language: string;
12
12
  transcript: string;
13
13
  video_style: string;
14
+ master_cast_prompt: string;
15
+ characters: Array<{
16
+ name: string;
17
+ description: string;
18
+ }>;
14
19
  segments: Array<{
15
20
  start: number;
16
21
  end: number;
@@ -74,28 +74,46 @@ ${rawText}`,
74
74
  return rawText;
75
75
  }
76
76
  }
77
- async function visionDescribe(frameB64, voiceoverText) {
77
+ // Accepts 1..N frames sampled across a scene (start → mid → end). Multiple
78
+ // frames let the model observe MOTION direction ("walks left-to-right then
79
+ // turns") instead of guessing from a single frozen midpoint. The model is
80
+ // told the frames are chronological so it describes the action arc, not 3
81
+ // separate moments.
82
+ async function visionDescribe(frameB64s, voiceoverText, castContext = '') {
78
83
  if (!OPENAI_KEY)
79
84
  return '';
85
+ const frames = frameB64s.filter(Boolean);
86
+ if (frames.length === 0)
87
+ return '';
88
+ const castBlock = castContext
89
+ ? `\n\nKNOWN CHARACTER CAST (reuse these EXACT names when a subject appears — do NOT invent new labels for the same subject):\n${castContext}\n`
90
+ : '';
91
+ const seqNote = frames.length > 1
92
+ ? ` The ${frames.length} images are CHRONOLOGICAL samples from the SAME scene (start → middle → end). Treat them as one continuous shot: describe the MOTION ARC across them (direction of movement, camera push/pan, what changes from first to last frame), not three separate moments.`
93
+ : '';
94
+ const imageParts = frames.map(b64 => ({
95
+ type: 'image_url',
96
+ image_url: { url: `data:image/jpeg;base64,${b64}` },
97
+ }));
80
98
  const res = await fetch('https://api.openai.com/v1/chat/completions', {
81
99
  method: 'POST',
82
100
  headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
83
101
  body: JSON.stringify({
84
102
  model: 'gpt-4o-mini',
85
- max_tokens: 300,
103
+ max_tokens: 350,
86
104
  messages: [{
87
105
  role: 'user',
88
106
  content: [
89
- { type: 'text', text: `Describe this frame in detail (4-6 sentences, English). Include:
90
- - Characters: appearance (shape, color, size), facial expression, what they're doing
107
+ { type: 'text', text: `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
108
+ - Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
91
109
  - Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
92
110
  - Spatial positions: exact position of each character (left/right/above/below/center), distance between them
93
111
  - Environment: setting, lighting, color palette, atmosphere
94
- - Camera: angle, framing (close-up, wide, etc.)
95
- - Action: what is happening in this moment, movement direction
112
+ - Camera: angle, framing (close-up, wide, etc.), and any camera movement across the frames
113
+ - Action: the movement/action arc from first to last frame (direction, what changes)
96
114
 
97
- Voiceover at this moment: "${voiceoverText || '(none)'}"` },
98
- { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${frameB64}` } },
115
+ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
116
+ ...imageParts,
99
117
  ],
100
118
  }],
101
119
  }),
@@ -105,6 +123,79 @@ Voiceover at this moment: "${voiceoverText || '(none)'}"` },
105
123
  const data = await res.json();
106
124
  return data.choices?.[0]?.message?.content?.trim() || '';
107
125
  }
126
+ // Phase 1 (borrowed from AI_Video_Clone): extract the recurring character
127
+ // cast ONCE from frames sampled across the whole video + the transcript.
128
+ // Returns a master-cast prompt block in the exact [AESTHETIC & STYLE] /
129
+ // [CHARACTER CAST LIST] format that channel-manager's ScriptImporter parses,
130
+ // plus a structured characters[] list. Doing this upfront (a) populates
131
+ // idea.master_cast_prompt so FlowKit has a reference sheet to generate, and
132
+ // (b) gives every per-scene describe call a consistent naming vocabulary so
133
+ // scene 1 and scene 50 refer to "THE BISHOP" instead of "a man in a suit".
134
+ async function visionExtractMasterCast(frames, transcript, videoStyle) {
135
+ const empty = { master_cast_prompt: '', characters: [] };
136
+ if (!OPENAI_KEY || frames.length === 0)
137
+ return empty;
138
+ try {
139
+ const content = [
140
+ {
141
+ type: 'text',
142
+ text: `Act as a Master Film Director. These frames are sampled across an entire video. Identify EVERY recurring character/subject (people, anthropomorphic objects, animals, mascots).
143
+
144
+ Transcript context (may name characters): "${(transcript || '').slice(0, 1500)}"
145
+
146
+ Return ONLY a JSON object, no markdown fences:
147
+ {
148
+ "characters": [
149
+ { "name": "SHORT_UPPERCASE_LABEL", "description": "one-line English visual description: age/build, face, hair, outfit, colors, distinguishing features" }
150
+ ]
151
+ }
152
+
153
+ Rules:
154
+ - name: a stable short uppercase label you will reuse for this subject (e.g. "THE BISHOP", "RED CAR", "NARRATOR DOG"). Max 4 words.
155
+ - Only RECURRING subjects worth a reference sheet. Skip one-off background extras.
156
+ - description: ENGLISH only, factual, no camera/action words.
157
+ - Max 6 characters.`,
158
+ },
159
+ ];
160
+ for (const b64 of frames) {
161
+ content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
162
+ }
163
+ const res = await fetch('https://api.openai.com/v1/chat/completions', {
164
+ method: 'POST',
165
+ headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
166
+ body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1200, messages: [{ role: 'user', content }] }),
167
+ });
168
+ if (!res.ok)
169
+ return empty;
170
+ const data = await res.json();
171
+ const raw = (data.choices?.[0]?.message?.content || '').trim();
172
+ let parsed = {};
173
+ try {
174
+ const m = raw.match(/\{[\s\S]*\}/);
175
+ parsed = JSON.parse(m ? m[0] : raw);
176
+ }
177
+ catch {
178
+ return empty;
179
+ }
180
+ const characters = (parsed.characters || [])
181
+ .filter(c => c && c.name && c.description)
182
+ .slice(0, 6)
183
+ .map(c => ({ name: String(c.name).trim(), description: String(c.description).trim() }));
184
+ if (characters.length === 0)
185
+ return empty;
186
+ // Assemble the verbatim-style master cast block ScriptImporter expects.
187
+ const styleLine = (videoStyle || '').trim() || 'Keep the original video’s visual style, color grading, and lighting.';
188
+ const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
189
+ const master_cast_prompt = `[AESTHETIC & STYLE]\n${styleLine}\n` +
190
+ `[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
191
+ `[CHARACTER CAST LIST]\n${castList}\n` +
192
+ `[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
193
+ return { master_cast_prompt, characters };
194
+ }
195
+ catch {
196
+ return empty;
197
+ }
198
+ }
108
199
  async function visionExtractStyle(frames) {
109
200
  if (!OPENAI_KEY || frames.length === 0)
110
201
  return '';
@@ -164,71 +255,136 @@ export async function analyzeVideo(url, onProgress) {
164
255
  const segments = transcript.segments || [];
165
256
  const sceneSlots = [];
166
257
  const SILENCE_THRESHOLD = 5; // seconds — gaps longer than this become their own scene
167
- const MAX_SCENES = 30;
258
+ // Veo3 clips are 8s. The KPI (borrowed from AI_Video_Clone) is: never
259
+ // under-segment — a 40-min video must yield ~ceil(duration/8) scenes, not
260
+ // a fixed 30. Long transcript segments are SPLIT into 8s sub-slots so a
261
+ // 90s monologue becomes ~11 scenes instead of one giant clip. A hard
262
+ // ceiling still bounds runaway vision cost on very long videos.
263
+ const TARGET_SCENE_SEC = 8;
264
+ const HARD_CAP = 600; // ~80 min @ 8s — safety bound on vision API spend
265
+ const targetScenes = Math.max(1, Math.ceil(durationSec / TARGET_SCENE_SEC));
266
+ const MAX_SCENES = Math.min(targetScenes + 20, HARD_CAP);
267
+ // Split a [start,end] span into ≤TARGET_SCENE_SEC sub-slots, preserving
268
+ // the voiceover on the FIRST sub-slot (the rest are silent continuations
269
+ // of the same spoken line so lip-sync isn't duplicated downstream).
270
+ const pushSplit = (start, end, voiceover) => {
271
+ const span = end - start;
272
+ if (span <= TARGET_SCENE_SEC * 1.5) {
273
+ sceneSlots.push({ start, end, voiceover });
274
+ return;
275
+ }
276
+ const n = Math.ceil(span / TARGET_SCENE_SEC);
277
+ const step = span / n;
278
+ for (let k = 0; k < n; k++) {
279
+ sceneSlots.push({
280
+ start: start + k * step,
281
+ end: k === n - 1 ? end : start + (k + 1) * step,
282
+ voiceover: k === 0 ? voiceover : '',
283
+ });
284
+ }
285
+ };
168
286
  if (segments.length > 0) {
169
- // Add silence scene before first segment if gap > threshold
170
287
  if (segments[0].start > SILENCE_THRESHOLD) {
171
- sceneSlots.push({ start: 0, end: segments[0].start, voiceover: '' });
288
+ pushSplit(0, segments[0].start, '');
172
289
  }
173
290
  for (let i = 0; i < segments.length; i++) {
174
291
  const seg = segments[i];
175
- sceneSlots.push({ start: seg.start, end: seg.end, voiceover: seg.text?.trim() || '' });
176
- // Add silence scene between segments if gap > threshold
292
+ pushSplit(seg.start, seg.end, seg.text?.trim() || '');
177
293
  if (i < segments.length - 1) {
178
294
  const gap = segments[i + 1].start - seg.end;
179
295
  if (gap > SILENCE_THRESHOLD) {
180
- sceneSlots.push({ start: seg.end, end: segments[i + 1].start, voiceover: '' });
296
+ pushSplit(seg.end, segments[i + 1].start, '');
181
297
  }
182
298
  }
183
299
  }
184
- // Add silence scene after last segment if gap > threshold
185
300
  const lastEnd = segments[segments.length - 1].end;
186
301
  if (durationSec - lastEnd > SILENCE_THRESHOLD) {
187
- sceneSlots.push({ start: lastEnd, end: durationSec, voiceover: '' });
302
+ pushSplit(lastEnd, durationSec, '');
188
303
  }
189
304
  }
190
305
  else {
191
306
  // No transcript — split into scenes every 8s (Veo3 clip length)
192
- const interval = 8;
193
- for (let t = 0; t < durationSec; t += interval) {
194
- sceneSlots.push({ start: t, end: Math.min(t + interval, durationSec), voiceover: '' });
307
+ for (let t = 0; t < durationSec; t += TARGET_SCENE_SEC) {
308
+ sceneSlots.push({ start: t, end: Math.min(t + TARGET_SCENE_SEC, durationSec), voiceover: '' });
195
309
  }
196
310
  }
197
- // Cap max scenes
311
+ // Duration-aware cap (was a flat 30 — that silently truncated any video
312
+ // longer than ~4 min). Re-number after slicing.
198
313
  const finalSlots = sceneSlots.slice(0, MAX_SCENES);
199
314
  progress(`Đang cắt ${finalSlots.length} frames và phân tích...`);
200
315
  console.log('[analyze_video] Building', finalSlots.length, 'scenes (segments:', segments.length, ', duration:', durationSec, 's)');
201
- // Step 1: Extract all frames sequentially (ffmpeg can't run in parallel on same file efficiently)
316
+ // Step 1: Extract frames sequentially. Per scene we grab 3 chronological
317
+ // frames — start → middle → end — so the vision model can read the motion
318
+ // arc (direction of movement, camera push) instead of guessing from a
319
+ // single frozen midpoint. The MIDDLE frame doubles as the UI thumbnail.
320
+ // Tiny scenes (<1.5s) collapse to just the midpoint (the 3 frames would
321
+ // be near-identical — no motion info, wasted tokens). Start/end are
322
+ // nudged ~15% inward to dodge hard-cut / black transition frames.
202
323
  const frameBuffers = [];
203
324
  for (let i = 0; i < finalSlots.length; i++) {
204
325
  const slot = finalSlots[i];
205
- const midpoint = (slot.start + slot.end) / 2;
206
- const framePath = path.join(framesDir, `scene-${String(i).padStart(3, '0')}.jpg`);
207
- try {
208
- await run(FFMPEG, ['-y', '-ss', String(midpoint), '-i', videoPath, '-vframes', '1', '-vf', 'scale=640:-1', '-q:v', '5', framePath, '-loglevel', 'error']);
209
- const buf = await fs.readFile(framePath);
210
- frameBuffers.push({ idx: i, buf, slot });
326
+ const span = slot.end - slot.start;
327
+ const mid = (slot.start + slot.end) / 2;
328
+ const inset = Math.min(0.3, span * 0.15);
329
+ const stamps = span < 1.5
330
+ ? [mid]
331
+ : [slot.start + inset, mid, slot.end - inset];
332
+ const buffers = [];
333
+ let thumb = null;
334
+ for (let k = 0; k < stamps.length; k++) {
335
+ const framePath = path.join(framesDir, `scene-${String(i).padStart(3, '0')}-${k}.jpg`);
336
+ try {
337
+ await run(FFMPEG, ['-y', '-ss', String(Math.max(0, stamps[k])), '-i', videoPath, '-vframes', '1', '-vf', 'scale=640:-1', '-q:v', '5', framePath, '-loglevel', 'error']);
338
+ const buf = await fs.readFile(framePath);
339
+ buffers.push(buf);
340
+ // Middle frame = thumbnail (index 1 when 3 frames, index 0 when 1).
341
+ if (k === Math.floor(stamps.length / 2))
342
+ thumb = buf;
343
+ }
344
+ catch (err) {
345
+ const msg = err instanceof Error ? err.message : String(err);
346
+ console.warn('[analyze_video] Frame extract failed for scene', i, 'frame', k, msg);
347
+ }
211
348
  }
212
- catch (err) {
213
- const msg = err instanceof Error ? err.message : String(err);
214
- console.warn('[analyze_video] Frame extract failed for scene', i, msg);
349
+ if (buffers.length) {
350
+ frameBuffers.push({ idx: i, frames: buffers, thumb: thumb || buffers[0], slot });
215
351
  }
216
352
  }
217
- // Step 2: Vision describe all frames in parallel (batch of 5)
353
+ // Step 2: Style + Master Cast FIRST (Phase 1), so per-scene describe can
354
+ // reuse consistent character labels (the AI_Video_Clone lesson).
355
+ progress('Đang phân tích video style...');
356
+ const styleSamples = frameBuffers.slice(0, 3).map(f => f.thumb.toString('base64'));
357
+ const video_style = await visionExtractStyle(styleSamples);
358
+ console.log('[analyze_video] Video style:', video_style.substring(0, 100));
359
+ progress('Đang trích xuất dàn nhân vật (Master Cast)...');
360
+ // Sample up to 12 frames evenly across the whole video for cast detection.
361
+ const castSampleCount = Math.min(12, frameBuffers.length);
362
+ const castStep = Math.max(1, Math.floor(frameBuffers.length / castSampleCount));
363
+ const castSamples = frameBuffers
364
+ .filter((_, i) => i % castStep === 0)
365
+ .slice(0, castSampleCount)
366
+ .map(f => f.thumb.toString('base64'));
367
+ const { master_cast_prompt, characters } = await visionExtractMasterCast(castSamples, transcript.text || '', video_style);
368
+ const castContext = characters.length
369
+ ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
370
+ : '';
371
+ console.log('[analyze_video] Master cast:', characters.map(c => c.name).join(', ') || '(none)');
372
+ // Step 3: Vision describe all frames in parallel (batch of 5), passing the
373
+ // cast context so naming stays consistent across the whole timeline.
218
374
  progress(`Đang phân tích ${frameBuffers.length} scenes song song...`);
219
375
  const BATCH_SIZE = 5;
220
376
  const sceneResults = [];
221
377
  for (let b = 0; b < frameBuffers.length; b += BATCH_SIZE) {
222
378
  const batch = frameBuffers.slice(b, b + BATCH_SIZE);
223
379
  progress(`Đang phân tích scene ${b + 1}-${Math.min(b + BATCH_SIZE, frameBuffers.length)}/${frameBuffers.length}...`);
224
- const results = await Promise.all(batch.map(async ({ idx, buf, slot }) => {
380
+ const results = await Promise.all(batch.map(async ({ idx, frames, thumb, slot }) => {
225
381
  try {
226
- const visual_description = await visionDescribe(buf.toString('base64'), slot.voiceover);
382
+ const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext);
227
383
  return {
228
384
  scene_number: idx + 1,
229
385
  timestamp_start: Math.round(slot.start * 10) / 10,
230
386
  timestamp_end: Math.round(slot.end * 10) / 10,
231
- thumbnail_base64: buf.toString('base64'),
387
+ thumbnail_base64: thumb.toString('base64'),
232
388
  voiceover: slot.voiceover,
233
389
  visual_description,
234
390
  };
@@ -242,16 +398,13 @@ export async function analyzeVideo(url, onProgress) {
242
398
  sceneResults.push(...results.filter((r) => r !== null));
243
399
  }
244
400
  const scenes = sceneResults.sort((a, b) => a.scene_number - b.scene_number);
245
- // Extract video style from 3 sample frames
246
- progress('Đang phân tích video style...');
247
- const styleSamples = frameBuffers.slice(0, 3).map(f => f.buf.toString('base64'));
248
- const video_style = await visionExtractStyle(styleSamples);
249
- console.log('[analyze_video] Video style:', video_style.substring(0, 100));
250
401
  return {
251
402
  duration_sec: Math.round(durationSec),
252
403
  language: transcript.language || 'unknown',
253
404
  transcript: transcript.text || '',
254
405
  video_style,
406
+ master_cast_prompt,
407
+ characters,
255
408
  segments: segments.map((s) => ({ start: s.start, end: s.end, text: s.text })),
256
409
  scenes,
257
410
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.133",
3
+ "version": "0.1.135",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"