tuna-agent 0.1.138 → 0.1.139

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,7 @@ export interface AnalyzeVideoResult {
11
11
  duration_sec: number;
12
12
  language: string;
13
13
  transcript: string;
14
+ summary: string;
14
15
  video_style: string;
15
16
  master_cast_prompt: string;
16
17
  characters: Array<{
@@ -166,38 +166,47 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
166
166
  const data = await res.json();
167
167
  return data.choices?.[0]?.message?.content?.trim() || '';
168
168
  }
169
- // Phase 1 (borrowed from AI_Video_Clone): extract the recurring character
170
- // cast ONCE from frames sampled across the whole video + the transcript.
171
- // Returns a master-cast prompt block in the exact [AESTHETIC & STYLE] /
172
- // [CHARACTER CAST LIST] format that channel-manager's ScriptImporter parses,
173
- // plus a structured characters[] list. Doing this upfront (a) populates
174
- // idea.master_cast_prompt so FlowKit has a reference sheet to generate, and
175
- // (b) gives every per-scene describe call a consistent naming vocabulary so
176
- // scene 1 and scene 50 refer to "THE BISHOP" instead of "a man in a suit".
177
- async function visionExtractMasterCast(frames, transcript, videoStyle) {
178
- const empty = { master_cast_prompt: '', characters: [] };
169
+ // Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
170
+ // frames sampled across the whole video + transcript that returns, together:
171
+ // - video_summary: a cinematic paragraph of the whole story (drives the
172
+ // downstream script-generation prompt — the tool's biggest edge)
173
+ // - video_style: a rich 3-4 sentence aesthetic analysis (medium, palette,
174
+ // lighting, camera language) — replaces the old terse 1-2 sentence
175
+ // visionExtractStyle gpt-4o-mini call entirely
176
+ // - characters[]: the recurring cast for the [CHARACTER CAST LIST] block
177
+ // Folding all three into one call is cheaper than the previous two calls
178
+ // (style + cast) AND uses gpt-4o for style (was gpt-4o-mini). The master-cast
179
+ // prompt is assembled here in the exact format ScriptImporter parses.
180
+ async function visionExtractPhase1(frames, transcript) {
181
+ const empty = {
182
+ video_summary: '',
183
+ video_style: '',
184
+ master_cast_prompt: '',
185
+ characters: [],
186
+ };
179
187
  if (!OPENAI_KEY || frames.length === 0)
180
188
  return empty;
181
189
  try {
182
190
  const content = [
183
191
  {
184
192
  type: 'text',
185
- text: `Act as a Master Film Director. These frames are sampled across an entire video. Identify EVERY recurring character/subject (people, anthropomorphic objects, animals, mascots).
193
+ text: `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
186
194
 
187
- Transcript context (may name characters): "${(transcript || '').slice(0, 1500)}"
195
+ Transcript context: "${(transcript || '').slice(0, 4000)}"
188
196
 
189
197
  Return ONLY a JSON object, no markdown fences:
190
198
  {
199
+ "video_summary": "One detailed cinematic paragraph (5-8 sentences, English) telling the WHOLE story start to finish: setup, key beats, climax, resolution. This is the narrative spine — be specific about what happens.",
200
+ "video_style": "3-4 sentences (English): artistic medium (2D/3D/live-action/CGI), color palette, lighting, camera language, overall aesthetic vibe. Cinematic, specific.",
191
201
  "characters": [
192
202
  { "name": "SHORT_UPPERCASE_LABEL", "description": "one-line English visual description: age/build, face, hair, outfit, colors, distinguishing features" }
193
203
  ]
194
204
  }
195
205
 
196
206
  Rules:
197
- - name: a stable short uppercase label you will reuse for this subject (e.g. "THE BISHOP", "RED CAR", "NARRATOR DOG"). Max 4 words.
198
- - Only RECURRING subjects worth a reference sheet. Skip one-off background extras.
199
- - description: ENGLISH only, factual, no camera/action words.
200
- - Max 6 characters.`,
207
+ - characters.name: a stable short uppercase label reused for this subject (e.g. "THE BISHOP", "U-94 SUBMARINE"). Max 4 words.
208
+ - Only RECURRING subjects worth a reference sheet. Skip one-off extras. Max 6.
209
+ - characters.description: ENGLISH only, factual, no camera/action words.`,
201
210
  },
202
211
  ];
203
212
  for (const b64 of frames) {
@@ -206,16 +215,16 @@ Rules:
206
215
  const res = await fetch('https://api.openai.com/v1/chat/completions', {
207
216
  method: 'POST',
208
217
  headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
209
- body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1200, messages: [{ role: 'user', content }] }),
218
+ body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
210
219
  });
211
220
  if (!res.ok)
212
221
  return empty;
213
222
  const data = await res.json();
214
- const raw = (data.choices?.[0]?.message?.content || '').trim();
223
+ const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
215
224
  let parsed = {};
216
225
  try {
217
- const m = raw.match(/\{[\s\S]*\}/);
218
- parsed = JSON.parse(m ? m[0] : raw);
226
+ const m = rawTxt.match(/\{[\s\S]*\}/);
227
+ parsed = JSON.parse(m ? m[0] : rawTxt);
219
228
  }
220
229
  catch {
221
230
  return empty;
@@ -224,43 +233,22 @@ Rules:
224
233
  .filter(c => c && c.name && c.description)
225
234
  .slice(0, 6)
226
235
  .map(c => ({ name: String(c.name).trim(), description: String(c.description).trim() }));
227
- if (characters.length === 0)
228
- return empty;
229
- // Assemble the verbatim-style master cast block ScriptImporter expects.
230
- const styleLine = (videoStyle || '').trim() || 'Keep the original video’s visual style, color grading, and lighting.';
231
- const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
232
- const master_cast_prompt = `[AESTHETIC & STYLE]\n${styleLine}\n` +
233
- `[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
234
- `[CHARACTER CAST LIST]\n${castList}\n` +
235
- `[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
236
- return { master_cast_prompt, characters };
237
- }
238
- catch {
239
- return empty;
240
- }
241
- }
242
- async function visionExtractStyle(frames) {
243
- if (!OPENAI_KEY || frames.length === 0)
244
- return '';
245
- try {
246
- const content = [
247
- { type: 'text', text: 'Analyze these frames from a video and extract a concise visual style description (1-2 sentences). Focus on: animation style (cartoon, realistic, anime, etc.), color palette, lighting, character design approach (anthropomorphized objects, real people, etc.), and overall aesthetic.\n\nReturn ONLY the style description, nothing else.' },
248
- ];
249
- for (const b64 of frames) {
250
- content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
236
+ const video_summary = (parsed.video_summary || '').trim();
237
+ const video_style = (parsed.video_style || '').trim();
238
+ let master_cast_prompt = '';
239
+ if (characters.length > 0) {
240
+ const styleLine = video_style || 'Keep the original video’s visual style, color grading, and lighting.';
241
+ const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
242
+ master_cast_prompt =
243
+ `[AESTHETIC & STYLE]\n${styleLine}\n` +
244
+ `[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
245
+ `[CHARACTER CAST LIST]\n${castList}\n` +
246
+ `[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
251
247
  }
252
- const res = await fetch('https://api.openai.com/v1/chat/completions', {
253
- method: 'POST',
254
- headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
255
- body: JSON.stringify({ model: 'gpt-4o-mini', max_tokens: 200, messages: [{ role: 'user', content }] }),
256
- });
257
- if (!res.ok)
258
- return '';
259
- const data = await res.json();
260
- return (data.choices?.[0]?.message?.content || '').trim().replace(/\*\*/g, '');
248
+ return { video_summary, video_style, master_cast_prompt, characters };
261
249
  }
262
250
  catch {
263
- return '';
251
+ return empty;
264
252
  }
265
253
  }
266
254
  export async function analyzeVideo(url, onProgress) {
@@ -446,25 +434,22 @@ export async function analyzeVideo(url, onProgress) {
446
434
  frameBuffers.push({ idx: i, frames: buffers, thumb: thumb || buffers[0], slot });
447
435
  }
448
436
  }
449
- // Step 2: Style + Master Cast FIRST (Phase 1), so per-scene describe can
450
- // reuse consistent character labels (the AI_Video_Clone lesson).
451
- progress('Đang phân tích video style...');
452
- const styleSamples = frameBuffers.slice(0, 3).map(f => f.thumb.toString('base64'));
453
- const video_style = await visionExtractStyle(styleSamples);
454
- console.log('[analyze_video] Video style:', video_style.substring(0, 100));
455
- progress('Đang trích xuất dàn nhân vật (Master Cast)...');
456
- // Sample up to 12 frames evenly across the whole video for cast detection.
457
- const castSampleCount = Math.min(12, frameBuffers.length);
458
- const castStep = Math.max(1, Math.floor(frameBuffers.length / castSampleCount));
459
- const castSamples = frameBuffers
460
- .filter((_, i) => i % castStep === 0)
461
- .slice(0, castSampleCount)
437
+ // Step 2: Phase 1 — ONE gpt-4o call returning summary + rich style +
438
+ // master cast + characters. Runs before per-scene describe so the cast
439
+ // context keeps naming consistent across the whole timeline.
440
+ progress('Đang phân tích tổng thể (summary + style + master cast)...');
441
+ // Sample up to 12 frames evenly across the whole video.
442
+ const p1SampleCount = Math.min(12, frameBuffers.length);
443
+ const p1Step = Math.max(1, Math.floor(frameBuffers.length / p1SampleCount));
444
+ const p1Samples = frameBuffers
445
+ .filter((_, i) => i % p1Step === 0)
446
+ .slice(0, p1SampleCount)
462
447
  .map(f => f.thumb.toString('base64'));
463
- const { master_cast_prompt, characters } = await visionExtractMasterCast(castSamples, transcript.text || '', video_style);
448
+ const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '');
464
449
  const castContext = characters.length
465
450
  ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
466
451
  : '';
467
- console.log('[analyze_video] Master cast:', characters.map(c => c.name).join(', ') || '(none)');
452
+ console.log('[analyze_video] Phase1 — style:', video_style.slice(0, 80), '| cast:', characters.map(c => c.name).join(', ') || '(none)', '| summary:', video_summary.length, 'chars');
468
453
  // Step 3: Vision describe all frames in parallel (batch of 5), passing the
469
454
  // cast context so naming stays consistent across the whole timeline.
470
455
  progress(`Đang phân tích ${frameBuffers.length} scenes song song...`);
@@ -499,6 +484,7 @@ export async function analyzeVideo(url, onProgress) {
499
484
  duration_sec: Math.round(durationSec),
500
485
  language: transcript.language || 'unknown',
501
486
  transcript: transcript.text || '',
487
+ summary: video_summary,
502
488
  video_style,
503
489
  master_cast_prompt,
504
490
  characters,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.138",
3
+ "version": "0.1.139",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"