tuna-agent 0.1.147 → 0.1.149

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,6 +29,8 @@ const RATES = {
29
29
  'gpt-4o': { in: 2.50, out: 10.0 },
30
30
  // Gemini 3 Flash preview: text+image input share one rate, output 6x.
31
31
  'gemini-3-flash-preview': { in: 0.50, out: 3.0 },
32
+ // Gemini 2.5 Flash: cheaper image-heavy reads (used by Phase-1).
33
+ 'gemini-2.5-flash': { in: 0.30, out: 2.50 },
32
34
  };
33
35
  // Per-run cost+token accumulator. Single-threaded JS → plain mutation is safe
34
36
  // even across the parallel visionDescribe calls.
@@ -50,12 +52,12 @@ class CostTracker {
50
52
  }
51
53
  // Gemini reports usageMetadata.{promptTokenCount,candidatesTokenCount}
52
54
  // instead of OpenAI's prompt_tokens/completion_tokens.
53
- geminiVision(bucket, usage) {
55
+ geminiVision(bucket, usage, model = 'gemini-3-flash-preview') {
54
56
  if (!usage) {
55
57
  this.add(bucket, 0);
56
58
  return;
57
59
  }
58
- const r = RATES['gemini-3-flash-preview'];
60
+ const r = RATES[model];
59
61
  const cost = ((usage.promptTokenCount || 0) / 1e6) * r.in + ((usage.candidatesTokenCount || 0) / 1e6) * r.out;
60
62
  this.add(bucket, cost);
61
63
  }
@@ -174,7 +176,7 @@ ${rawText}`,
174
176
  // One Gemini generateContent call with key rotation + exponential backoff on
175
177
  // 429/5xx. A single free-tier key under the 5-way concurrent batch WILL
176
178
  // rate-limit; retrying (slower) beats dropping the scene description.
177
- async function geminiGenerate(parts, maxOutputTokens) {
179
+ async function geminiGenerate(parts, maxOutputTokens, model = GEMINI_MODEL) {
178
180
  if (!GEMINI_KEYS.length)
179
181
  return { text: '' };
180
182
  const body = JSON.stringify({
@@ -187,7 +189,7 @@ async function geminiGenerate(parts, maxOutputTokens) {
187
189
  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
188
190
  const key = GEMINI_KEYS[keyIdx % GEMINI_KEYS.length];
189
191
  try {
190
- const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(GEMINI_MODEL)}:generateContent?key=${key}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body });
192
+ const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(model)}:generateContent?key=${key}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body });
191
193
  if (res.status === 429 || res.status >= 500) {
192
194
  lastErr = `Gemini ${res.status}`;
193
195
  keyIdx++; // rotate to the next key before backing off
@@ -259,13 +261,10 @@ async function visionExtractPhase1(frames, transcript, cost) {
259
261
  master_cast_prompt: '',
260
262
  characters: [],
261
263
  };
262
- if (!OPENAI_KEY || frames.length === 0)
264
+ if (!GEMINI_KEYS.length || frames.length === 0)
263
265
  return empty;
264
266
  try {
265
- const content = [
266
- {
267
- type: 'text',
268
- text: `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
267
+ const promptText = `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
269
268
 
270
269
  Transcript context: "${(transcript || '').slice(0, 4000)}"
271
270
 
@@ -282,22 +281,18 @@ Rules:
282
281
  - characters.name: a stable short uppercase label reused for this subject (e.g. "THE BISHOP", "U-94 SUBMARINE"). Max 4 words.
283
282
  - RECALL (CRITICAL): list EVERY distinct recurring subject SEPARATELY. If a family or group recurs, include EACH member as its own entry (e.g. adult man, adult woman, older boy, younger girl) — never merge them into one. Skip only true one-off background extras. Be COMPLETE: missing a recurring character is worse than one extra. Up to 8.
284
283
  - characters.description: ENGLISH only, factual, no camera/action words.
285
- - DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`,
286
- },
284
+ - DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`;
285
+ // Phase-1 on Gemini 2.5 Flash: image-heavy read is far cheaper than gpt-4o,
286
+ // and cast recall is backstopped by the post-Phase-2 reconcile pass, so a
287
+ // small frame sample suffices here.
288
+ const parts = [
289
+ { text: promptText },
290
+ ...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
287
291
  ];
288
- for (const b64 of frames) {
289
- content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
290
- }
291
- const res = await fetch('https://api.openai.com/v1/chat/completions', {
292
- method: 'POST',
293
- headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
294
- body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
295
- });
296
- if (!res.ok)
292
+ const { text: rawTxt, usage } = await geminiGenerate(parts, 1600, 'gemini-2.5-flash');
293
+ if (!rawTxt)
297
294
  return empty;
298
- const data = await res.json();
299
- cost?.chat('phase1', 'gpt-4o', data.usage);
300
- const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
295
+ cost?.geminiVision('phase1', usage, 'gemini-2.5-flash');
301
296
  let parsed = {};
302
297
  try {
303
298
  const m = rawTxt.match(/\{[\s\S]*\}/);
@@ -312,22 +307,103 @@ Rules:
312
307
  .map(c => ({ name: String(c.name).trim(), description: String(c.description).trim() }));
313
308
  const video_summary = (parsed.video_summary || '').trim();
314
309
  const video_style = (parsed.video_style || '').trim();
315
- let master_cast_prompt = '';
316
- if (characters.length > 0) {
317
- const styleLine = video_style || 'Keep the original video’s visual style, color grading, and lighting.';
318
- const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
319
- master_cast_prompt =
320
- `[AESTHETIC & STYLE]\n${styleLine}\n` +
321
- `[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
322
- `[CHARACTER CAST LIST]\n${castList}\n` +
323
- `[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames. Each character has a completely distinct face, hairstyle, body type and age — no two characters look alike.`;
324
- }
310
+ const master_cast_prompt = buildMasterCastPrompt(video_style, characters);
325
311
  return { video_summary, video_style, master_cast_prompt, characters };
326
312
  }
327
313
  catch {
328
314
  return empty;
329
315
  }
330
316
  }
317
// Single source of truth for the master-cast prompt block (used by Phase-1
// and the post-Phase-2 cast reconciliation so the format never drifts).
//
// Returns '' when there are no characters; otherwise four "[HEADING]\nbody"
// sections joined by newlines. An empty videoStyle falls back to a generic
// "keep the original look" style line.
function buildMasterCastPrompt(videoStyle, characters) {
  if (!characters.length) {
    return '';
  }
  const sections = [
    [
      '[AESTHETIC & STYLE]',
      videoStyle || 'Keep the original video’s visual style, color grading, and lighting.',
    ],
    [
      '[COMPOSITION & LAYOUT]',
      'Character Reference Sheet. Full-body side-by-side.',
    ],
    [
      '[CHARACTER CAST LIST]',
      characters.map((member) => `- ${member.name}: ${member.description}`).join('\n'),
    ],
    [
      '[TECHNICAL SPECIFICATIONS]',
      'High detail, 8k resolution, consistent facial structures across all frames. Each character has a completely distinct face, hairstyle, body type and age — no two characters look alike.',
    ],
  ];
  return sections.map(([heading, body]) => `${heading}\n${body}`).join('\n');
}
329
// Post-Phase-2 cast RECONCILE. Phase-1 only sees a small sample of frames so
// it can miss a recurring character; the per-scene visionDescribe pass,
// however, looked at EVERY scene. Feed all per-scene descriptions (+ Phase-1
// cast as a trusted seed) to one gpt-4o text call to produce the definitive
// cast: keep seed entries, merge duplicates, ADD any recurring subject
// Phase-1 missed.
//
// Params:
//   seed              - Phase-1 cast, array of { name, description }
//   sceneDescriptions - one visual description string per scene, in order
//   transcript        - transcript text (first 1500 chars are sent)
//   videoStyle        - style line forwarded to buildMasterCastPrompt
//   cost              - optional cost tracker; cost.chat() is called on success
// Returns { characters, master_cast_prompt }, or null when there is no OpenAI
// key, fewer than 3 non-empty scene descriptions, or on any failure → caller
// keeps the Phase-1 cast.
async function reconcileCast(seed, sceneDescriptions, transcript, videoStyle, cost) {
  if (!OPENAI_KEY)
    return null;
  try {
    // Input normalisation lives inside the try so malformed input yields
    // null (the documented contract) instead of throwing into the caller.
    const seedCast = Array.isArray(seed) ? seed : [];
    const descs = (Array.isArray(sceneDescriptions) ? sceneDescriptions : [])
      .map(s => String(s || '').trim())
      .filter(Boolean);
    if (descs.length < 3)
      return null;
    // Bound tokens: cap each scene line + overall.
    const joined = descs
      .map((d, i) => `S${i + 1}: ${d.slice(0, 240)}`)
      .join('\n')
      .slice(0, 60000);
    const seedBlock = seedCast.length
      ? seedCast.map(c => `- ${c.name}: ${c.description}`).join('\n')
      : '(none detected yet)';
    const prompt = `You are reconciling the definitive recurring CHARACTER CAST of a video from per-scene visual descriptions that cover EVERY scene (the seed cast below was extracted from only a few sampled frames and may be INCOMPLETE).

SEED CAST (trusted — keep these, refine wording if scenes add detail):
${seedBlock}

PER-SCENE DESCRIPTIONS (every scene, in order):
${joined}

Transcript (may name characters): "${(transcript || '').slice(0, 1500)}"

Return ONLY a JSON object, no markdown:
{ "characters": [ { "name": "SHORT_UPPERCASE_LABEL", "description": "age/build, face, hair, outfit, colors, distinguishing features (English, factual, no camera/action words)" } ] }

Rules:
- Start from the SEED CAST; KEEP every seed character (don't drop them).
- ADD any recurring subject that appears in multiple scenes but is missing from the seed (e.g. a family member, a recurring animal/mascot/object). Missing a recurring character is the main failure to avoid.
- Merge entries that clearly refer to the SAME subject under one name.
- If a group/family recurs, list EACH member SEPARATELY (e.g. adult man, adult woman, older boy, younger girl) — never merge them.
- Skip true one-off background extras.
- DISTINCT FACES: every character must have a UNIQUE facial structure, hairstyle, body type and a clearly different age — never reuse a similar facial description for two characters.
- Stable short UPPERCASE name, max 4 words. Up to 8 characters total.`;
    const res = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
      body: JSON.stringify({
        model: 'gpt-4o',
        max_tokens: 1200,
        messages: [{ role: 'user', content: prompt }],
      }),
    });
    if (!res.ok)
      return null;
    const data = await res.json();
    cost?.chat('cast', 'gpt-4o', data.usage);
    const raw = (data.choices?.[0]?.message?.content || '').trim();
    // The model may wrap the JSON in prose/markdown: take the outermost
    // {...} span. A parse error falls through to the catch below → null.
    const m = raw.match(/\{[\s\S]*\}/);
    const parsed = JSON.parse(m ? m[0] : raw);
    const rawCast = Array.isArray(parsed?.characters) ? parsed.characters : [];
    // Trim BEFORE the emptiness check so whitespace-only names/descriptions
    // are dropped rather than kept as empty strings.
    const characters = rawCast
      .filter(c => c && c.name && c.description)
      .map(c => ({ name: String(c.name).trim(), description: String(c.description).trim() }))
      .filter(c => c.name && c.description)
      .slice(0, 8);
    if (!characters.length)
      return null;
    return { characters, master_cast_prompt: buildMasterCastPrompt(videoStyle, characters) };
  }
  catch (err) {
    // Best-effort step: surface the reason, but never fail the analysis.
    console.warn('[analyze_video] Cast reconcile failed:', err?.message || err);
    return null;
  }
}
331
407
  export async function analyzeVideo(url, onProgress) {
332
408
  const progress = onProgress || (() => { });
333
409
  const cost = new CostTracker();
@@ -518,16 +594,19 @@ export async function analyzeVideo(url, onProgress) {
518
594
  // master cast + characters. Runs before per-scene describe so the cast
519
595
  // context keeps naming consistent across the whole timeline.
520
596
  progress('Đang phân tích tổng thể (summary + style + master cast)...');
521
- // Sample up to 30 frames evenly across the whole video — denser sampling
522
- // is critical for cast RECALL on sparse-narration (ASMR) videos where
523
- // Phase-1 relies almost entirely on frames (matches AI_Video_Clone).
524
- const p1SampleCount = Math.min(30, frameBuffers.length);
597
+ // Sample up to 10 frames evenly — enough for summary + style + a naming
598
+ // seed. Cast RECALL no longer depends on this sample: the post-Phase-2
599
+ // reconcile pass derives the definitive cast from every per-scene
600
+ // description, so a small sample keeps the (now Gemini 2.5 Flash) Phase-1
601
+ // call cheap.
602
+ const p1SampleCount = Math.min(10, frameBuffers.length);
525
603
  const p1Step = Math.max(1, Math.floor(frameBuffers.length / p1SampleCount));
526
604
  const p1Samples = frameBuffers
527
605
  .filter((_, i) => i % p1Step === 0)
528
606
  .slice(0, p1SampleCount)
529
607
  .map(f => f.thumb.toString('base64'));
530
- const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '', cost);
608
+ // eslint-disable-next-line prefer-const
609
+ let { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '', cost);
531
610
  const castContext = characters.length
532
611
  ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
533
612
  : '';
@@ -561,6 +640,16 @@ export async function analyzeVideo(url, onProgress) {
561
640
  sceneResults.push(...results.filter((r) => r !== null));
562
641
  }
563
642
  const scenes = sceneResults.sort((a, b) => a.scene_number - b.scene_number);
643
+ // Reconcile cast from EVERY per-scene description (recall fix — Phase-1
644
+ // only saw a sampled set of frames). Falls back to Phase-1 cast on failure.
645
+ progress('Đang đối soát dàn nhân vật từ toàn bộ scene...');
646
+ const reconciled = await reconcileCast(characters, scenes.map(s => s.visual_description || ''), transcript.text || '', video_style, cost);
647
+ if (reconciled && reconciled.characters.length) {
648
+ console.log('[analyze_video] Cast reconciled:', (characters.map(c => c.name).join(', ') || '(none)'), '→', reconciled.characters.map(c => c.name).join(', '));
649
+ characters = reconciled.characters;
650
+ if (reconciled.master_cast_prompt)
651
+ master_cast_prompt = reconciled.master_cast_prompt;
652
+ }
564
653
  return {
565
654
  source_title,
566
655
  duration_sec: Math.round(durationSec),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.147",
3
+ "version": "0.1.149",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"