tuna-agent 0.1.148 → 0.1.150

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,6 +29,8 @@ const RATES = {
29
29
  'gpt-4o': { in: 2.50, out: 10.0 },
30
30
  // Gemini 3 Flash preview: text+image input share one rate, output 6x.
31
31
  'gemini-3-flash-preview': { in: 0.50, out: 3.0 },
32
+ // Gemini 2.5 Flash: cheaper image-heavy reads (used by Phase-1).
33
+ 'gemini-2.5-flash': { in: 0.30, out: 2.50 },
32
34
  };
33
35
  // Per-run cost+token accumulator. Single-threaded JS → plain mutation is safe
34
36
  // even across the parallel visionDescribe calls.
@@ -50,12 +52,12 @@ class CostTracker {
50
52
  }
51
53
  // Gemini reports usageMetadata.{promptTokenCount,candidatesTokenCount}
52
54
  // instead of OpenAI's prompt_tokens/completion_tokens.
53
- geminiVision(bucket, usage) {
55
+ geminiVision(bucket, usage, model = 'gemini-3-flash-preview') {
54
56
  if (!usage) {
55
57
  this.add(bucket, 0);
56
58
  return;
57
59
  }
58
- const r = RATES['gemini-3-flash-preview'];
60
+ const r = RATES[model];
59
61
  const cost = ((usage.promptTokenCount || 0) / 1e6) * r.in + ((usage.candidatesTokenCount || 0) / 1e6) * r.out;
60
62
  this.add(bucket, cost);
61
63
  }
@@ -174,12 +176,20 @@ ${rawText}`,
174
176
  // One Gemini generateContent call with key rotation + exponential backoff on
175
177
  // 429/5xx. A single free-tier key under the 5-way concurrent batch WILL
176
178
  // rate-limit; retrying (slower) beats dropping the scene description.
177
- async function geminiGenerate(parts, maxOutputTokens) {
179
+ async function geminiGenerate(parts, maxOutputTokens, model = GEMINI_MODEL) {
178
180
  if (!GEMINI_KEYS.length)
179
181
  return { text: '' };
182
+ const generationConfig = { maxOutputTokens, temperature: 0.4 };
183
+ // Gemini 2.5 Flash is a thinking model: with thinking ON it burns the whole
184
+ // maxOutputTokens budget on internal reasoning and returns EMPTY text. These
185
+ // are extraction/description tasks (no CoT needed) → disable thinking so the
186
+ // budget goes to the actual answer.
187
+ if (model.includes('2.5-flash')) {
188
+ generationConfig.thinkingConfig = { thinkingBudget: 0 };
189
+ }
180
190
  const body = JSON.stringify({
181
191
  contents: [{ parts }],
182
- generationConfig: { maxOutputTokens, temperature: 0.4 },
192
+ generationConfig,
183
193
  });
184
194
  const MAX_ATTEMPTS = 6;
185
195
  let keyIdx = 0;
@@ -187,7 +197,7 @@ async function geminiGenerate(parts, maxOutputTokens) {
187
197
  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
188
198
  const key = GEMINI_KEYS[keyIdx % GEMINI_KEYS.length];
189
199
  try {
190
- const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(GEMINI_MODEL)}:generateContent?key=${key}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body });
200
+ const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(model)}:generateContent?key=${key}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body });
191
201
  if (res.status === 429 || res.status >= 500) {
192
202
  lastErr = `Gemini ${res.status}`;
193
203
  keyIdx++; // rotate to the next key before backing off
@@ -259,13 +269,10 @@ async function visionExtractPhase1(frames, transcript, cost) {
259
269
  master_cast_prompt: '',
260
270
  characters: [],
261
271
  };
262
- if (!OPENAI_KEY || frames.length === 0)
272
+ if (!GEMINI_KEYS.length || frames.length === 0)
263
273
  return empty;
264
274
  try {
265
- const content = [
266
- {
267
- type: 'text',
268
- text: `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
275
+ const promptText = `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
269
276
 
270
277
  Transcript context: "${(transcript || '').slice(0, 4000)}"
271
278
 
@@ -282,22 +289,18 @@ Rules:
282
289
  - characters.name: a stable short uppercase label reused for this subject (e.g. "THE BISHOP", "U-94 SUBMARINE"). Max 4 words.
283
290
  - RECALL (CRITICAL): list EVERY distinct recurring subject SEPARATELY. If a family or group recurs, include EACH member as its own entry (e.g. adult man, adult woman, older boy, younger girl) — never merge them into one. Skip only true one-off background extras. Be COMPLETE: missing a recurring character is worse than one extra. Up to 8.
284
291
  - characters.description: ENGLISH only, factual, no camera/action words.
285
- - DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`,
286
- },
292
+ - DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`;
293
+ // Phase-1 on Gemini 2.5 Flash: image-heavy read is far cheaper than gpt-4o,
294
+ // and cast recall is backstopped by the post-Phase-2 reconcile pass, so a
295
+ // small frame sample suffices here.
296
+ const parts = [
297
+ { text: promptText },
298
+ ...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
287
299
  ];
288
- for (const b64 of frames) {
289
- content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
290
- }
291
- const res = await fetch('https://api.openai.com/v1/chat/completions', {
292
- method: 'POST',
293
- headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
294
- body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
295
- });
296
- if (!res.ok)
300
+ const { text: rawTxt, usage } = await geminiGenerate(parts, 1600, 'gemini-2.5-flash');
301
+ if (!rawTxt)
297
302
  return empty;
298
- const data = await res.json();
299
- cost?.chat('phase1', 'gpt-4o', data.usage);
300
- const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
303
+ cost?.geminiVision('phase1', usage, 'gemini-2.5-flash');
301
304
  let parsed = {};
302
305
  try {
303
306
  const m = rawTxt.match(/\{[\s\S]*\}/);
@@ -599,10 +602,12 @@ export async function analyzeVideo(url, onProgress) {
599
602
  // master cast + characters. Runs before per-scene describe so the cast
600
603
  // context keeps naming consistent across the whole timeline.
601
604
  progress('Đang phân tích tổng thể (summary + style + master cast)...');
602
- // Sample up to 30 frames evenly across the whole video — denser sampling
603
- // is critical for cast RECALL on sparse-narration (ASMR) videos where
604
- // Phase-1 relies almost entirely on frames (matches AI_Video_Clone).
605
- const p1SampleCount = Math.min(30, frameBuffers.length);
605
+ // Sample up to 10 frames evenly — enough for summary + style + a naming
606
+ // seed. Cast RECALL no longer depends on this sample: the post-Phase-2
607
+ // reconcile pass derives the definitive cast from every per-scene
608
+ // description, so a small sample keeps the (now Gemini 2.5 Flash) Phase-1
609
+ // call cheap.
610
+ const p1SampleCount = Math.min(10, frameBuffers.length);
606
611
  const p1Step = Math.max(1, Math.floor(frameBuffers.length / p1SampleCount));
607
612
  const p1Samples = frameBuffers
608
613
  .filter((_, i) => i % p1Step === 0)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.148",
3
+ "version": "0.1.150",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"