tuna-agent 0.1.152 → 0.1.154
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -247,8 +247,10 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"`;
|
|
|
247
247
|
{ text: promptText },
|
|
248
248
|
...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
|
|
249
249
|
];
|
|
250
|
-
|
|
251
|
-
|
|
250
|
+
// Gemini 2.5 Flash (cheapest). geminiGenerate disables thinking for
|
|
251
|
+
// 2.5-flash so the 512 budget isn't starved → non-empty descriptions.
|
|
252
|
+
const { text, usage } = await geminiGenerate(parts, 512, 'gemini-2.5-flash');
|
|
253
|
+
cost?.geminiVision('vision', usage, 'gemini-2.5-flash');
|
|
252
254
|
return text;
|
|
253
255
|
}
|
|
254
256
|
// Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
|
|
@@ -269,7 +271,7 @@ async function visionExtractPhase1(frames, transcript, cost) {
|
|
|
269
271
|
master_cast_prompt: '',
|
|
270
272
|
characters: [],
|
|
271
273
|
};
|
|
272
|
-
if (!
|
|
274
|
+
if (!GEMINI_KEYS.length || frames.length === 0)
|
|
273
275
|
return empty;
|
|
274
276
|
try {
|
|
275
277
|
const promptText = `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
|
|
@@ -291,25 +293,18 @@ Rules:
|
|
|
291
293
|
- NO GROUP ENTRIES (CRITICAL): NEVER output a collective/crowd label as a single entry — forbidden: "VILLAGERS", "LADIES GROUP", "KNITTING GROUP", "CROWD", "GROUP OF ...", any "*_GROUP". If 2+ similar secondary people RECUR across scenes, list them as SEPARATE numbered individuals (e.g. WOMAN_1, WOMAN_2, WOMAN_3), each with its OWN distinct face/hair/body/age. Only a truly anonymous one-off background that never recurs may be omitted entirely.
|
|
292
294
|
- characters.description: ENGLISH only, factual, no camera/action words.
|
|
293
295
|
- DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`;
|
|
294
|
-
// Phase-1
|
|
295
|
-
// spine for the cloned script) with a dense 30-frame seed. Only 1
|
|
296
|
+
// Phase-1 on Gemini 2.5 Flash (cheapest) with a dense 30-frame seed. 1
|
|
296
297
|
// call/video; final cast recall is double-covered by the reconcile pass.
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
|
|
305
|
-
});
|
|
306
|
-
if (!res.ok)
|
|
307
|
-
return empty;
|
|
308
|
-
const data = await res.json();
|
|
309
|
-
cost?.chat('phase1', 'gpt-4o', data.usage);
|
|
310
|
-
const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
|
|
298
|
+
// geminiGenerate disables thinking for 2.5-flash + generous 3000 output
|
|
299
|
+
// budget so the JSON answer is never starved (was empty without this).
|
|
300
|
+
const parts = [
|
|
301
|
+
{ text: promptText },
|
|
302
|
+
...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
|
|
303
|
+
];
|
|
304
|
+
const { text: rawTxt, usage } = await geminiGenerate(parts, 3000, 'gemini-2.5-flash');
|
|
311
305
|
if (!rawTxt)
|
|
312
306
|
return empty;
|
|
307
|
+
cost?.geminiVision('phase1', usage, 'gemini-2.5-flash');
|
|
313
308
|
let parsed = {};
|
|
314
309
|
try {
|
|
315
310
|
const m = rawTxt.match(/\{[\s\S]*\}/);
|