tuna-agent 0.1.152 → 0.1.153
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -269,7 +269,7 @@ async function visionExtractPhase1(frames, transcript, cost) {
|
|
|
269
269
|
master_cast_prompt: '',
|
|
270
270
|
characters: [],
|
|
271
271
|
};
|
|
272
|
-
if (!
|
|
272
|
+
if (!GEMINI_KEYS.length || frames.length === 0)
|
|
273
273
|
return empty;
|
|
274
274
|
try {
|
|
275
275
|
const promptText = `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
|
|
@@ -291,25 +291,18 @@ Rules:
|
|
|
291
291
|
- NO GROUP ENTRIES (CRITICAL): NEVER output a collective/crowd label as a single entry — forbidden: "VILLAGERS", "LADIES GROUP", "KNITTING GROUP", "CROWD", "GROUP OF ...", any "*_GROUP". If 2+ similar secondary people RECUR across scenes, list them as SEPARATE numbered individuals (e.g. WOMAN_1, WOMAN_2, WOMAN_3), each with its OWN distinct face/hair/body/age. Only a truly anonymous one-off background that never recurs may be omitted entirely.
|
|
292
292
|
- characters.description: ENGLISH only, factual, no camera/action words.
|
|
293
293
|
- DISTINCT FACES (CRITICAL): every character MUST have a HIGHLY UNIQUE facial structure, a distinct hairstyle, a specific body type and a clearly different age. NEVER reuse the same or a similar facial description for two characters — they must look completely different from one another.`;
|
|
294
|
-
// Phase-1
|
|
295
|
-
//
|
|
296
|
-
//
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
304
|
-
body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
|
|
305
|
-
});
|
|
306
|
-
if (!res.ok)
|
|
307
|
-
return empty;
|
|
308
|
-
const data = await res.json();
|
|
309
|
-
cost?.chat('phase1', 'gpt-4o', data.usage);
|
|
310
|
-
const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
|
|
294
|
+
// Phase-1 on Gemini 3 Flash (strong multimodal, far cheaper image tokens
|
|
295
|
+
// than gpt-4o) with a dense 30-frame seed. 1 call/video; final cast
|
|
296
|
+
// recall is double-covered by the reconcile pass. Generous output budget
|
|
297
|
+
// so any model-side thinking can't starve the JSON answer.
|
|
298
|
+
const parts = [
|
|
299
|
+
{ text: promptText },
|
|
300
|
+
...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
|
|
301
|
+
];
|
|
302
|
+
const { text: rawTxt, usage } = await geminiGenerate(parts, 3000, 'gemini-3-flash-preview');
|
|
311
303
|
if (!rawTxt)
|
|
312
304
|
return empty;
|
|
305
|
+
cost?.geminiVision('phase1', usage, 'gemini-3-flash-preview');
|
|
313
306
|
let parsed = {};
|
|
314
307
|
try {
|
|
315
308
|
const m = rawTxt.match(/\{[\s\S]*\}/);
|