tuna-agent 0.1.123 → 0.1.125
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
|
@@ -82,11 +82,19 @@ async function visionDescribe(frameB64, voiceoverText) {
|
|
|
82
82
|
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
83
83
|
body: JSON.stringify({
|
|
84
84
|
model: 'gpt-4o-mini',
|
|
85
|
-
max_tokens:
|
|
85
|
+
max_tokens: 300,
|
|
86
86
|
messages: [{
|
|
87
87
|
role: 'user',
|
|
88
88
|
content: [
|
|
89
|
-
{ type: 'text', text: `
|
|
89
|
+
{ type: 'text', text: `Describe this frame in detail (4-6 sentences, English). Include:
|
|
90
|
+
- Characters: appearance (shape, color, size), facial expression, what they're doing
|
|
91
|
+
- Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
|
|
92
|
+
- Spatial positions: exact position of each character (left/right/above/below/center), distance between them
|
|
93
|
+
- Environment: setting, lighting, color palette, atmosphere
|
|
94
|
+
- Camera: angle, framing (close-up, wide, etc.)
|
|
95
|
+
- Action: what is happening in this moment, movement direction
|
|
96
|
+
|
|
97
|
+
Voiceover at this moment: "${voiceoverText || '(none)'}"` },
|
|
90
98
|
{ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${frameB64}` } },
|
|
91
99
|
],
|
|
92
100
|
}],
|