tuna-agent 0.1.143 → 0.1.145
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Analyze Video Handler
|
|
3
3
|
* Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
|
|
4
|
-
* extracts frames per segment, describes each
|
|
4
|
+
* extracts frames per segment, describes each scene via Gemini 3 Flash vision
|
|
5
|
+
* (per-scene; Phase-1 summary/style/cast still uses gpt-4o).
|
|
5
6
|
*
|
|
6
|
-
* Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
|
|
7
|
+
* Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
|
|
8
|
+
* (Whisper + correction + Phase-1) and GEMINI_API_KEY (per-scene vision) env vars.
|
|
7
9
|
*/
|
|
8
10
|
import { AgentWebSocketClient } from './ws-client.js';
|
|
9
11
|
export interface AnalyzeVideoResult {
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Analyze Video Handler
|
|
3
3
|
* Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
|
|
4
|
-
* extracts frames per segment, describes each
|
|
4
|
+
* extracts frames per segment, describes each scene via Gemini 3 Flash vision
|
|
5
|
+
* (per-scene; Phase-1 summary/style/cast still uses gpt-4o).
|
|
5
6
|
*
|
|
6
|
-
* Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
|
|
7
|
+
* Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
|
|
8
|
+
* (Whisper + correction + Phase-1) and GEMINI_API_KEY (per-scene vision) env vars.
|
|
7
9
|
*/
|
|
8
10
|
import { spawn } from 'child_process';
|
|
9
11
|
import { promises as fs } from 'fs';
|
|
@@ -11,6 +13,12 @@ import path from 'path';
|
|
|
11
13
|
import os from 'os';
|
|
12
14
|
import crypto from 'crypto';
|
|
13
15
|
const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
|
|
16
|
+
// Gemini 3 Flash powers per-scene visionDescribe: cheaper image tokens than
|
|
17
|
+
// gpt-4o-mini's ~33x multiplier and a stronger VLM. Comma/newline-separated
|
|
18
|
+
// list → rotate on 429 so a single free-tier key still completes (slower).
|
|
19
|
+
const GEMINI_KEYS = (process.env.GEMINI_API_KEY || '')
|
|
20
|
+
.split(/[,\n]+/).map(s => s.trim()).filter(Boolean);
|
|
21
|
+
const GEMINI_MODEL = process.env.GEMINI_MODEL || 'gemini-3-flash-preview';
|
|
14
22
|
const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
|
|
15
23
|
const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
|
|
16
24
|
const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
|
|
@@ -19,6 +27,8 @@ const RATES = {
|
|
|
19
27
|
whisperPerMin: 0.006,
|
|
20
28
|
'gpt-4o-mini': { in: 0.15, out: 0.60 },
|
|
21
29
|
'gpt-4o': { in: 2.50, out: 10.0 },
|
|
30
|
+
// Gemini 3 Flash preview: text+image input share one rate, output 6x.
|
|
31
|
+
'gemini-3-flash-preview': { in: 0.50, out: 3.0 },
|
|
22
32
|
};
|
|
23
33
|
// Per-run cost+token accumulator. Single-threaded JS → plain mutation is safe
|
|
24
34
|
// even across the parallel visionDescribe calls.
|
|
@@ -38,6 +48,17 @@ class CostTracker {
|
|
|
38
48
|
const cost = ((usage.prompt_tokens || 0) / 1e6) * r.in + ((usage.completion_tokens || 0) / 1e6) * r.out;
|
|
39
49
|
this.add(bucket, cost);
|
|
40
50
|
}
|
|
51
|
+
// Gemini reports usageMetadata.{promptTokenCount,candidatesTokenCount}
|
|
52
|
+
// instead of OpenAI's prompt_tokens/completion_tokens.
|
|
53
|
+
geminiVision(bucket, usage) {
|
|
54
|
+
if (!usage) {
|
|
55
|
+
this.add(bucket, 0);
|
|
56
|
+
return;
|
|
57
|
+
}
|
|
58
|
+
const r = RATES['gemini-3-flash-preview'];
|
|
59
|
+
const cost = ((usage.promptTokenCount || 0) / 1e6) * r.in + ((usage.candidatesTokenCount || 0) / 1e6) * r.out;
|
|
60
|
+
this.add(bucket, cost);
|
|
61
|
+
}
|
|
41
62
|
whisper(audioSec) { this.add('whisper', (audioSec / 60) * RATES.whisperPerMin); }
|
|
42
63
|
total() { return Object.values(this.breakdown).reduce((s, b) => s + b.cost, 0); }
|
|
43
64
|
}
|
|
@@ -150,8 +171,49 @@ ${rawText}`,
|
|
|
150
171
|
// turns") instead of guessing from a single frozen midpoint. The model is
|
|
151
172
|
// told the frames are chronological so it describes the action arc, not 3
|
|
152
173
|
// separate moments.
|
|
174
|
+
// One Gemini generateContent call with key rotation + exponential backoff on
|
|
175
|
+
// 429/5xx. A single free-tier key under the 5-way concurrent batch WILL
|
|
176
|
+
// rate-limit; retrying (slower) beats dropping the scene description.
|
|
177
|
+
async function geminiGenerate(parts, maxOutputTokens) {
|
|
178
|
+
if (!GEMINI_KEYS.length)
|
|
179
|
+
return { text: '' };
|
|
180
|
+
const body = JSON.stringify({
|
|
181
|
+
contents: [{ parts }],
|
|
182
|
+
generationConfig: { maxOutputTokens, temperature: 0.4 },
|
|
183
|
+
});
|
|
184
|
+
const MAX_ATTEMPTS = 6;
|
|
185
|
+
let keyIdx = 0;
|
|
186
|
+
let lastErr = '';
|
|
187
|
+
for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
|
|
188
|
+
const key = GEMINI_KEYS[keyIdx % GEMINI_KEYS.length];
|
|
189
|
+
try {
|
|
190
|
+
const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(GEMINI_MODEL)}:generateContent?key=${key}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body });
|
|
191
|
+
if (res.status === 429 || res.status >= 500) {
|
|
192
|
+
lastErr = `Gemini ${res.status}`;
|
|
193
|
+
keyIdx++; // rotate to the next key before backing off
|
|
194
|
+
const backoff = Math.min(60000, 1500 * 2 ** attempt) + Math.floor(Math.random() * 1000);
|
|
195
|
+
await new Promise(r => setTimeout(r, backoff));
|
|
196
|
+
continue;
|
|
197
|
+
}
|
|
198
|
+
if (!res.ok) {
|
|
199
|
+
lastErr = `Gemini ${res.status}: ${(await res.text()).slice(0, 200)}`;
|
|
200
|
+
break;
|
|
201
|
+
}
|
|
202
|
+
const data = await res.json();
|
|
203
|
+
const text = (data?.candidates?.[0]?.content?.parts || [])
|
|
204
|
+
.map(p => p.text || '').join('').trim();
|
|
205
|
+
return { text, usage: data?.usageMetadata };
|
|
206
|
+
}
|
|
207
|
+
catch (e) {
|
|
208
|
+
lastErr = e instanceof Error ? e.message : String(e);
|
|
209
|
+
await new Promise(r => setTimeout(r, Math.min(30000, 1000 * 2 ** attempt)));
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
console.warn(`[analyze_video] geminiGenerate failed after retries: ${lastErr}`);
|
|
213
|
+
return { text: '' };
|
|
214
|
+
}
|
|
153
215
|
async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost) {
|
|
154
|
-
if (!
|
|
216
|
+
if (!GEMINI_KEYS.length)
|
|
155
217
|
return '';
|
|
156
218
|
const frames = frameB64s.filter(Boolean);
|
|
157
219
|
if (frames.length === 0)
|
|
@@ -162,20 +224,7 @@ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost)
|
|
|
162
224
|
const seqNote = frames.length > 1
|
|
163
225
|
? ` The ${frames.length} images are CHRONOLOGICAL samples from the SAME scene (start → middle → end). Treat them as one continuous shot: describe the MOTION ARC across them (direction of movement, camera push/pan, what changes from first to last frame), not three separate moments.`
|
|
164
226
|
: '';
|
|
165
|
-
const
|
|
166
|
-
type: 'image_url',
|
|
167
|
-
image_url: { url: `data:image/jpeg;base64,${b64}` },
|
|
168
|
-
}));
|
|
169
|
-
const res = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
170
|
-
method: 'POST',
|
|
171
|
-
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
172
|
-
body: JSON.stringify({
|
|
173
|
-
model: 'gpt-4o-mini',
|
|
174
|
-
max_tokens: 350,
|
|
175
|
-
messages: [{
|
|
176
|
-
role: 'user',
|
|
177
|
-
content: [
|
|
178
|
-
{ type: 'text', text: `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
|
|
227
|
+
const promptText = `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
|
|
179
228
|
- Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
|
|
180
229
|
- Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
|
|
181
230
|
- Spatial positions: exact position of each character (left/right/above/below/center), distance between them
|
|
@@ -183,17 +232,14 @@ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost)
|
|
|
183
232
|
- Camera: angle, framing (close-up, wide, etc.), and any camera movement across the frames
|
|
184
233
|
- Action: the movement/action arc from first to last frame (direction, what changes)
|
|
185
234
|
|
|
186
|
-
Voiceover during this scene: "${voiceoverText || '(none)'}"
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
});
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
const data = await res.json();
|
|
195
|
-
cost?.chat('vision', 'gpt-4o-mini', data.usage);
|
|
196
|
-
return data.choices?.[0]?.message?.content?.trim() || '';
|
|
235
|
+
Voiceover during this scene: "${voiceoverText || '(none)'}"`;
|
|
236
|
+
const parts = [
|
|
237
|
+
{ text: promptText },
|
|
238
|
+
...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
|
|
239
|
+
];
|
|
240
|
+
const { text, usage } = await geminiGenerate(parts, 512);
|
|
241
|
+
cost?.geminiVision('vision', usage);
|
|
242
|
+
return text;
|
|
197
243
|
}
|
|
198
244
|
// Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
|
|
199
245
|
// frames sampled across the whole video + transcript that returns, together:
|
package/dist/mcp/idea-server.js
CHANGED
|
@@ -65,6 +65,8 @@ const TOOLS = [
|
|
|
65
65
|
properties: {
|
|
66
66
|
name: { type: 'string', description: 'Name/title of the idea' },
|
|
67
67
|
category: { type: 'string', description: 'Category of the idea (e.g. SaaS, Mobile App, AI Tool, etc.)' },
|
|
68
|
+
startup: { type: 'string', enum: ['tuctac', 'capple', 'thinkless'], description: 'Which startup this idea is scoped to (REQUIRED for scoped scans)' },
|
|
69
|
+
idea_kind: { type: 'string', enum: ['feature', 'improvement', 'expansion', 'counter-competitor'], description: 'Kind of idea relative to the startup (optional)' },
|
|
68
70
|
source: { type: 'string', enum: ['trending', 'research', 'social', 'creative'], description: 'Where the idea came from' },
|
|
69
71
|
strategy: { type: 'string', description: 'High-level strategy or approach for this idea (optional)' },
|
|
70
72
|
url: { type: 'string', description: 'Reference URL related to the idea (optional)' },
|
|
@@ -87,6 +89,8 @@ const TOOLS = [
|
|
|
87
89
|
idea_id: { type: 'string', description: 'The ID of the idea to update' },
|
|
88
90
|
name: { type: 'string', description: 'New name (optional)' },
|
|
89
91
|
category: { type: 'string', description: 'New category (optional)' },
|
|
92
|
+
startup: { type: 'string', enum: ['tuctac', 'capple', 'thinkless'], description: 'New startup scope (optional)' },
|
|
93
|
+
idea_kind: { type: 'string', enum: ['feature', 'improvement', 'expansion', 'counter-competitor'], description: 'New idea kind (optional)' },
|
|
90
94
|
status: { type: 'string', enum: ['new', 'watching', 'validated', 'archived'], description: 'New status (optional)' },
|
|
91
95
|
score: { type: 'number', description: 'New score (optional)' },
|
|
92
96
|
notes: { type: 'string', description: 'New notes (optional)' },
|
|
@@ -109,6 +113,8 @@ const TOOLS = [
|
|
|
109
113
|
properties: {
|
|
110
114
|
status: { type: 'string', enum: ['new', 'watching', 'validated', 'archived'], description: 'Filter by status (optional)' },
|
|
111
115
|
category: { type: 'string', description: 'Filter by category (optional)' },
|
|
116
|
+
startup: { type: 'string', enum: ['tuctac', 'capple', 'thinkless'], description: 'Filter by startup (optional)' },
|
|
117
|
+
idea_kind: { type: 'string', description: 'Filter by idea kind (optional)' },
|
|
112
118
|
source: { type: 'string', enum: ['trending', 'research', 'social', 'creative'], description: 'Filter by source (optional)' },
|
|
113
119
|
agent_id: { type: 'string', description: 'Filter by agent ID (optional, defaults to current agent)' },
|
|
114
120
|
page: { type: 'number', description: 'Page number (optional, default 1)' },
|
|
@@ -250,6 +256,10 @@ async function handleToolCall(config, toolName, args) {
|
|
|
250
256
|
body.competitors = args.competitors.split(',').map((c) => c.trim());
|
|
251
257
|
if (args.notes)
|
|
252
258
|
body.notes = args.notes;
|
|
259
|
+
if (args.startup)
|
|
260
|
+
body.startup = args.startup;
|
|
261
|
+
if (args.idea_kind)
|
|
262
|
+
body.idea_kind = args.idea_kind;
|
|
253
263
|
const data = await apiCall(config, 'POST', '/agent-idea', body);
|
|
254
264
|
return { content: [{ type: 'text', text: `Idea "${data.name}" created (ID: ${data._id})` }] };
|
|
255
265
|
}
|
|
@@ -282,6 +292,10 @@ async function handleToolCall(config, toolName, args) {
|
|
|
282
292
|
body.competitors = args.competitors.split(',').map((c) => c.trim());
|
|
283
293
|
if (args.times_seen)
|
|
284
294
|
body.times_seen = Number(args.times_seen);
|
|
295
|
+
if (args.startup)
|
|
296
|
+
body.startup = args.startup;
|
|
297
|
+
if (args.idea_kind)
|
|
298
|
+
body.idea_kind = args.idea_kind;
|
|
285
299
|
const data = await apiCall(config, 'PUT', `/agent-idea/${args.idea_id}`, body);
|
|
286
300
|
return { content: [{ type: 'text', text: `Idea "${data.name}" updated (ID: ${data._id})` }] };
|
|
287
301
|
}
|
|
@@ -293,6 +307,10 @@ async function handleToolCall(config, toolName, args) {
|
|
|
293
307
|
params.set('status', args.status);
|
|
294
308
|
if (args.category)
|
|
295
309
|
params.set('category', args.category);
|
|
310
|
+
if (args.startup)
|
|
311
|
+
params.set('startup', args.startup);
|
|
312
|
+
if (args.idea_kind)
|
|
313
|
+
params.set('idea_kind', args.idea_kind);
|
|
296
314
|
if (args.source)
|
|
297
315
|
params.set('source', args.source);
|
|
298
316
|
if (args.page)
|