tuna-agent 0.1.143 → 0.1.145

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,11 @@
1
1
  /**
2
2
  * Analyze Video Handler
3
3
  * Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
4
- * extracts frames per segment, describes each frame via GPT-4o vision.
4
+ * extracts frames per segment, describes each scene via Gemini 3 Flash vision
5
+ * (per-scene; Phase-1 summary/style/cast still uses gpt-4o).
5
6
  *
6
- * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY env var.
7
+ * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
8
+ * (Whisper + correction + Phase-1) and GEMINI_API_KEY (per-scene vision) env vars.
7
9
  */
8
10
  import { AgentWebSocketClient } from './ws-client.js';
9
11
  export interface AnalyzeVideoResult {
@@ -1,9 +1,11 @@
1
1
  /**
2
2
  * Analyze Video Handler
3
3
  * Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
4
- * extracts frames per segment, describes each frame via GPT-4o vision.
4
+ * extracts frames per segment, describes each scene via Gemini 3 Flash vision
5
+ * (per-scene; Phase-1 summary/style/cast still uses gpt-4o).
5
6
  *
6
- * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY env var.
7
+ * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY
8
+ * (Whisper + correction + Phase-1) and GEMINI_API_KEY (per-scene vision) env vars.
7
9
  */
8
10
  import { spawn } from 'child_process';
9
11
  import { promises as fs } from 'fs';
@@ -11,6 +13,12 @@ import path from 'path';
11
13
  import os from 'os';
12
14
  import crypto from 'crypto';
13
15
  const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
16
+ // Gemini 3 Flash powers per-scene visionDescribe: cheaper image tokens than
17
+ // gpt-4o-mini's ~33x multiplier and a stronger VLM. Comma/newline-separated
18
+ // list → rotate on 429 so a single free-tier key still completes (slower).
19
+ const GEMINI_KEYS = (process.env.GEMINI_API_KEY || '')
20
+ .split(/[,\n]+/).map(s => s.trim()).filter(Boolean);
21
+ const GEMINI_MODEL = process.env.GEMINI_MODEL || 'gemini-3-flash-preview';
14
22
  const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
15
23
  const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
16
24
  const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
@@ -19,6 +27,8 @@ const RATES = {
19
27
  whisperPerMin: 0.006,
20
28
  'gpt-4o-mini': { in: 0.15, out: 0.60 },
21
29
  'gpt-4o': { in: 2.50, out: 10.0 },
30
+ // Gemini 3 Flash preview: text+image input share one rate, output 6x.
31
+ 'gemini-3-flash-preview': { in: 0.50, out: 3.0 },
22
32
  };
23
33
  // Per-run cost+token accumulator. Single-threaded JS → plain mutation is safe
24
34
  // even across the parallel visionDescribe calls.
@@ -38,6 +48,17 @@ class CostTracker {
38
48
  const cost = ((usage.prompt_tokens || 0) / 1e6) * r.in + ((usage.completion_tokens || 0) / 1e6) * r.out;
39
49
  this.add(bucket, cost);
40
50
  }
51
+ // Gemini reports usageMetadata.{promptTokenCount,candidatesTokenCount}
52
+ // instead of OpenAI's prompt_tokens/completion_tokens.
53
+ geminiVision(bucket, usage) {
54
+ if (!usage) {
55
+ this.add(bucket, 0);
56
+ return;
57
+ }
58
+ const r = RATES['gemini-3-flash-preview'];
59
+ const cost = ((usage.promptTokenCount || 0) / 1e6) * r.in + ((usage.candidatesTokenCount || 0) / 1e6) * r.out;
60
+ this.add(bucket, cost);
61
+ }
41
62
  whisper(audioSec) { this.add('whisper', (audioSec / 60) * RATES.whisperPerMin); }
42
63
  total() { return Object.values(this.breakdown).reduce((s, b) => s + b.cost, 0); }
43
64
  }
@@ -150,8 +171,49 @@ ${rawText}`,
150
171
  // turns") instead of guessing from a single frozen midpoint. The model is
151
172
  // told the frames are chronological so it describes the action arc, not 3
152
173
  // separate moments.
174
/**
 * One Gemini generateContent call with key rotation + exponential backoff on
 * 429/5xx. A single free-tier key under a concurrent batch is expected to
 * rate-limit; retrying (slower) beats dropping the scene description.
 *
 * The API key is sent in the `x-goog-api-key` header rather than the URL
 * query string so it does not end up in proxy/access logs or error text.
 *
 * @param {Array<object>} parts - Gemini content parts (text and/or inlineData).
 * @param {number} maxOutputTokens - Output token cap for generationConfig.
 * @returns {Promise<{text: string, usage?: object}>} The concatenated, trimmed
 *   candidate text plus the response's `usageMetadata`. On any failure (no
 *   keys configured, non-retryable HTTP status, retries exhausted) resolves to
 *   `{ text: '' }` without `usage`; HTTP and network errors are not thrown.
 */
async function geminiGenerate(parts, maxOutputTokens) {
    if (!GEMINI_KEYS.length)
        return { text: '' };
    const url = `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(GEMINI_MODEL)}:generateContent`;
    const body = JSON.stringify({
        contents: [{ parts }],
        generationConfig: { maxOutputTokens, temperature: 0.4 },
    });
    const MAX_ATTEMPTS = 6;
    const sleep = (ms) => new Promise(r => setTimeout(r, ms));
    let keyIdx = 0;
    let lastErr = '';
    for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
        // No point waiting out a backoff when there is no retry left.
        const willRetry = attempt < MAX_ATTEMPTS - 1;
        const key = GEMINI_KEYS[keyIdx % GEMINI_KEYS.length];
        try {
            const res = await fetch(url, {
                method: 'POST',
                headers: { 'Content-Type': 'application/json', 'x-goog-api-key': key },
                body,
            });
            if (res.status === 429 || res.status >= 500) {
                lastErr = `Gemini ${res.status}`;
                keyIdx++; // rotate to the next key before backing off
                if (willRetry) {
                    // Exponential backoff capped at 60s, plus up to 1s of jitter
                    // so concurrent callers do not retry in lockstep.
                    const backoff = Math.min(60000, 1500 * 2 ** attempt) + Math.floor(Math.random() * 1000);
                    await sleep(backoff);
                }
                continue;
            }
            if (!res.ok) {
                // Other 4xx responses are treated as non-retryable.
                lastErr = `Gemini ${res.status}: ${(await res.text()).slice(0, 200)}`;
                break;
            }
            const data = await res.json();
            const text = (data?.candidates?.[0]?.content?.parts || [])
                .map(p => p.text || '').join('').trim();
            return { text, usage: data?.usageMetadata };
        }
        catch (e) {
            // Network/parse failure: retry with the same key after a shorter backoff.
            lastErr = e instanceof Error ? e.message : String(e);
            if (willRetry)
                await sleep(Math.min(30000, 1000 * 2 ** attempt));
        }
    }
    console.warn(`[analyze_video] geminiGenerate failed after retries: ${lastErr}`);
    return { text: '' };
}
153
215
  async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost) {
154
- if (!OPENAI_KEY)
216
+ if (!GEMINI_KEYS.length)
155
217
  return '';
156
218
  const frames = frameB64s.filter(Boolean);
157
219
  if (frames.length === 0)
@@ -162,20 +224,7 @@ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost)
162
224
  const seqNote = frames.length > 1
163
225
  ? ` The ${frames.length} images are CHRONOLOGICAL samples from the SAME scene (start → middle → end). Treat them as one continuous shot: describe the MOTION ARC across them (direction of movement, camera push/pan, what changes from first to last frame), not three separate moments.`
164
226
  : '';
165
- const imageParts = frames.map(b64 => ({
166
- type: 'image_url',
167
- image_url: { url: `data:image/jpeg;base64,${b64}` },
168
- }));
169
- const res = await fetch('https://api.openai.com/v1/chat/completions', {
170
- method: 'POST',
171
- headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
172
- body: JSON.stringify({
173
- model: 'gpt-4o-mini',
174
- max_tokens: 350,
175
- messages: [{
176
- role: 'user',
177
- content: [
178
- { type: 'text', text: `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
227
+ const promptText = `Describe this scene in detail (4-6 sentences, English).${seqNote}${castBlock} Include:
179
228
  - Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
180
229
  - Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
181
230
  - Spatial positions: exact position of each character (left/right/above/below/center), distance between them
@@ -183,17 +232,14 @@ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost)
183
232
  - Camera: angle, framing (close-up, wide, etc.), and any camera movement across the frames
184
233
  - Action: the movement/action arc from first to last frame (direction, what changes)
185
234
 
186
- Voiceover during this scene: "${voiceoverText || '(none)'}"` },
187
- ...imageParts,
188
- ],
189
- }],
190
- }),
191
- });
192
- if (!res.ok)
193
- return '';
194
- const data = await res.json();
195
- cost?.chat('vision', 'gpt-4o-mini', data.usage);
196
- return data.choices?.[0]?.message?.content?.trim() || '';
235
+ Voiceover during this scene: "${voiceoverText || '(none)'}"`;
236
+ const parts = [
237
+ { text: promptText },
238
+ ...frames.map(b64 => ({ inlineData: { mimeType: 'image/jpeg', data: b64 } })),
239
+ ];
240
+ const { text, usage } = await geminiGenerate(parts, 512);
241
+ cost?.geminiVision('vision', usage);
242
+ return text;
197
243
  }
198
244
  // Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
199
245
  // frames sampled across the whole video + transcript that returns, together:
@@ -65,6 +65,8 @@ const TOOLS = [
65
65
  properties: {
66
66
  name: { type: 'string', description: 'Name/title of the idea' },
67
67
  category: { type: 'string', description: 'Category of the idea (e.g. SaaS, Mobile App, AI Tool, etc.)' },
68
+ startup: { type: 'string', enum: ['tuctac', 'capple', 'thinkless'], description: 'Which startup this idea is scoped to (REQUIRED for scoped scans)' },
69
+ idea_kind: { type: 'string', enum: ['feature', 'improvement', 'expansion', 'counter-competitor'], description: 'Kind of idea relative to the startup (optional)' },
68
70
  source: { type: 'string', enum: ['trending', 'research', 'social', 'creative'], description: 'Where the idea came from' },
69
71
  strategy: { type: 'string', description: 'High-level strategy or approach for this idea (optional)' },
70
72
  url: { type: 'string', description: 'Reference URL related to the idea (optional)' },
@@ -87,6 +89,8 @@ const TOOLS = [
87
89
  idea_id: { type: 'string', description: 'The ID of the idea to update' },
88
90
  name: { type: 'string', description: 'New name (optional)' },
89
91
  category: { type: 'string', description: 'New category (optional)' },
92
+ startup: { type: 'string', enum: ['tuctac', 'capple', 'thinkless'], description: 'New startup scope (optional)' },
93
+ idea_kind: { type: 'string', enum: ['feature', 'improvement', 'expansion', 'counter-competitor'], description: 'New idea kind (optional)' },
90
94
  status: { type: 'string', enum: ['new', 'watching', 'validated', 'archived'], description: 'New status (optional)' },
91
95
  score: { type: 'number', description: 'New score (optional)' },
92
96
  notes: { type: 'string', description: 'New notes (optional)' },
@@ -109,6 +113,8 @@ const TOOLS = [
109
113
  properties: {
110
114
  status: { type: 'string', enum: ['new', 'watching', 'validated', 'archived'], description: 'Filter by status (optional)' },
111
115
  category: { type: 'string', description: 'Filter by category (optional)' },
116
+ startup: { type: 'string', enum: ['tuctac', 'capple', 'thinkless'], description: 'Filter by startup (optional)' },
117
+ idea_kind: { type: 'string', description: 'Filter by idea kind (optional)' },
112
118
  source: { type: 'string', enum: ['trending', 'research', 'social', 'creative'], description: 'Filter by source (optional)' },
113
119
  agent_id: { type: 'string', description: 'Filter by agent ID (optional, defaults to current agent)' },
114
120
  page: { type: 'number', description: 'Page number (optional, default 1)' },
@@ -250,6 +256,10 @@ async function handleToolCall(config, toolName, args) {
250
256
  body.competitors = args.competitors.split(',').map((c) => c.trim());
251
257
  if (args.notes)
252
258
  body.notes = args.notes;
259
+ if (args.startup)
260
+ body.startup = args.startup;
261
+ if (args.idea_kind)
262
+ body.idea_kind = args.idea_kind;
253
263
  const data = await apiCall(config, 'POST', '/agent-idea', body);
254
264
  return { content: [{ type: 'text', text: `Idea "${data.name}" created (ID: ${data._id})` }] };
255
265
  }
@@ -282,6 +292,10 @@ async function handleToolCall(config, toolName, args) {
282
292
  body.competitors = args.competitors.split(',').map((c) => c.trim());
283
293
  if (args.times_seen)
284
294
  body.times_seen = Number(args.times_seen);
295
+ if (args.startup)
296
+ body.startup = args.startup;
297
+ if (args.idea_kind)
298
+ body.idea_kind = args.idea_kind;
285
299
  const data = await apiCall(config, 'PUT', `/agent-idea/${args.idea_id}`, body);
286
300
  return { content: [{ type: 'text', text: `Idea "${data.name}" updated (ID: ${data._id})` }] };
287
301
  }
@@ -293,6 +307,10 @@ async function handleToolCall(config, toolName, args) {
293
307
  params.set('status', args.status);
294
308
  if (args.category)
295
309
  params.set('category', args.category);
310
+ if (args.startup)
311
+ params.set('startup', args.startup);
312
+ if (args.idea_kind)
313
+ params.set('idea_kind', args.idea_kind);
296
314
  if (args.source)
297
315
  params.set('source', args.source);
298
316
  if (args.page)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.143",
3
+ "version": "0.1.145",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"