tuna-agent 0.1.142 → 0.1.143

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,14 @@ export interface AnalyzeVideoResult {
31
31
  voiceover: string;
32
32
  visual_description: string;
33
33
  }>;
34
+ stats?: {
35
+ duration_ms: number;
36
+ cost_usd: number;
37
+ breakdown: Record<string, {
38
+ calls: number;
39
+ cost: number;
40
+ }>;
41
+ };
34
42
  isError?: boolean;
35
43
  error?: string;
36
44
  }
@@ -14,6 +14,33 @@ const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
14
14
  const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
15
15
  const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
16
16
  const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
17
// USD rates (update if OpenAI pricing changes). Chat models are priced per
// 1M tokens (`in` = prompt tokens, `out` = completion tokens); Whisper is
// priced per minute of audio.
const RATES = {
    whisperPerMin: 0.006,
    'gpt-4o-mini': { in: 0.15, out: 0.60 },
    'gpt-4o': { in: 2.50, out: 10.0 },
};
/**
 * Per-run cost accumulator. Each bucket records how many calls were made and
 * their summed USD cost (token counts are priced, not stored).
 *
 * Updates are synchronous, so calls issued from concurrently awaited requests
 * cannot interleave inside a single update.
 *
 * Cost tracking is best-effort bookkeeping: bad input (unknown model, missing
 * or non-numeric amounts) is counted as a $0 call instead of throwing or
 * turning the total into NaN.
 */
class CostTracker {
    /** Map of bucket name -> { calls, cost }. Exposed as-is in the result stats. */
    breakdown = {};
    /**
     * Record one call in `bucket`.
     * @param {string} bucket - Bucket name (e.g. 'vision', 'whisper').
     * @param {number} cost - USD cost of the call; non-finite values count as 0.
     */
    add(bucket, cost) {
        // Own-property check: a bucket named like an inherited member
        // ('constructor', 'toString', ...) must not resolve to the builtin.
        const known = Object.prototype.hasOwnProperty.call(this.breakdown, bucket);
        const entry = known
            ? this.breakdown[bucket]
            : (this.breakdown[bucket] = { calls: 0, cost: 0 });
        entry.calls++;
        // A single NaN/Infinity would otherwise poison total() for the whole run.
        entry.cost += Number.isFinite(cost) ? cost : 0;
    }
    /**
     * Record one chat-completion call, priced from its token usage.
     * @param {string} bucket - Bucket name.
     * @param {string} model - Key into RATES (e.g. 'gpt-4o-mini').
     * @param {{prompt_tokens?: number, completion_tokens?: number}} [usage] -
     *   Usage object from the API response; when absent the call counts at $0.
     */
    chat(bucket, model, usage) {
        if (!usage) {
            this.add(bucket, 0);
            return;
        }
        const rate = Object.prototype.hasOwnProperty.call(RATES, model) ? RATES[model] : undefined;
        if (!rate || typeof rate !== 'object') {
            // Unpriced model: keep the call count accurate and make the gap
            // visible, but never let bookkeeping break the caller.
            console.warn(`[cost] no rate configured for model "${model}"; counting "${bucket}" call at $0`);
            this.add(bucket, 0);
            return;
        }
        const promptTokens = Number(usage.prompt_tokens) || 0;
        const completionTokens = Number(usage.completion_tokens) || 0;
        this.add(bucket, (promptTokens / 1e6) * rate.in + (completionTokens / 1e6) * rate.out);
    }
    /**
     * Record one Whisper transcription call.
     * @param {number} audioSec - Audio length in seconds; missing, non-finite
     *   or negative values are treated as 0.
     */
    whisper(audioSec) {
        const seconds = Number(audioSec);
        const billable = Number.isFinite(seconds) && seconds > 0 ? seconds : 0;
        this.add('whisper', (billable / 60) * RATES.whisperPerMin);
    }
    /**
     * @returns {number} Sum of the USD cost across all buckets.
     */
    total() {
        return Object.values(this.breakdown).reduce((sum, entry) => sum + entry.cost, 0);
    }
}
17
44
  // Downloaded source videos are cached by URL hash so re-analyze doesn't
18
45
  // re-download (saves bandwidth + time on long clips). relabs01 shares disk
19
46
  // with Demucs + the local media server, so the cache is bounded: drop files
@@ -85,7 +112,7 @@ async function whisperTranscribe(audioPath) {
85
112
  throw new Error(`whisper ${res.status}: ${(await res.text()).slice(0, 300)}`);
86
113
  return res.json();
87
114
  }
88
- async function correctTranscript(rawText, language) {
115
+ async function correctTranscript(rawText, language, cost) {
89
116
  if (!OPENAI_KEY || !rawText || rawText.length < 20)
90
117
  return rawText;
91
118
  try {
@@ -111,6 +138,7 @@ ${rawText}`,
111
138
  if (!res.ok)
112
139
  return rawText;
113
140
  const data = await res.json();
141
+ cost?.chat('correction', 'gpt-4o-mini', data.usage);
114
142
  return data.choices?.[0]?.message?.content?.trim() || rawText;
115
143
  }
116
144
  catch {
@@ -122,7 +150,7 @@ ${rawText}`,
122
150
  // turns") instead of guessing from a single frozen midpoint. The model is
123
151
  // told the frames are chronological so it describes the action arc, not 3
124
152
  // separate moments.
125
- async function visionDescribe(frameB64s, voiceoverText, castContext = '') {
153
+ async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost) {
126
154
  if (!OPENAI_KEY)
127
155
  return '';
128
156
  const frames = frameB64s.filter(Boolean);
@@ -164,6 +192,7 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
164
192
  if (!res.ok)
165
193
  return '';
166
194
  const data = await res.json();
195
+ cost?.chat('vision', 'gpt-4o-mini', data.usage);
167
196
  return data.choices?.[0]?.message?.content?.trim() || '';
168
197
  }
169
198
  // Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
@@ -177,7 +206,7 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
177
206
  // Folding all three into one call is cheaper than the previous two calls
178
207
  // (style + cast) AND uses gpt-4o for style (was gpt-4o-mini). The master-cast
179
208
  // prompt is assembled here in the exact format ScriptImporter parses.
180
- async function visionExtractPhase1(frames, transcript) {
209
+ async function visionExtractPhase1(frames, transcript, cost) {
181
210
  const empty = {
182
211
  video_summary: '',
183
212
  video_style: '',
@@ -220,6 +249,7 @@ Rules:
220
249
  if (!res.ok)
221
250
  return empty;
222
251
  const data = await res.json();
252
+ cost?.chat('phase1', 'gpt-4o', data.usage);
223
253
  const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
224
254
  let parsed = {};
225
255
  try {
@@ -253,6 +283,8 @@ Rules:
253
283
  }
254
284
  export async function analyzeVideo(url, onProgress) {
255
285
  const progress = onProgress || (() => { });
286
+ const cost = new CostTracker();
287
+ const t0 = Date.now();
256
288
  const tmpDir = path.join(os.tmpdir(), 'tuna-analyze-' + crypto.randomBytes(6).toString('hex'));
257
289
  await fs.mkdir(tmpDir, { recursive: true });
258
290
  // Video lives in the persistent URL-keyed cache (NOT tmpDir) so re-analyze
@@ -314,9 +346,10 @@ export async function analyzeVideo(url, onProgress) {
314
346
  progress('Đang transcribe bằng Whisper...');
315
347
  console.log('[analyze_video] Transcribing via Whisper');
316
348
  const rawTranscript = await whisperTranscribe(audioPath);
349
+ cost.whisper(durationSec);
317
350
  progress('Đang sửa lỗi transcript...');
318
351
  console.log('[analyze_video] AI correcting transcript');
319
- const correctedText = await correctTranscript(rawTranscript.text, rawTranscript.language);
352
+ const correctedText = await correctTranscript(rawTranscript.text, rawTranscript.language, cost);
320
353
  const transcript = { ...rawTranscript, text: correctedText };
321
354
  const segments = transcript.segments || [];
322
355
  const sceneSlots = [];
@@ -445,7 +478,7 @@ export async function analyzeVideo(url, onProgress) {
445
478
  .filter((_, i) => i % p1Step === 0)
446
479
  .slice(0, p1SampleCount)
447
480
  .map(f => f.thumb.toString('base64'));
448
- const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '');
481
+ const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '', cost);
449
482
  const castContext = characters.length
450
483
  ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
451
484
  : '';
@@ -460,7 +493,7 @@ export async function analyzeVideo(url, onProgress) {
460
493
  progress(`Đang phân tích scene ${b + 1}-${Math.min(b + BATCH_SIZE, frameBuffers.length)}/${frameBuffers.length}...`);
461
494
  const results = await Promise.all(batch.map(async ({ idx, frames, thumb, slot }) => {
462
495
  try {
463
- const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext);
496
+ const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext, cost);
464
497
  return {
465
498
  scene_number: idx + 1,
466
499
  timestamp_start: Math.round(slot.start * 10) / 10,
@@ -490,6 +523,11 @@ export async function analyzeVideo(url, onProgress) {
490
523
  characters,
491
524
  segments: segments.map((s) => ({ start: s.start, end: s.end, text: s.text })),
492
525
  scenes,
526
+ stats: {
527
+ duration_ms: Date.now() - t0,
528
+ cost_usd: +cost.total().toFixed(4),
529
+ breakdown: cost.breakdown,
530
+ },
493
531
  };
494
532
  }
495
533
  finally {
@@ -85,7 +85,15 @@ export async function handleClaudePrompt(ws, code, taskId, prompt, systemPrompt,
85
85
  timeoutMs: timeoutMs && timeoutMs > 0 ? timeoutMs : 60000,
86
86
  });
87
87
  const text = typeof result === 'string' ? result : result.result || JSON.stringify(result);
88
- console.log(`[claude_prompt] Result: ${text.substring(0, 200)}`);
88
+ // claude-cli reports the run's real cost/duration in --output-format json.
89
+ const meta = (result && typeof result === 'object')
90
+ ? {
91
+ costUsd: result.costUsd,
92
+ durationMs: result.durationMs,
93
+ numTurns: result.numTurns,
94
+ }
95
+ : undefined;
96
+ console.log(`[claude_prompt] Result: ${text.substring(0, 200)} | cost=$${meta?.costUsd ?? '?'} dur=${meta?.durationMs ?? '?'}ms`);
89
97
  // Try to parse JSON from response
90
98
  const cleaned = text.replace(/```json?\s*/g, '').replace(/```/g, '').trim();
91
99
  let parsed = null;
@@ -100,8 +108,8 @@ export async function handleClaudePrompt(ws, code, taskId, prompt, systemPrompt,
100
108
  }
101
109
  catch { }
102
110
  }
103
- ws.sendExtensionEvent(code, { type: 'prompt_result', taskId, result: parsed, raw: text });
104
- ws.sendExtensionDone(code, taskId, { result: parsed, raw: text });
111
+ ws.sendExtensionEvent(code, { type: 'prompt_result', taskId, result: parsed, raw: text, meta });
112
+ ws.sendExtensionDone(code, taskId, { result: parsed, raw: text, meta });
105
113
  }
106
114
  catch (err) {
107
115
  console.error(`[claude_prompt] Error: ${err.message}`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.142",
3
+ "version": "0.1.143",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"