npm - tuna-agent - Versions diffs - 0.1.142 → 0.1.143 - Mend

tuna-agent 0.1.142 → 0.1.143

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/daemon/analyze-video-handler.d.ts +8 -0
package/dist/daemon/analyze-video-handler.js +44 -6
package/dist/daemon/extension-handlers.js +11 -3
package/package.json +1 -1

package/dist/daemon/analyze-video-handler.d.ts CHANGED Viewed

@@ -31,6 +31,14 @@ export interface AnalyzeVideoResult {
         voiceover: string;
         visual_description: string;
     }>;
+    stats?: {
+        duration_ms: number;
+        cost_usd: number;
+        breakdown: Record<string, {
+            calls: number;
+            cost: number;
+        }>;
+    };
     isError?: boolean;
     error?: string;
 }

package/dist/daemon/analyze-video-handler.js CHANGED Viewed

@@ -14,6 +14,33 @@ const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
 const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
 const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
 const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
+// USD rates (update if OpenAI pricing changes). Chat = per 1M tokens. Looked up by CostTracker: chat models by name, Whisper via `whisperPerMin`.
+const RATES = {
+    whisperPerMin: 0.006, // USD per minute of audio; CostTracker.whisper() converts seconds to minutes
+    'gpt-4o-mini': { in: 0.15, out: 0.60 }, // `in` prices usage.prompt_tokens, `out` prices usage.completion_tokens
+    'gpt-4o': { in: 2.50, out: 10.0 },
+};
+// Per-run cost+token accumulator. Single-threaded JS → plain mutation is safe
+// even across the parallel visionDescribe calls.
+class CostTracker {
+    breakdown = {};
+    add(bucket, cost) {
+        const b = this.breakdown[bucket] || (this.breakdown[bucket] = { calls: 0, cost: 0 });
+        b.calls++;
+        b.cost += cost;
+    }
+    chat(bucket, model, usage) {
+        if (!usage) {
+            this.add(bucket, 0);
+            return;
+        }
+        const r = RATES[model];
+        const cost = ((usage.prompt_tokens || 0) / 1e6) * r.in + ((usage.completion_tokens || 0) / 1e6) * r.out;
+        this.add(bucket, cost);
+    }
+    whisper(audioSec) { this.add('whisper', (audioSec / 60) * RATES.whisperPerMin); }
+    total() { return Object.values(this.breakdown).reduce((s, b) => s + b.cost, 0); }
+}
 // Downloaded source videos are cached by URL hash so re-analyze doesn't
 // re-download (saves bandwidth + time on long clips). relabs01 shares disk
 // with Demucs + the local media server, so the cache is bounded: drop files
@@ -85,7 +112,7 @@ async function whisperTranscribe(audioPath) {
         throw new Error(`whisper ${res.status}: ${(await res.text()).slice(0, 300)}`);
     return res.json();
 }
-async function correctTranscript(rawText, language) {
+async function correctTranscript(rawText, language, cost) {
     if (!OPENAI_KEY || !rawText || rawText.length < 20)
         return rawText;
     try {
@@ -111,6 +138,7 @@ ${rawText}`,
         if (!res.ok)
             return rawText;
         const data = await res.json();
+        cost?.chat('correction', 'gpt-4o-mini', data.usage);
         return data.choices?.[0]?.message?.content?.trim() || rawText;
     }
     catch {
@@ -122,7 +150,7 @@ ${rawText}`,
 // turns") instead of guessing from a single frozen midpoint. The model is
 // told the frames are chronological so it describes the action arc, not 3
 // separate moments.
-async function visionDescribe(frameB64s, voiceoverText, castContext = '') {
+async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost) {
     if (!OPENAI_KEY)
         return '';
     const frames = frameB64s.filter(Boolean);
@@ -164,6 +192,7 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
     if (!res.ok)
         return '';
     const data = await res.json();
+    cost?.chat('vision', 'gpt-4o-mini', data.usage);
     return data.choices?.[0]?.message?.content?.trim() || '';
 }
 // Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
@@ -177,7 +206,7 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
 // Folding all three into one call is cheaper than the previous two calls
 // (style + cast) AND uses gpt-4o for style (was gpt-4o-mini). The master-cast
 // prompt is assembled here in the exact format ScriptImporter parses.
-async function visionExtractPhase1(frames, transcript) {
+async function visionExtractPhase1(frames, transcript, cost) {
     const empty = {
         video_summary: '',
         video_style: '',
@@ -220,6 +249,7 @@ Rules:
         if (!res.ok)
             return empty;
         const data = await res.json();
+        cost?.chat('phase1', 'gpt-4o', data.usage);
         const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
         let parsed = {};
         try {
@@ -253,6 +283,8 @@ Rules:
 }
 export async function analyzeVideo(url, onProgress) {
     const progress = onProgress || (() => { });
+    const cost = new CostTracker();
+    const t0 = Date.now();
     const tmpDir = path.join(os.tmpdir(), 'tuna-analyze-' + crypto.randomBytes(6).toString('hex'));
     await fs.mkdir(tmpDir, { recursive: true });
     // Video lives in the persistent URL-keyed cache (NOT tmpDir) so re-analyze
@@ -314,9 +346,10 @@ export async function analyzeVideo(url, onProgress) {
         progress('Đang transcribe bằng Whisper...');
         console.log('[analyze_video] Transcribing via Whisper');
         const rawTranscript = await whisperTranscribe(audioPath);
+        cost.whisper(durationSec);
         progress('Đang sửa lỗi transcript...');
         console.log('[analyze_video] AI correcting transcript');
-        const correctedText = await correctTranscript(rawTranscript.text, rawTranscript.language);
+        const correctedText = await correctTranscript(rawTranscript.text, rawTranscript.language, cost);
         const transcript = { ...rawTranscript, text: correctedText };
         const segments = transcript.segments || [];
         const sceneSlots = [];
@@ -445,7 +478,7 @@ export async function analyzeVideo(url, onProgress) {
             .filter((_, i) => i % p1Step === 0)
             .slice(0, p1SampleCount)
             .map(f => f.thumb.toString('base64'));
-        const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '');
+        const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '', cost);
         const castContext = characters.length
             ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
             : '';
@@ -460,7 +493,7 @@ export async function analyzeVideo(url, onProgress) {
             progress(`Đang phân tích scene ${b + 1}-${Math.min(b + BATCH_SIZE, frameBuffers.length)}/${frameBuffers.length}...`);
             const results = await Promise.all(batch.map(async ({ idx, frames, thumb, slot }) => {
                 try {
-                    const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext);
+                    const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext, cost);
                     return {
                         scene_number: idx + 1,
                         timestamp_start: Math.round(slot.start * 10) / 10,
@@ -490,6 +523,11 @@ export async function analyzeVideo(url, onProgress) {
             characters,
             segments: segments.map((s) => ({ start: s.start, end: s.end, text: s.text })),
             scenes,
+            stats: {
+                duration_ms: Date.now() - t0,
+                cost_usd: +cost.total().toFixed(4),
+                breakdown: cost.breakdown,
+            },
         };
     }
     finally {

package/dist/daemon/extension-handlers.js CHANGED Viewed

@@ -85,7 +85,15 @@ export async function handleClaudePrompt(ws, code, taskId, prompt, systemPrompt,
             timeoutMs: timeoutMs && timeoutMs > 0 ? timeoutMs : 60000,
         });
         const text = typeof result === 'string' ? result : result.result || JSON.stringify(result);
-        console.log(`[claude_prompt] Result: ${text.substring(0, 200)}`);
+        // claude-cli reports the run's real cost/duration in --output-format json.
+        const meta = (result && typeof result === 'object')
+            ? {
+                costUsd: result.costUsd,
+                durationMs: result.durationMs,
+                numTurns: result.numTurns,
+            }
+            : undefined;
+        console.log(`[claude_prompt] Result: ${text.substring(0, 200)} | cost=$${meta?.costUsd ?? '?'} dur=${meta?.durationMs ?? '?'}ms`);
         // Try to parse JSON from response
         const cleaned = text.replace(/```json?\s*/g, '').replace(/```/g, '').trim();
         let parsed = null;
@@ -100,8 +108,8 @@ export async function handleClaudePrompt(ws, code, taskId, prompt, systemPrompt,
                 }
                 catch { }
         }
-        ws.sendExtensionEvent(code, { type: 'prompt_result', taskId, result: parsed, raw: text });
-        ws.sendExtensionDone(code, taskId, { result: parsed, raw: text });
+        ws.sendExtensionEvent(code, { type: 'prompt_result', taskId, result: parsed, raw: text, meta });
+        ws.sendExtensionDone(code, taskId, { result: parsed, raw: text, meta });
     }
     catch (err) {
         console.error(`[claude_prompt] Error: ${err.message}`);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "tuna-agent",
-  "version": "0.1.142",
+  "version": "0.1.143",
   "description": "Tuna Agent - Run AI coding tasks on your machine",
   "bin": {
     "tuna-agent": "dist/cli/index.js"