tuna-agent 0.1.142 → 0.1.143
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -31,6 +31,14 @@ export interface AnalyzeVideoResult {
|
|
|
31
31
|
voiceover: string;
|
|
32
32
|
visual_description: string;
|
|
33
33
|
}>;
|
|
34
|
+
stats?: {
|
|
35
|
+
duration_ms: number;
|
|
36
|
+
cost_usd: number;
|
|
37
|
+
breakdown: Record<string, {
|
|
38
|
+
calls: number;
|
|
39
|
+
cost: number;
|
|
40
|
+
}>;
|
|
41
|
+
};
|
|
34
42
|
isError?: boolean;
|
|
35
43
|
error?: string;
|
|
36
44
|
}
|
|
@@ -14,6 +14,33 @@ const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
|
|
|
14
14
|
const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
|
|
15
15
|
const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
|
|
16
16
|
const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
|
|
17
|
+
// USD rates (update if OpenAI pricing changes). Chat = per 1M tokens.
// Whisper is billed per minute of audio.
const RATES = {
    whisperPerMin: 0.006,
    'gpt-4o-mini': { in: 0.15, out: 0.60 },
    'gpt-4o': { in: 2.50, out: 10.0 },
};
// Per-run cost accumulator (call count + USD per bucket). Single-threaded JS →
// plain mutation is safe even across the parallel visionDescribe calls.
//
// Cost tracking is best-effort telemetry: it must never throw into, or poison
// the result of, the pipeline step that reports to it.
class CostTracker {
    // bucket name -> { calls, cost }.
    breakdown = {};
    // Models already warned about, so parallel calls don't spam the log.
    #unpricedModels = new Set();
    /**
     * Record one call in `bucket` and add `cost` (USD) to its running total.
     * A non-finite cost (NaN, undefined, Infinity) still counts as a call but
     * contributes 0, so a single bad input cannot turn total() into NaN.
     * @param {string} bucket - breakdown key, e.g. 'vision' or 'whisper'.
     * @param {number} cost - cost of this call in USD.
     */
    add(bucket, cost) {
        const b = this.breakdown[bucket] || (this.breakdown[bucket] = { calls: 0, cost: 0 });
        b.calls++;
        b.cost += Number.isFinite(cost) ? cost : 0;
    }
    /**
     * Record one chat-completion call priced from its token usage.
     * The call is counted at zero cost when `usage` is missing or when
     * `model` has no { in, out } entry in RATES (instead of throwing).
     * @param {string} bucket - breakdown key for this call.
     * @param {string} model - key into RATES, e.g. 'gpt-4o-mini'.
     * @param {{prompt_tokens?: number, completion_tokens?: number}} [usage]
     *   token counts from the API response.
     */
    chat(bucket, model, usage) {
        const rate = RATES[model];
        // Also rejects non-chat entries such as the numeric whisperPerMin.
        const priced = typeof rate?.in === 'number' && typeof rate?.out === 'number';
        if (usage && !priced && !this.#unpricedModels.has(model)) {
            this.#unpricedModels.add(model);
            console.warn(`[cost] no chat rate for model "${model}"; counting its calls at $0`);
        }
        if (!usage || !priced) {
            this.add(bucket, 0);
            return;
        }
        const cost = ((usage.prompt_tokens || 0) / 1e6) * rate.in + ((usage.completion_tokens || 0) / 1e6) * rate.out;
        this.add(bucket, cost);
    }
    /**
     * Record one Whisper call for `audioSec` seconds of audio.
     * @param {number} audioSec - audio length in seconds.
     */
    whisper(audioSec) { this.add('whisper', (audioSec / 60) * RATES.whisperPerMin); }
    /**
     * @returns {number} sum of the cost of every bucket, in USD.
     */
    total() { return Object.values(this.breakdown).reduce((s, b) => s + b.cost, 0); }
}
|
|
17
44
|
// Downloaded source videos are cached by URL hash so re-analyze doesn't
|
|
18
45
|
// re-download (saves bandwidth + time on long clips). relabs01 shares disk
|
|
19
46
|
// with Demucs + the local media server, so the cache is bounded: drop files
|
|
@@ -85,7 +112,7 @@ async function whisperTranscribe(audioPath) {
|
|
|
85
112
|
throw new Error(`whisper ${res.status}: ${(await res.text()).slice(0, 300)}`);
|
|
86
113
|
return res.json();
|
|
87
114
|
}
|
|
88
|
-
async function correctTranscript(rawText, language) {
|
|
115
|
+
async function correctTranscript(rawText, language, cost) {
|
|
89
116
|
if (!OPENAI_KEY || !rawText || rawText.length < 20)
|
|
90
117
|
return rawText;
|
|
91
118
|
try {
|
|
@@ -111,6 +138,7 @@ ${rawText}`,
|
|
|
111
138
|
if (!res.ok)
|
|
112
139
|
return rawText;
|
|
113
140
|
const data = await res.json();
|
|
141
|
+
cost?.chat('correction', 'gpt-4o-mini', data.usage);
|
|
114
142
|
return data.choices?.[0]?.message?.content?.trim() || rawText;
|
|
115
143
|
}
|
|
116
144
|
catch {
|
|
@@ -122,7 +150,7 @@ ${rawText}`,
|
|
|
122
150
|
// turns") instead of guessing from a single frozen midpoint. The model is
|
|
123
151
|
// told the frames are chronological so it describes the action arc, not 3
|
|
124
152
|
// separate moments.
|
|
125
|
-
async function visionDescribe(frameB64s, voiceoverText, castContext = '') {
|
|
153
|
+
async function visionDescribe(frameB64s, voiceoverText, castContext = '', cost) {
|
|
126
154
|
if (!OPENAI_KEY)
|
|
127
155
|
return '';
|
|
128
156
|
const frames = frameB64s.filter(Boolean);
|
|
@@ -164,6 +192,7 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
|
|
|
164
192
|
if (!res.ok)
|
|
165
193
|
return '';
|
|
166
194
|
const data = await res.json();
|
|
195
|
+
cost?.chat('vision', 'gpt-4o-mini', data.usage);
|
|
167
196
|
return data.choices?.[0]?.message?.content?.trim() || '';
|
|
168
197
|
}
|
|
169
198
|
// Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
|
|
@@ -177,7 +206,7 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
|
|
|
177
206
|
// Folding all three into one call is cheaper than the previous two calls
|
|
178
207
|
// (style + cast) AND uses gpt-4o for style (was gpt-4o-mini). The master-cast
|
|
179
208
|
// prompt is assembled here in the exact format ScriptImporter parses.
|
|
180
|
-
async function visionExtractPhase1(frames, transcript) {
|
|
209
|
+
async function visionExtractPhase1(frames, transcript, cost) {
|
|
181
210
|
const empty = {
|
|
182
211
|
video_summary: '',
|
|
183
212
|
video_style: '',
|
|
@@ -220,6 +249,7 @@ Rules:
|
|
|
220
249
|
if (!res.ok)
|
|
221
250
|
return empty;
|
|
222
251
|
const data = await res.json();
|
|
252
|
+
cost?.chat('phase1', 'gpt-4o', data.usage);
|
|
223
253
|
const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
|
|
224
254
|
let parsed = {};
|
|
225
255
|
try {
|
|
@@ -253,6 +283,8 @@ Rules:
|
|
|
253
283
|
}
|
|
254
284
|
export async function analyzeVideo(url, onProgress) {
|
|
255
285
|
const progress = onProgress || (() => { });
|
|
286
|
+
const cost = new CostTracker();
|
|
287
|
+
const t0 = Date.now();
|
|
256
288
|
const tmpDir = path.join(os.tmpdir(), 'tuna-analyze-' + crypto.randomBytes(6).toString('hex'));
|
|
257
289
|
await fs.mkdir(tmpDir, { recursive: true });
|
|
258
290
|
// Video lives in the persistent URL-keyed cache (NOT tmpDir) so re-analyze
|
|
@@ -314,9 +346,10 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
314
346
|
progress('Đang transcribe bằng Whisper...');
|
|
315
347
|
console.log('[analyze_video] Transcribing via Whisper');
|
|
316
348
|
const rawTranscript = await whisperTranscribe(audioPath);
|
|
349
|
+
cost.whisper(durationSec);
|
|
317
350
|
progress('Đang sửa lỗi transcript...');
|
|
318
351
|
console.log('[analyze_video] AI correcting transcript');
|
|
319
|
-
const correctedText = await correctTranscript(rawTranscript.text, rawTranscript.language);
|
|
352
|
+
const correctedText = await correctTranscript(rawTranscript.text, rawTranscript.language, cost);
|
|
320
353
|
const transcript = { ...rawTranscript, text: correctedText };
|
|
321
354
|
const segments = transcript.segments || [];
|
|
322
355
|
const sceneSlots = [];
|
|
@@ -445,7 +478,7 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
445
478
|
.filter((_, i) => i % p1Step === 0)
|
|
446
479
|
.slice(0, p1SampleCount)
|
|
447
480
|
.map(f => f.thumb.toString('base64'));
|
|
448
|
-
const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '');
|
|
481
|
+
const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '', cost);
|
|
449
482
|
const castContext = characters.length
|
|
450
483
|
? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
|
|
451
484
|
: '';
|
|
@@ -460,7 +493,7 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
460
493
|
progress(`Đang phân tích scene ${b + 1}-${Math.min(b + BATCH_SIZE, frameBuffers.length)}/${frameBuffers.length}...`);
|
|
461
494
|
const results = await Promise.all(batch.map(async ({ idx, frames, thumb, slot }) => {
|
|
462
495
|
try {
|
|
463
|
-
const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext);
|
|
496
|
+
const visual_description = await visionDescribe(frames.map(f => f.toString('base64')), slot.voiceover, castContext, cost);
|
|
464
497
|
return {
|
|
465
498
|
scene_number: idx + 1,
|
|
466
499
|
timestamp_start: Math.round(slot.start * 10) / 10,
|
|
@@ -490,6 +523,11 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
490
523
|
characters,
|
|
491
524
|
segments: segments.map((s) => ({ start: s.start, end: s.end, text: s.text })),
|
|
492
525
|
scenes,
|
|
526
|
+
stats: {
|
|
527
|
+
duration_ms: Date.now() - t0,
|
|
528
|
+
cost_usd: +cost.total().toFixed(4),
|
|
529
|
+
breakdown: cost.breakdown,
|
|
530
|
+
},
|
|
493
531
|
};
|
|
494
532
|
}
|
|
495
533
|
finally {
|
|
@@ -85,7 +85,15 @@ export async function handleClaudePrompt(ws, code, taskId, prompt, systemPrompt,
|
|
|
85
85
|
timeoutMs: timeoutMs && timeoutMs > 0 ? timeoutMs : 60000,
|
|
86
86
|
});
|
|
87
87
|
const text = typeof result === 'string' ? result : result.result || JSON.stringify(result);
|
|
88
|
-
|
|
88
|
+
// claude-cli reports the run's real cost/duration in --output-format json.
|
|
89
|
+
const meta = (result && typeof result === 'object')
|
|
90
|
+
? {
|
|
91
|
+
costUsd: result.costUsd,
|
|
92
|
+
durationMs: result.durationMs,
|
|
93
|
+
numTurns: result.numTurns,
|
|
94
|
+
}
|
|
95
|
+
: undefined;
|
|
96
|
+
console.log(`[claude_prompt] Result: ${text.substring(0, 200)} | cost=$${meta?.costUsd ?? '?'} dur=${meta?.durationMs ?? '?'}ms`);
|
|
89
97
|
// Try to parse JSON from response
|
|
90
98
|
const cleaned = text.replace(/```json?\s*/g, '').replace(/```/g, '').trim();
|
|
91
99
|
let parsed = null;
|
|
@@ -100,8 +108,8 @@ export async function handleClaudePrompt(ws, code, taskId, prompt, systemPrompt,
|
|
|
100
108
|
}
|
|
101
109
|
catch { }
|
|
102
110
|
}
|
|
103
|
-
ws.sendExtensionEvent(code, { type: 'prompt_result', taskId, result: parsed, raw: text });
|
|
104
|
-
ws.sendExtensionDone(code, taskId, { result: parsed, raw: text });
|
|
111
|
+
ws.sendExtensionEvent(code, { type: 'prompt_result', taskId, result: parsed, raw: text, meta });
|
|
112
|
+
ws.sendExtensionDone(code, taskId, { result: parsed, raw: text, meta });
|
|
105
113
|
}
|
|
106
114
|
catch (err) {
|
|
107
115
|
console.error(`[claude_prompt] Error: ${err.message}`);
|