tuna-agent 0.1.137 → 0.1.139
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -14,6 +14,49 @@ const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
|
|
|
14
14
|
const YT_DLP = process.env.YT_DLP_BIN || '/home/gatoasang94/.local/bin/yt-dlp';
|
|
15
15
|
const FFMPEG = process.env.FFMPEG_BIN || '/usr/bin/ffmpeg';
|
|
16
16
|
const FFPROBE = process.env.FFPROBE_BIN || '/usr/bin/ffprobe';
|
|
17
|
+
// Downloaded source videos are cached by URL hash so re-analyze doesn't
|
|
18
|
+
// re-download (saves bandwidth + time on long clips). relabs01 shares disk
|
|
19
|
+
// with Demucs + the local media server, so the cache is bounded: drop files
|
|
20
|
+
// older than 7 days, then if the total still exceeds 15 GB evict oldest-first.
|
|
21
|
+
const CACHE_DIR = path.join(os.homedir(), '.tuna-analyze-cache');
|
|
22
|
+
const CACHE_MAX_AGE_MS = 7 * 24 * 3600 * 1000;
|
|
23
|
+
const CACHE_MAX_BYTES = 15 * 1024 * 1024 * 1024;
|
|
24
|
+
/**
 * Best-effort pruning of the downloaded-video cache directory.
 *
 * Two passes:
 *   1. Every regular file whose mtime is older than `maxAgeMs` is deleted.
 *   2. If the surviving files still total more than `maxBytes`, files are
 *      deleted oldest-mtime-first until the total is at or under the limit.
 *
 * Sub-directories and other non-regular entries are never touched. The
 * directory is created if it does not exist yet.
 *
 * This function never throws: cache pruning must not block analysis. Errors
 * other than the expected "file vanished" race are logged instead of being
 * swallowed silently, so a permissions or I/O problem on the shared disk is
 * visible in the logs.
 *
 * @param {object} [options]
 * @param {string} [options.dir=CACHE_DIR] Directory to prune.
 * @param {number} [options.maxAgeMs=CACHE_MAX_AGE_MS] Files with an mtime older than this many milliseconds are removed.
 * @param {number} [options.maxBytes=CACHE_MAX_BYTES] Upper bound on the combined size of the files that remain.
 * @param {number} [options.now=Date.now()] Reference "current time" in epoch milliseconds.
 * @returns {Promise<void>}
 */
async function pruneVideoCache({
    dir = CACHE_DIR,
    maxAgeMs = CACHE_MAX_AGE_MS,
    maxBytes = CACHE_MAX_BYTES,
    now = Date.now(),
} = {}) {
    try {
        await fs.mkdir(dir, { recursive: true });
        const names = await fs.readdir(dir);
        const live = [];
        for (const name of names) {
            const p = path.join(dir, name);
            try {
                const st = await fs.stat(p);
                if (!st.isFile()) {
                    continue;
                }
                if (now - st.mtimeMs > maxAgeMs) {
                    await fs.rm(p, { force: true });
                    continue;
                }
                live.push({ p, size: st.size, mtime: st.mtimeMs });
            }
            catch (err) {
                // ENOENT is the expected race with another run deleting the
                // same entry; anything else is worth a log line.
                if (err?.code !== 'ENOENT') {
                    console.warn('[analyze_video] Cache prune: skipping', p, '-', err?.message ?? err);
                }
            }
        }
        let total = live.reduce((sum, f) => sum + f.size, 0);
        if (total > maxBytes) {
            live.sort((a, b) => a.mtime - b.mtime); // oldest first
            for (const f of live) {
                if (total <= maxBytes) {
                    break;
                }
                try {
                    // `force: true` already tolerates a missing file, so a
                    // throw here means the file is still occupying space:
                    // do not subtract its size.
                    await fs.rm(f.p, { force: true });
                    total -= f.size;
                }
                catch (err) {
                    console.warn('[analyze_video] Cache prune: could not evict', f.p, '-', err?.message ?? err);
                }
            }
        }
    }
    catch (err) {
        // Cache pruning is best-effort; never block analysis.
        console.warn('[analyze_video] Cache prune failed:', err?.message ?? err);
    }
}
|
|
17
60
|
function run(cmd, args, opts = {}) {
|
|
18
61
|
return new Promise((resolve, reject) => {
|
|
19
62
|
const p = spawn(cmd, args, { stdio: ['ignore', 'pipe', 'pipe'], ...opts });
|
|
@@ -123,38 +166,47 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
|
|
|
123
166
|
const data = await res.json();
|
|
124
167
|
return data.choices?.[0]?.message?.content?.trim() || '';
|
|
125
168
|
}
|
|
126
|
-
// Phase 1 (
|
|
127
|
-
//
|
|
128
|
-
//
|
|
129
|
-
//
|
|
130
|
-
//
|
|
131
|
-
//
|
|
132
|
-
//
|
|
133
|
-
//
|
|
134
|
-
|
|
135
|
-
|
|
169
|
+
// Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
|
|
170
|
+
// frames sampled across the whole video + transcript that returns, together:
|
|
171
|
+
// - video_summary: a cinematic paragraph of the whole story (drives the
|
|
172
|
+
// downstream script-generation prompt — the tool's biggest edge)
|
|
173
|
+
// - video_style: a rich 3-4 sentence aesthetic analysis (medium, palette,
|
|
174
|
+
// lighting, camera language) — replaces the old terse 1-2 sentence
|
|
175
|
+
// visionExtractStyle gpt-4o-mini call entirely
|
|
176
|
+
// - characters[]: the recurring cast for the [CHARACTER CAST LIST] block
|
|
177
|
+
// Folding all three into one call is cheaper than the previous two calls
|
|
178
|
+
// (style + cast) AND uses gpt-4o for style (was gpt-4o-mini). The master-cast
|
|
179
|
+
// prompt is assembled here in the exact format ScriptImporter parses.
|
|
180
|
+
async function visionExtractPhase1(frames, transcript) {
|
|
181
|
+
const empty = {
|
|
182
|
+
video_summary: '',
|
|
183
|
+
video_style: '',
|
|
184
|
+
master_cast_prompt: '',
|
|
185
|
+
characters: [],
|
|
186
|
+
};
|
|
136
187
|
if (!OPENAI_KEY || frames.length === 0)
|
|
137
188
|
return empty;
|
|
138
189
|
try {
|
|
139
190
|
const content = [
|
|
140
191
|
{
|
|
141
192
|
type: 'text',
|
|
142
|
-
text: `Act as a Master Film Director. These frames are sampled across an
|
|
193
|
+
text: `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
|
|
143
194
|
|
|
144
|
-
Transcript context
|
|
195
|
+
Transcript context: "${(transcript || '').slice(0, 4000)}"
|
|
145
196
|
|
|
146
197
|
Return ONLY a JSON object, no markdown fences:
|
|
147
198
|
{
|
|
199
|
+
"video_summary": "One detailed cinematic paragraph (5-8 sentences, English) telling the WHOLE story start to finish: setup, key beats, climax, resolution. This is the narrative spine — be specific about what happens.",
|
|
200
|
+
"video_style": "3-4 sentences (English): artistic medium (2D/3D/live-action/CGI), color palette, lighting, camera language, overall aesthetic vibe. Cinematic, specific.",
|
|
148
201
|
"characters": [
|
|
149
202
|
{ "name": "SHORT_UPPERCASE_LABEL", "description": "one-line English visual description: age/build, face, hair, outfit, colors, distinguishing features" }
|
|
150
203
|
]
|
|
151
204
|
}
|
|
152
205
|
|
|
153
206
|
Rules:
|
|
154
|
-
- name: a stable short uppercase label
|
|
155
|
-
- Only RECURRING subjects worth a reference sheet. Skip one-off
|
|
156
|
-
- description: ENGLISH only, factual, no camera/action words
|
|
157
|
-
- Max 6 characters.`,
|
|
207
|
+
- characters.name: a stable short uppercase label reused for this subject (e.g. "THE BISHOP", "U-94 SUBMARINE"). Max 4 words.
|
|
208
|
+
- Only RECURRING subjects worth a reference sheet. Skip one-off extras. Max 6.
|
|
209
|
+
- characters.description: ENGLISH only, factual, no camera/action words.`,
|
|
158
210
|
},
|
|
159
211
|
];
|
|
160
212
|
for (const b64 of frames) {
|
|
@@ -163,16 +215,16 @@ Rules:
|
|
|
163
215
|
const res = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
164
216
|
method: 'POST',
|
|
165
217
|
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
166
|
-
body: JSON.stringify({ model: 'gpt-4o', max_tokens:
|
|
218
|
+
body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
|
|
167
219
|
});
|
|
168
220
|
if (!res.ok)
|
|
169
221
|
return empty;
|
|
170
222
|
const data = await res.json();
|
|
171
|
-
const
|
|
223
|
+
const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
|
|
172
224
|
let parsed = {};
|
|
173
225
|
try {
|
|
174
|
-
const m =
|
|
175
|
-
parsed = JSON.parse(m ? m[0] :
|
|
226
|
+
const m = rawTxt.match(/\{[\s\S]*\}/);
|
|
227
|
+
parsed = JSON.parse(m ? m[0] : rawTxt);
|
|
176
228
|
}
|
|
177
229
|
catch {
|
|
178
230
|
return empty;
|
|
@@ -181,57 +233,63 @@ Rules:
|
|
|
181
233
|
.filter(c => c && c.name && c.description)
|
|
182
234
|
.slice(0, 6)
|
|
183
235
|
.map(c => ({ name: String(c.name).trim(), description: String(c.description).trim() }));
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
catch {
|
|
196
|
-
return empty;
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
async function visionExtractStyle(frames) {
|
|
200
|
-
if (!OPENAI_KEY || frames.length === 0)
|
|
201
|
-
return '';
|
|
202
|
-
try {
|
|
203
|
-
const content = [
|
|
204
|
-
{ type: 'text', text: 'Analyze these frames from a video and extract a concise visual style description (1-2 sentences). Focus on: animation style (cartoon, realistic, anime, etc.), color palette, lighting, character design approach (anthropomorphized objects, real people, etc.), and overall aesthetic.\n\nReturn ONLY the style description, nothing else.' },
|
|
205
|
-
];
|
|
206
|
-
for (const b64 of frames) {
|
|
207
|
-
content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
|
|
236
|
+
const video_summary = (parsed.video_summary || '').trim();
|
|
237
|
+
const video_style = (parsed.video_style || '').trim();
|
|
238
|
+
let master_cast_prompt = '';
|
|
239
|
+
if (characters.length > 0) {
|
|
240
|
+
const styleLine = video_style || 'Keep the original video’s visual style, color grading, and lighting.';
|
|
241
|
+
const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
|
|
242
|
+
master_cast_prompt =
|
|
243
|
+
`[AESTHETIC & STYLE]\n${styleLine}\n` +
|
|
244
|
+
`[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
|
|
245
|
+
`[CHARACTER CAST LIST]\n${castList}\n` +
|
|
246
|
+
`[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
|
|
208
247
|
}
|
|
209
|
-
|
|
210
|
-
method: 'POST',
|
|
211
|
-
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
212
|
-
body: JSON.stringify({ model: 'gpt-4o-mini', max_tokens: 200, messages: [{ role: 'user', content }] }),
|
|
213
|
-
});
|
|
214
|
-
if (!res.ok)
|
|
215
|
-
return '';
|
|
216
|
-
const data = await res.json();
|
|
217
|
-
return (data.choices?.[0]?.message?.content || '').trim().replace(/\*\*/g, '');
|
|
248
|
+
return { video_summary, video_style, master_cast_prompt, characters };
|
|
218
249
|
}
|
|
219
250
|
catch {
|
|
220
|
-
return
|
|
251
|
+
return empty;
|
|
221
252
|
}
|
|
222
253
|
}
|
|
223
254
|
export async function analyzeVideo(url, onProgress) {
|
|
224
255
|
const progress = onProgress || (() => { });
|
|
225
256
|
const tmpDir = path.join(os.tmpdir(), 'tuna-analyze-' + crypto.randomBytes(6).toString('hex'));
|
|
226
257
|
await fs.mkdir(tmpDir, { recursive: true });
|
|
227
|
-
|
|
258
|
+
// Video lives in the persistent URL-keyed cache (NOT tmpDir) so re-analyze
|
|
259
|
+
// reuses it. Only audio/frames are per-run + cleaned up in `finally`.
|
|
260
|
+
const urlHash = crypto.createHash('sha1').update(url).digest('hex');
|
|
261
|
+
const videoPath = path.join(CACHE_DIR, `${urlHash}.mp4`);
|
|
228
262
|
const audioPath = path.join(tmpDir, 'audio.mp3');
|
|
229
263
|
const framesDir = path.join(tmpDir, 'frames');
|
|
230
264
|
await fs.mkdir(framesDir, { recursive: true });
|
|
231
265
|
try {
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
266
|
+
await pruneVideoCache();
|
|
267
|
+
const cached = await fs.stat(videoPath).then(st => st.isFile() && st.size > 0).catch(() => false);
|
|
268
|
+
if (cached) {
|
|
269
|
+
progress('Dùng video đã tải (cache)...');
|
|
270
|
+
console.log('[analyze_video] Cache HIT:', videoPath);
|
|
271
|
+
// Bump mtime so an actively re-analyzed video isn't evicted by age.
|
|
272
|
+
try {
|
|
273
|
+
const now = new Date();
|
|
274
|
+
await fs.utimes(videoPath, now, now);
|
|
275
|
+
}
|
|
276
|
+
catch { /* ignore */ }
|
|
277
|
+
}
|
|
278
|
+
else {
|
|
279
|
+
progress('Đang tải video...');
|
|
280
|
+
console.log('[analyze_video] Cache MISS, downloading:', url);
|
|
281
|
+
// Download to a temp name then atomically rename in, so a concurrent
|
|
282
|
+
// analyze of the same URL never reads a half-written file.
|
|
283
|
+
const dlTmp = path.join(CACHE_DIR, `${urlHash}.dl-${crypto.randomBytes(4).toString('hex')}.mp4`);
|
|
284
|
+
try {
|
|
285
|
+
await run(YT_DLP, ['-f', 'best[height<=720]/best', '-o', dlTmp, '--no-playlist', '--quiet', url]);
|
|
286
|
+
await fs.rename(dlTmp, videoPath);
|
|
287
|
+
}
|
|
288
|
+
catch (e) {
|
|
289
|
+
await fs.rm(dlTmp, { force: true }).catch(() => { });
|
|
290
|
+
throw e;
|
|
291
|
+
}
|
|
292
|
+
}
|
|
235
293
|
// Grab the original video title (metadata only, no extra download) so the
|
|
236
294
|
// clone idea gets a real name instead of "Clone: www.youtube.com".
|
|
237
295
|
let source_title = '';
|
|
@@ -376,25 +434,22 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
376
434
|
frameBuffers.push({ idx: i, frames: buffers, thumb: thumb || buffers[0], slot });
|
|
377
435
|
}
|
|
378
436
|
}
|
|
379
|
-
// Step 2:
|
|
380
|
-
//
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
const castSamples = frameBuffers
|
|
390
|
-
.filter((_, i) => i % castStep === 0)
|
|
391
|
-
.slice(0, castSampleCount)
|
|
437
|
+
// Step 2: Phase 1 — ONE gpt-4o call returning summary + rich style +
|
|
438
|
+
// master cast + characters. Runs before per-scene describe so the cast
|
|
439
|
+
// context keeps naming consistent across the whole timeline.
|
|
440
|
+
progress('Đang phân tích tổng thể (summary + style + master cast)...');
|
|
441
|
+
// Sample up to 12 frames evenly across the whole video.
|
|
442
|
+
const p1SampleCount = Math.min(12, frameBuffers.length);
|
|
443
|
+
const p1Step = Math.max(1, Math.floor(frameBuffers.length / p1SampleCount));
|
|
444
|
+
const p1Samples = frameBuffers
|
|
445
|
+
.filter((_, i) => i % p1Step === 0)
|
|
446
|
+
.slice(0, p1SampleCount)
|
|
392
447
|
.map(f => f.thumb.toString('base64'));
|
|
393
|
-
const { master_cast_prompt, characters } = await
|
|
448
|
+
const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '');
|
|
394
449
|
const castContext = characters.length
|
|
395
450
|
? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
|
|
396
451
|
: '';
|
|
397
|
-
console.log('[analyze_video]
|
|
452
|
+
console.log('[analyze_video] Phase1 — style:', video_style.slice(0, 80), '| cast:', characters.map(c => c.name).join(', ') || '(none)', '| summary:', video_summary.length, 'chars');
|
|
398
453
|
// Step 3: Vision describe all frames in parallel (batch of 5), passing the
|
|
399
454
|
// cast context so naming stays consistent across the whole timeline.
|
|
400
455
|
progress(`Đang phân tích ${frameBuffers.length} scenes song song...`);
|
|
@@ -429,6 +484,7 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
429
484
|
duration_sec: Math.round(durationSec),
|
|
430
485
|
language: transcript.language || 'unknown',
|
|
431
486
|
transcript: transcript.text || '',
|
|
487
|
+
summary: video_summary,
|
|
432
488
|
video_style,
|
|
433
489
|
master_cast_prompt,
|
|
434
490
|
characters,
|