tuna-agent 0.1.138 → 0.1.140
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -166,38 +166,47 @@ Voiceover during this scene: "${voiceoverText || '(none)'}"` },
|
|
|
166
166
|
const data = await res.json();
|
|
167
167
|
return data.choices?.[0]?.message?.content?.trim() || '';
|
|
168
168
|
}
|
|
169
|
-
// Phase 1 (
|
|
170
|
-
//
|
|
171
|
-
//
|
|
172
|
-
//
|
|
173
|
-
//
|
|
174
|
-
//
|
|
175
|
-
//
|
|
176
|
-
//
|
|
177
|
-
|
|
178
|
-
|
|
169
|
+
// Phase 1 (the strong part of AI_Video_Clone, ported): ONE gpt-4o call over
|
|
170
|
+
// frames sampled across the whole video + transcript that returns, together:
|
|
171
|
+
// - video_summary: a cinematic paragraph of the whole story (drives the
|
|
172
|
+
// downstream script-generation prompt — the tool's biggest edge)
|
|
173
|
+
// - video_style: a rich 3-4 sentence aesthetic analysis (medium, palette,
|
|
174
|
+
// lighting, camera language) — replaces the old terse 1-2 sentence
|
|
175
|
+
// visionExtractStyle gpt-4o-mini call entirely
|
|
176
|
+
// - characters[]: the recurring cast for the [CHARACTER CAST LIST] block
|
|
177
|
+
// Folding all three into one call is cheaper than the previous two calls
|
|
178
|
+
// (style + cast) AND uses gpt-4o for style (was gpt-4o-mini). The master-cast
|
|
179
|
+
// prompt is assembled here in the exact format ScriptImporter parses.
|
|
180
|
+
async function visionExtractPhase1(frames, transcript) {
|
|
181
|
+
const empty = {
|
|
182
|
+
video_summary: '',
|
|
183
|
+
video_style: '',
|
|
184
|
+
master_cast_prompt: '',
|
|
185
|
+
characters: [],
|
|
186
|
+
};
|
|
179
187
|
if (!OPENAI_KEY || frames.length === 0)
|
|
180
188
|
return empty;
|
|
181
189
|
try {
|
|
182
190
|
const content = [
|
|
183
191
|
{
|
|
184
192
|
type: 'text',
|
|
185
|
-
text: `Act as a Master Film Director. These frames are sampled across an
|
|
193
|
+
text: `Act as a Master Film Director. These frames are sampled across an ENTIRE video, in order. Use them + the transcript to analyse the whole piece.
|
|
186
194
|
|
|
187
|
-
Transcript context
|
|
195
|
+
Transcript context: "${(transcript || '').slice(0, 4000)}"
|
|
188
196
|
|
|
189
197
|
Return ONLY a JSON object, no markdown fences:
|
|
190
198
|
{
|
|
199
|
+
"video_summary": "One detailed cinematic paragraph (5-8 sentences, English) telling the WHOLE story start to finish: setup, key beats, climax, resolution. This is the narrative spine — be specific about what happens.",
|
|
200
|
+
"video_style": "3-4 sentences (English): artistic medium (2D/3D/live-action/CGI), color palette, lighting, camera language, overall aesthetic vibe. Cinematic, specific.",
|
|
191
201
|
"characters": [
|
|
192
202
|
{ "name": "SHORT_UPPERCASE_LABEL", "description": "one-line English visual description: age/build, face, hair, outfit, colors, distinguishing features" }
|
|
193
203
|
]
|
|
194
204
|
}
|
|
195
205
|
|
|
196
206
|
Rules:
|
|
197
|
-
- name: a stable short uppercase label
|
|
198
|
-
- Only RECURRING subjects worth a reference sheet. Skip one-off
|
|
199
|
-
- description: ENGLISH only, factual, no camera/action words
|
|
200
|
-
- Max 6 characters.`,
|
|
207
|
+
- characters.name: a stable short uppercase label reused for this subject (e.g. "THE BISHOP", "U-94 SUBMARINE"). Max 4 words.
|
|
208
|
+
- Only RECURRING subjects worth a reference sheet. Skip one-off extras. Max 6.
|
|
209
|
+
- characters.description: ENGLISH only, factual, no camera/action words.`,
|
|
201
210
|
},
|
|
202
211
|
];
|
|
203
212
|
for (const b64 of frames) {
|
|
@@ -206,16 +215,16 @@ Rules:
|
|
|
206
215
|
const res = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
207
216
|
method: 'POST',
|
|
208
217
|
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
209
|
-
body: JSON.stringify({ model: 'gpt-4o', max_tokens:
|
|
218
|
+
body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1600, messages: [{ role: 'user', content }] }),
|
|
210
219
|
});
|
|
211
220
|
if (!res.ok)
|
|
212
221
|
return empty;
|
|
213
222
|
const data = await res.json();
|
|
214
|
-
const
|
|
223
|
+
const rawTxt = (data.choices?.[0]?.message?.content || '').trim();
|
|
215
224
|
let parsed = {};
|
|
216
225
|
try {
|
|
217
|
-
const m =
|
|
218
|
-
parsed = JSON.parse(m ? m[0] :
|
|
226
|
+
const m = rawTxt.match(/\{[\s\S]*\}/);
|
|
227
|
+
parsed = JSON.parse(m ? m[0] : rawTxt);
|
|
219
228
|
}
|
|
220
229
|
catch {
|
|
221
230
|
return empty;
|
|
@@ -224,43 +233,22 @@ Rules:
|
|
|
224
233
|
.filter(c => c && c.name && c.description)
|
|
225
234
|
.slice(0, 6)
|
|
226
235
|
.map(c => ({ name: String(c.name).trim(), description: String(c.description).trim() }));
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
catch {
|
|
239
|
-
return empty;
|
|
240
|
-
}
|
|
241
|
-
}
|
|
242
|
-
async function visionExtractStyle(frames) {
|
|
243
|
-
if (!OPENAI_KEY || frames.length === 0)
|
|
244
|
-
return '';
|
|
245
|
-
try {
|
|
246
|
-
const content = [
|
|
247
|
-
{ type: 'text', text: 'Analyze these frames from a video and extract a concise visual style description (1-2 sentences). Focus on: animation style (cartoon, realistic, anime, etc.), color palette, lighting, character design approach (anthropomorphized objects, real people, etc.), and overall aesthetic.\n\nReturn ONLY the style description, nothing else.' },
|
|
248
|
-
];
|
|
249
|
-
for (const b64 of frames) {
|
|
250
|
-
content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
|
|
236
|
+
const video_summary = (parsed.video_summary || '').trim();
|
|
237
|
+
const video_style = (parsed.video_style || '').trim();
|
|
238
|
+
let master_cast_prompt = '';
|
|
239
|
+
if (characters.length > 0) {
|
|
240
|
+
const styleLine = video_style || 'Keep the original video’s visual style, color grading, and lighting.';
|
|
241
|
+
const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
|
|
242
|
+
master_cast_prompt =
|
|
243
|
+
`[AESTHETIC & STYLE]\n${styleLine}\n` +
|
|
244
|
+
`[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
|
|
245
|
+
`[CHARACTER CAST LIST]\n${castList}\n` +
|
|
246
|
+
`[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
|
|
251
247
|
}
|
|
252
|
-
|
|
253
|
-
method: 'POST',
|
|
254
|
-
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
|
|
255
|
-
body: JSON.stringify({ model: 'gpt-4o-mini', max_tokens: 200, messages: [{ role: 'user', content }] }),
|
|
256
|
-
});
|
|
257
|
-
if (!res.ok)
|
|
258
|
-
return '';
|
|
259
|
-
const data = await res.json();
|
|
260
|
-
return (data.choices?.[0]?.message?.content || '').trim().replace(/\*\*/g, '');
|
|
248
|
+
return { video_summary, video_style, master_cast_prompt, characters };
|
|
261
249
|
}
|
|
262
250
|
catch {
|
|
263
|
-
return
|
|
251
|
+
return empty;
|
|
264
252
|
}
|
|
265
253
|
}
|
|
266
254
|
export async function analyzeVideo(url, onProgress) {
|
|
@@ -446,25 +434,22 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
446
434
|
frameBuffers.push({ idx: i, frames: buffers, thumb: thumb || buffers[0], slot });
|
|
447
435
|
}
|
|
448
436
|
}
|
|
449
|
-
// Step 2:
|
|
450
|
-
//
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
const castSamples = frameBuffers
|
|
460
|
-
.filter((_, i) => i % castStep === 0)
|
|
461
|
-
.slice(0, castSampleCount)
|
|
437
|
+
// Step 2: Phase 1 — ONE gpt-4o call returning summary + rich style +
|
|
438
|
+
// master cast + characters. Runs before per-scene describe so the cast
|
|
439
|
+
// context keeps naming consistent across the whole timeline.
|
|
440
|
+
progress('Đang phân tích tổng thể (summary + style + master cast)...');
|
|
441
|
+
// Sample up to 12 frames evenly across the whole video.
|
|
442
|
+
const p1SampleCount = Math.min(12, frameBuffers.length);
|
|
443
|
+
const p1Step = Math.max(1, Math.floor(frameBuffers.length / p1SampleCount));
|
|
444
|
+
const p1Samples = frameBuffers
|
|
445
|
+
.filter((_, i) => i % p1Step === 0)
|
|
446
|
+
.slice(0, p1SampleCount)
|
|
462
447
|
.map(f => f.thumb.toString('base64'));
|
|
463
|
-
const { master_cast_prompt, characters } = await
|
|
448
|
+
const { video_summary, video_style, master_cast_prompt, characters } = await visionExtractPhase1(p1Samples, transcript.text || '');
|
|
464
449
|
const castContext = characters.length
|
|
465
450
|
? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
|
|
466
451
|
: '';
|
|
467
|
-
console.log('[analyze_video]
|
|
452
|
+
console.log('[analyze_video] Phase1 — style:', video_style.slice(0, 80), '| cast:', characters.map(c => c.name).join(', ') || '(none)', '| summary:', video_summary.length, 'chars');
|
|
468
453
|
// Step 3: Vision describe all frames in parallel (batch of 5), passing the
|
|
469
454
|
// cast context so naming stays consistent across the whole timeline.
|
|
470
455
|
progress(`Đang phân tích ${frameBuffers.length} scenes song song...`);
|
|
@@ -499,6 +484,7 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
499
484
|
duration_sec: Math.round(durationSec),
|
|
500
485
|
language: transcript.language || 'unknown',
|
|
501
486
|
transcript: transcript.text || '',
|
|
487
|
+
summary: video_summary,
|
|
502
488
|
video_style,
|
|
503
489
|
master_cast_prompt,
|
|
504
490
|
characters,
|
package/dist/utils/claude-cli.js
CHANGED
|
@@ -103,7 +103,16 @@ export function runClaude(options) {
|
|
|
103
103
|
const fileList = options.inputFiles.map(f => `- ${f}`).join('\n');
|
|
104
104
|
prompt += `\n\n[User attached ${options.inputFiles.length} image(s). Read these files to see the images:]\n${fileList}`;
|
|
105
105
|
}
|
|
106
|
-
|
|
106
|
+
// Interactive runs (permission protocol) must keep stdin free for y/n
|
|
107
|
+
// replies, so the prompt stays an argv. Non-interactive runs pipe the
|
|
108
|
+
// prompt via stdin instead: `claude -p` reads the prompt from stdin when
|
|
109
|
+
// no prompt arg is given. Passing a 40 KB+ clone prompt as an argv blows
|
|
110
|
+
// past ARG_MAX → spawn E2BIG. stdin has no such limit.
|
|
111
|
+
const useInteractiveStdin = !!options.permissionMode && !!options.onPermissionRequest;
|
|
112
|
+
const promptViaStdin = !useInteractiveStdin;
|
|
113
|
+
const args = promptViaStdin
|
|
114
|
+
? ['-p', '--output-format', format]
|
|
115
|
+
: ['-p', prompt, '--output-format', format];
|
|
107
116
|
if (options.allowedTools?.length) {
|
|
108
117
|
args.push('--allowedTools', options.allowedTools.join(','));
|
|
109
118
|
}
|
|
@@ -144,7 +153,6 @@ export function runClaude(options) {
|
|
|
144
153
|
args.push('--include-partial-messages');
|
|
145
154
|
}
|
|
146
155
|
}
|
|
147
|
-
const useInteractiveStdin = !!options.permissionMode && !!options.onPermissionRequest;
|
|
148
156
|
console.log(`[claude-cli] Spawning with args: ${args.filter(a => !a.startsWith('sk-') && a.length < 200).join(' ')}`);
|
|
149
157
|
const claudeBin = getClaudeBinPath();
|
|
150
158
|
// Ensure PATH includes common bin dirs so shebang `#!/usr/bin/env node` resolves
|
|
@@ -174,9 +182,22 @@ export function runClaude(options) {
|
|
|
174
182
|
}
|
|
175
183
|
const proc = spawn(claudeBin, args, {
|
|
176
184
|
cwd: options.cwd,
|
|
177
|
-
|
|
185
|
+
// stdin piped for both: interactive permission replies, or feeding the
|
|
186
|
+
// prompt to a non-interactive `claude -p`.
|
|
187
|
+
stdio: [(useInteractiveStdin || promptViaStdin) ? 'pipe' : 'ignore', 'pipe', 'pipe'],
|
|
178
188
|
env: spawnEnv,
|
|
179
189
|
});
|
|
190
|
+
// Feed the prompt via stdin for non-interactive runs (avoids argv E2BIG
|
|
191
|
+
// on large clone prompts). Write then close so `claude -p` starts.
|
|
192
|
+
if (promptViaStdin) {
|
|
193
|
+
try {
|
|
194
|
+
proc.stdin.write(prompt);
|
|
195
|
+
proc.stdin.end();
|
|
196
|
+
}
|
|
197
|
+
catch (e) {
|
|
198
|
+
console.error('[claude-cli] failed writing prompt to stdin:', e.message);
|
|
199
|
+
}
|
|
200
|
+
}
|
|
180
201
|
// 30-minute timeout by default
|
|
181
202
|
const timeoutMs = options.timeoutMs ?? 30 * 60 * 1000;
|
|
182
203
|
const timeoutTimer = setTimeout(() => {
|