tuna-agent 0.1.132 → 0.1.134

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -964,7 +964,7 @@ export class ClaudeCodeAdapter {
964
964
  }
965
965
  }
966
966
  // Filter 1: Quality gate — reject garbage rules
967
- const MAX_LEARNED_RULES = 20;
967
+ const MAX_LEARNED_RULES = 50;
968
968
  const MIN_CONFIDENCE = 2;
969
969
  const qualityPatterns = patterns.filter(p => {
970
970
  const r = p.rule.trim();
@@ -998,12 +998,50 @@ export class ClaudeCodeAdapter {
998
998
  }
999
999
  return true;
1000
1000
  });
1001
- // Cap total rules if already at max, skip adding more
1002
- if (existingRules.length >= MAX_LEARNED_RULES) {
1003
- console.log(`[Self-Improve] Already at max ${MAX_LEARNED_RULES} rules — skipping`);
1004
- return;
1001
+ // If at max, try rotation: replace lowest-confidence existing rules with higher-confidence new ones
1002
+ let slotsAvailable = MAX_LEARNED_RULES - existingRules.length;
1003
+ let rotatedOut = [];
1004
+ if (slotsAvailable <= 0 && qualityPatterns.length > 0) {
1005
+ // Parse existing rules with their confidence values and line text
1006
+ const existingWithConf = [];
1007
+ if (sectionIdx !== -1) {
1008
+ const sectionContent = existingContent.substring(sectionIdx + SECTION_HEADER.length);
1009
+ const nextSection = sectionContent.indexOf('\n## ');
1010
+ const rulesText = nextSection !== -1 ? sectionContent.substring(0, nextSection) : sectionContent;
1011
+ for (const line of rulesText.split('\n')) {
1012
+ const trimmed = line.trim();
1013
+ if (trimmed.startsWith('- ')) {
1014
+ const confMatch = trimmed.match(/\(confidence:\s*([\d.]+)\)\s*$/);
1015
+ const conf = confMatch ? parseFloat(confMatch[1]) : 1;
1016
+ existingWithConf.push({ text: trimmed.substring(2), confidence: conf, line: trimmed });
1017
+ }
1018
+ }
1019
+ }
1020
+ // Sort existing by confidence ascending (weakest first)
1021
+ existingWithConf.sort((a, b) => a.confidence - b.confidence);
1022
+ // Find new patterns with higher confidence than weakest existing rules
1023
+ const maxNewConfidence = Math.max(...qualityPatterns.map(p => p.confidence));
1024
+ const weakest = existingWithConf.filter(r => r.confidence < maxNewConfidence);
1025
+ if (weakest.length > 0) {
1026
+ // Rotate out up to 3 weakest rules
1027
+ rotatedOut = weakest.slice(0, 3);
1028
+ slotsAvailable = rotatedOut.length;
1029
+ // Remove rotated-out rules from existingContent
1030
+ for (const old of rotatedOut) {
1031
+ existingContent = existingContent.replace(old.line + '\n', '');
1032
+ // Also remove from existingRules for dedup check
1033
+ const ruleText = old.line.replace(/^- /, '').replace(/\s*\(confidence:\s*[\d.]+\)\s*$/, '');
1034
+ const idx = existingRules.indexOf(ruleText);
1035
+ if (idx !== -1)
1036
+ existingRules.splice(idx, 1);
1037
+ }
1038
+ console.log(`[Self-Improve] Rotation: removing ${rotatedOut.length} weak rules (confidence: ${rotatedOut.map(r => r.confidence).join(', ')})`);
1039
+ }
1040
+ else {
1041
+ console.log(`[Self-Improve] At max ${MAX_LEARNED_RULES} rules, no weaker rules to rotate out — skipping`);
1042
+ return;
1043
+ }
1005
1044
  }
1006
- const slotsAvailable = MAX_LEARNED_RULES - existingRules.length;
1007
1045
  // Filter 2: skip patterns similar to ANY existing rule
1008
1046
  const newPatterns = qualityPatterns.filter(p => {
1009
1047
  return !existingRules.some(existing => ClaudeCodeAdapter.isSimilarRule(p.rule, existing));
@@ -11,6 +11,11 @@ export interface AnalyzeVideoResult {
11
11
  language: string;
12
12
  transcript: string;
13
13
  video_style: string;
14
+ master_cast_prompt: string;
15
+ characters: Array<{
16
+ name: string;
17
+ description: string;
18
+ }>;
14
19
  segments: Array<{
15
20
  start: number;
16
21
  end: number;
@@ -74,9 +74,12 @@ ${rawText}`,
74
74
  return rawText;
75
75
  }
76
76
  }
77
- async function visionDescribe(frameB64, voiceoverText) {
77
+ async function visionDescribe(frameB64, voiceoverText, castContext = '') {
78
78
  if (!OPENAI_KEY)
79
79
  return '';
80
+ const castBlock = castContext
81
+ ? `\n\nKNOWN CHARACTER CAST (reuse these EXACT names when a subject appears — do NOT invent new labels for the same subject):\n${castContext}\n`
82
+ : '';
80
83
  const res = await fetch('https://api.openai.com/v1/chat/completions', {
81
84
  method: 'POST',
82
85
  headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
@@ -86,8 +89,8 @@ async function visionDescribe(frameB64, voiceoverText) {
86
89
  messages: [{
87
90
  role: 'user',
88
91
  content: [
89
- { type: 'text', text: `Describe this frame in detail (4-6 sentences, English). Include:
90
- - Characters: appearance (shape, color, size), facial expression, what they're doing
92
+ { type: 'text', text: `Describe this frame in detail (4-6 sentences, English).${castBlock} Include:
93
+ - Characters: name them using the KNOWN CHARACTER CAST labels above when they appear; appearance (shape, color, size), facial expression, what they're doing
91
94
  - Physical connections: Are characters physically joined/attached/fused together (e.g. organs connected at a junction, body parts linked)? Or are they separate/independent? Be VERY specific — "physically attached at Y-junction" is different from "standing next to each other"
92
95
  - Spatial positions: exact position of each character (left/right/above/below/center), distance between them
93
96
  - Environment: setting, lighting, color palette, atmosphere
@@ -105,6 +108,79 @@ Voiceover at this moment: "${voiceoverText || '(none)'}"` },
105
108
  const data = await res.json();
106
109
  return data.choices?.[0]?.message?.content?.trim() || '';
107
110
  }
111
+ // Phase 1 (borrowed from AI_Video_Clone): extract the recurring character
112
+ // cast ONCE from frames sampled across the whole video + the transcript.
113
+ // Returns a master-cast prompt block in the exact [AESTHETIC & STYLE] /
114
+ // [CHARACTER CAST LIST] format that channel-manager's ScriptImporter parses,
115
+ // plus a structured characters[] list. Doing this upfront (a) populates
116
+ // idea.master_cast_prompt so FlowKit has a reference sheet to generate, and
117
+ // (b) gives every per-scene describe call a consistent naming vocabulary so
118
+ // scene 1 and scene 50 refer to "THE BISHOP" instead of "a man in a suit".
119
+ async function visionExtractMasterCast(frames, transcript, videoStyle) {
120
+ const empty = { master_cast_prompt: '', characters: [] };
121
+ if (!OPENAI_KEY || frames.length === 0)
122
+ return empty;
123
+ try {
124
+ const content = [
125
+ {
126
+ type: 'text',
127
+ text: `Act as a Master Film Director. These frames are sampled across an entire video. Identify EVERY recurring character/subject (people, anthropomorphic objects, animals, mascots).
128
+
129
+ Transcript context (may name characters): "${(transcript || '').slice(0, 1500)}"
130
+
131
+ Return ONLY a JSON object, no markdown fences:
132
+ {
133
+ "characters": [
134
+ { "name": "SHORT_UPPERCASE_LABEL", "description": "one-line English visual description: age/build, face, hair, outfit, colors, distinguishing features" }
135
+ ]
136
+ }
137
+
138
+ Rules:
139
+ - name: a stable short uppercase label you will reuse for this subject (e.g. "THE BISHOP", "RED CAR", "NARRATOR DOG"). Max 4 words.
140
+ - Only RECURRING subjects worth a reference sheet. Skip one-off background extras.
141
+ - description: ENGLISH only, factual, no camera/action words.
142
+ - Max 6 characters.`,
143
+ },
144
+ ];
145
+ for (const b64 of frames) {
146
+ content.push({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${b64}` } });
147
+ }
148
+ const res = await fetch('https://api.openai.com/v1/chat/completions', {
149
+ method: 'POST',
150
+ headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
151
+ body: JSON.stringify({ model: 'gpt-4o', max_tokens: 1200, messages: [{ role: 'user', content }] }),
152
+ });
153
+ if (!res.ok)
154
+ return empty;
155
+ const data = await res.json();
156
+ const raw = (data.choices?.[0]?.message?.content || '').trim();
157
+ let parsed = {};
158
+ try {
159
+ const m = raw.match(/\{[\s\S]*\}/);
160
+ parsed = JSON.parse(m ? m[0] : raw);
161
+ }
162
+ catch {
163
+ return empty;
164
+ }
165
+ const characters = (parsed.characters || [])
166
+ .filter(c => c && c.name && c.description)
167
+ .slice(0, 6)
168
+ .map(c => ({ name: String(c.name).trim(), description: String(c.description).trim() }));
169
+ if (characters.length === 0)
170
+ return empty;
171
+ // Assemble the verbatim-style master cast block ScriptImporter expects.
172
+ const styleLine = (videoStyle || '').trim() || 'Keep the original video’s visual style, color grading, and lighting.';
173
+ const castList = characters.map(c => `- ${c.name}: ${c.description}`).join('\n');
174
+ const master_cast_prompt = `[AESTHETIC & STYLE]\n${styleLine}\n` +
175
+ `[COMPOSITION & LAYOUT]\nCharacter Reference Sheet. Full-body side-by-side.\n` +
176
+ `[CHARACTER CAST LIST]\n${castList}\n` +
177
+ `[TECHNICAL SPECIFICATIONS]\nHigh detail, 8k resolution, consistent facial structures across all frames.`;
178
+ return { master_cast_prompt, characters };
179
+ }
180
+ catch {
181
+ return empty;
182
+ }
183
+ }
108
184
  async function visionExtractStyle(frames) {
109
185
  if (!OPENAI_KEY || frames.length === 0)
110
186
  return '';
@@ -164,37 +240,61 @@ export async function analyzeVideo(url, onProgress) {
164
240
  const segments = transcript.segments || [];
165
241
  const sceneSlots = [];
166
242
  const SILENCE_THRESHOLD = 5; // seconds — gaps longer than this become their own scene
167
- const MAX_SCENES = 30;
243
+ // Veo3 clips are 8s. The KPI (borrowed from AI_Video_Clone) is: never
244
+ // under-segment — a 40-min video must yield ~ceil(duration/8) scenes, not
245
+ // a fixed 30. Long transcript segments are SPLIT into 8s sub-slots so a
246
+ // 90s monologue becomes ~11 scenes instead of one giant clip. A hard
247
+ // ceiling still bounds runaway vision cost on very long videos.
248
+ const TARGET_SCENE_SEC = 8;
249
+ const HARD_CAP = 600; // ~80 min @ 8s — safety bound on vision API spend
250
+ const targetScenes = Math.max(1, Math.ceil(durationSec / TARGET_SCENE_SEC));
251
+ const MAX_SCENES = Math.min(targetScenes + 20, HARD_CAP);
252
+ // Split a [start,end] span into ≤TARGET_SCENE_SEC sub-slots, preserving
253
+ // the voiceover on the FIRST sub-slot (the rest are silent continuations
254
+ // of the same spoken line so lip-sync isn't duplicated downstream).
255
+ const pushSplit = (start, end, voiceover) => {
256
+ const span = end - start;
257
+ if (span <= TARGET_SCENE_SEC * 1.5) {
258
+ sceneSlots.push({ start, end, voiceover });
259
+ return;
260
+ }
261
+ const n = Math.ceil(span / TARGET_SCENE_SEC);
262
+ const step = span / n;
263
+ for (let k = 0; k < n; k++) {
264
+ sceneSlots.push({
265
+ start: start + k * step,
266
+ end: k === n - 1 ? end : start + (k + 1) * step,
267
+ voiceover: k === 0 ? voiceover : '',
268
+ });
269
+ }
270
+ };
168
271
  if (segments.length > 0) {
169
- // Add silence scene before first segment if gap > threshold
170
272
  if (segments[0].start > SILENCE_THRESHOLD) {
171
- sceneSlots.push({ start: 0, end: segments[0].start, voiceover: '' });
273
+ pushSplit(0, segments[0].start, '');
172
274
  }
173
275
  for (let i = 0; i < segments.length; i++) {
174
276
  const seg = segments[i];
175
- sceneSlots.push({ start: seg.start, end: seg.end, voiceover: seg.text?.trim() || '' });
176
- // Add silence scene between segments if gap > threshold
277
+ pushSplit(seg.start, seg.end, seg.text?.trim() || '');
177
278
  if (i < segments.length - 1) {
178
279
  const gap = segments[i + 1].start - seg.end;
179
280
  if (gap > SILENCE_THRESHOLD) {
180
- sceneSlots.push({ start: seg.end, end: segments[i + 1].start, voiceover: '' });
281
+ pushSplit(seg.end, segments[i + 1].start, '');
181
282
  }
182
283
  }
183
284
  }
184
- // Add silence scene after last segment if gap > threshold
185
285
  const lastEnd = segments[segments.length - 1].end;
186
286
  if (durationSec - lastEnd > SILENCE_THRESHOLD) {
187
- sceneSlots.push({ start: lastEnd, end: durationSec, voiceover: '' });
287
+ pushSplit(lastEnd, durationSec, '');
188
288
  }
189
289
  }
190
290
  else {
191
291
  // No transcript — split into scenes every 8s (Veo3 clip length)
192
- const interval = 8;
193
- for (let t = 0; t < durationSec; t += interval) {
194
- sceneSlots.push({ start: t, end: Math.min(t + interval, durationSec), voiceover: '' });
292
+ for (let t = 0; t < durationSec; t += TARGET_SCENE_SEC) {
293
+ sceneSlots.push({ start: t, end: Math.min(t + TARGET_SCENE_SEC, durationSec), voiceover: '' });
195
294
  }
196
295
  }
197
- // Cap max scenes
296
+ // Duration-aware cap (was a flat 30 — that silently truncated any video
297
+ // longer than ~4 min). Re-number after slicing.
198
298
  const finalSlots = sceneSlots.slice(0, MAX_SCENES);
199
299
  progress(`Đang cắt ${finalSlots.length} frames và phân tích...`);
200
300
  console.log('[analyze_video] Building', finalSlots.length, 'scenes (segments:', segments.length, ', duration:', durationSec, 's)');
@@ -214,7 +314,27 @@ export async function analyzeVideo(url, onProgress) {
214
314
  console.warn('[analyze_video] Frame extract failed for scene', i, msg);
215
315
  }
216
316
  }
217
- // Step 2: Vision describe all frames in parallel (batch of 5)
317
+ // Step 2: Style + Master Cast FIRST (Phase 1), so per-scene describe can
318
+ // reuse consistent character labels (the AI_Video_Clone lesson).
319
+ progress('Đang phân tích video style...');
320
+ const styleSamples = frameBuffers.slice(0, 3).map(f => f.buf.toString('base64'));
321
+ const video_style = await visionExtractStyle(styleSamples);
322
+ console.log('[analyze_video] Video style:', video_style.substring(0, 100));
323
+ progress('Đang trích xuất dàn nhân vật (Master Cast)...');
324
+ // Sample up to 12 frames evenly across the whole video for cast detection.
325
+ const castSampleCount = Math.min(12, frameBuffers.length);
326
+ const castStep = Math.max(1, Math.floor(frameBuffers.length / castSampleCount));
327
+ const castSamples = frameBuffers
328
+ .filter((_, i) => i % castStep === 0)
329
+ .slice(0, castSampleCount)
330
+ .map(f => f.buf.toString('base64'));
331
+ const { master_cast_prompt, characters } = await visionExtractMasterCast(castSamples, transcript.text || '', video_style);
332
+ const castContext = characters.length
333
+ ? characters.map(c => `- ${c.name}: ${c.description}`).join('\n')
334
+ : '';
335
+ console.log('[analyze_video] Master cast:', characters.map(c => c.name).join(', ') || '(none)');
336
+ // Step 3: Vision describe all frames in parallel (batch of 5), passing the
337
+ // cast context so naming stays consistent across the whole timeline.
218
338
  progress(`Đang phân tích ${frameBuffers.length} scenes song song...`);
219
339
  const BATCH_SIZE = 5;
220
340
  const sceneResults = [];
@@ -223,7 +343,7 @@ export async function analyzeVideo(url, onProgress) {
223
343
  progress(`Đang phân tích scene ${b + 1}-${Math.min(b + BATCH_SIZE, frameBuffers.length)}/${frameBuffers.length}...`);
224
344
  const results = await Promise.all(batch.map(async ({ idx, buf, slot }) => {
225
345
  try {
226
- const visual_description = await visionDescribe(buf.toString('base64'), slot.voiceover);
346
+ const visual_description = await visionDescribe(buf.toString('base64'), slot.voiceover, castContext);
227
347
  return {
228
348
  scene_number: idx + 1,
229
349
  timestamp_start: Math.round(slot.start * 10) / 10,
@@ -242,16 +362,13 @@ export async function analyzeVideo(url, onProgress) {
242
362
  sceneResults.push(...results.filter((r) => r !== null));
243
363
  }
244
364
  const scenes = sceneResults.sort((a, b) => a.scene_number - b.scene_number);
245
- // Extract video style from 3 sample frames
246
- progress('Đang phân tích video style...');
247
- const styleSamples = frameBuffers.slice(0, 3).map(f => f.buf.toString('base64'));
248
- const video_style = await visionExtractStyle(styleSamples);
249
- console.log('[analyze_video] Video style:', video_style.substring(0, 100));
250
365
  return {
251
366
  duration_sec: Math.round(durationSec),
252
367
  language: transcript.language || 'unknown',
253
368
  transcript: transcript.text || '',
254
369
  video_style,
370
+ master_cast_prompt,
371
+ characters,
255
372
  segments: segments.map((s) => ({ start: s.start, end: s.end, text: s.text })),
256
373
  scenes,
257
374
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.132",
3
+ "version": "0.1.134",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"