tuna-agent 0.1.135 → 0.1.137

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@
7
7
  */
8
8
  import { AgentWebSocketClient } from './ws-client.js';
9
9
  export interface AnalyzeVideoResult {
10
+ source_title: string;
10
11
  duration_sec: number;
11
12
  language: string;
12
13
  transcript: string;
@@ -232,6 +232,14 @@ export async function analyzeVideo(url, onProgress) {
232
232
  progress('Đang tải video...');
233
233
  console.log('[analyze_video] Downloading:', url);
234
234
  await run(YT_DLP, ['-f', 'best[height<=720]/best', '-o', videoPath, '--no-playlist', '--quiet', url]);
235
+ // Grab the original video title (metadata only, no extra download) so the
236
+ // clone idea gets a real name instead of "Clone: www.youtube.com".
237
+ let source_title = '';
238
+ try {
239
+ const t = await run(YT_DLP, ['--skip-download', '--no-warnings', '--no-playlist', '--print', '%(title)s', url]);
240
+ source_title = (t.out || '').trim().split('\n')[0].slice(0, 200);
241
+ }
242
+ catch { /* title is best-effort — analysis still proceeds without it */ }
235
243
  progress('Đang tách audio...');
236
244
  console.log('[analyze_video] Extracting audio');
237
245
  await run(FFMPEG, ['-y', '-i', videoPath, '-vn', '-ar', '16000', '-ac', '1', '-b:a', '64k', audioPath, '-loglevel', 'error']);
@@ -261,56 +269,74 @@ export async function analyzeVideo(url, onProgress) {
261
269
  // 90s monologue becomes ~11 scenes instead of one giant clip. A hard
262
270
  // ceiling still bounds runaway vision cost on very long videos.
263
271
  const TARGET_SCENE_SEC = 8;
264
- const HARD_CAP = 600; // ~80 min @ 8s safety bound on vision API spend
265
- const targetScenes = Math.max(1, Math.ceil(durationSec / TARGET_SCENE_SEC));
266
- const MAX_SCENES = Math.min(targetScenes + 20, HARD_CAP);
267
- // Split a [start,end] span into ≤TARGET_SCENE_SEC sub-slots, preserving
268
- // the voiceover on the FIRST sub-slot (the rest are silent continuations
269
- // of the same spoken line so lip-sync isn't duplicated downstream).
270
- const pushSplit = (start, end, voiceover) => {
271
- const span = end - start;
272
- if (span <= TARGET_SCENE_SEC * 1.5) {
273
- sceneSlots.push({ start, end, voiceover });
274
- return;
275
- }
276
- const n = Math.ceil(span / TARGET_SCENE_SEC);
277
- const step = span / n;
278
- for (let k = 0; k < n; k++) {
279
- sceneSlots.push({
280
- start: start + k * step,
281
- end: k === n - 1 ? end : start + (k + 1) * step,
282
- voiceover: k === 0 ? voiceover : '',
283
- });
284
- }
285
- };
272
+ // Safety ceiling ONLY (≈80 min @ 8s). It must NOT be derived from
273
+ // ceil(duration/8): Whisper emits hundreds of 2-4s segments for a talky
274
+ // video, so a tighter cap + slice() silently dropped the back half of
275
+ // the video (13-min clip capped at 118 slots → only first 6:21 kept). The
276
+ // normalise pass below already collapses tiny segments into ~8s scenes,
277
+ // so the natural count is ≈ceil(duration/8) and this cap only guards against runaway input.
278
+ const HARD_CAP = 600;
279
+ const spans = [];
286
280
  if (segments.length > 0) {
287
- if (segments[0].start > SILENCE_THRESHOLD) {
288
- pushSplit(0, segments[0].start, '');
289
- }
281
+ if (segments[0].start > SILENCE_THRESHOLD)
282
+ spans.push({ start: 0, end: segments[0].start, voiceover: '' });
290
283
  for (let i = 0; i < segments.length; i++) {
291
284
  const seg = segments[i];
292
- pushSplit(seg.start, seg.end, seg.text?.trim() || '');
285
+ spans.push({ start: seg.start, end: seg.end, voiceover: seg.text?.trim() || '' });
293
286
  if (i < segments.length - 1) {
294
287
  const gap = segments[i + 1].start - seg.end;
295
- if (gap > SILENCE_THRESHOLD) {
296
- pushSplit(seg.end, segments[i + 1].start, '');
297
- }
288
+ if (gap > SILENCE_THRESHOLD)
289
+ spans.push({ start: seg.end, end: segments[i + 1].start, voiceover: '' });
298
290
  }
299
291
  }
300
292
  const lastEnd = segments[segments.length - 1].end;
301
- if (durationSec - lastEnd > SILENCE_THRESHOLD) {
302
- pushSplit(lastEnd, durationSec, '');
303
- }
293
+ if (durationSec - lastEnd > SILENCE_THRESHOLD)
294
+ spans.push({ start: lastEnd, end: durationSec, voiceover: '' });
304
295
  }
305
296
  else {
306
- // No transcript — split into scenes every 8s (Veo3 clip length)
307
297
  for (let t = 0; t < durationSec; t += TARGET_SCENE_SEC) {
308
- sceneSlots.push({ start: t, end: Math.min(t + TARGET_SCENE_SEC, durationSec), voiceover: '' });
298
+ spans.push({ start: t, end: Math.min(t + TARGET_SCENE_SEC, durationSec), voiceover: '' });
299
+ }
300
+ }
301
+ // 2) Normalise every span to ~TARGET-second scenes covering the FULL
302
+ // timeline:
303
+ // - long span (> 1.5×TARGET): split into ceil(span/TARGET) equal slots
304
+ // - short spans: greedily MERGE consecutive ones until ≈TARGET so a
305
+ // talky video becomes ~ceil(duration/8) Veo3-length scenes instead
306
+ // of hundreds of 2s fragments — crucially WITHOUT dropping the tail.
307
+ for (let i = 0; i < spans.length;) {
308
+ const s = spans[i];
309
+ const span = s.end - s.start;
310
+ if (span > TARGET_SCENE_SEC * 1.5) {
311
+ const n = Math.ceil(span / TARGET_SCENE_SEC);
312
+ const step = span / n;
313
+ for (let k = 0; k < n; k++) {
314
+ sceneSlots.push({
315
+ start: s.start + k * step,
316
+ end: k === n - 1 ? s.end : s.start + (k + 1) * step,
317
+ voiceover: k === 0 ? s.voiceover : '',
318
+ });
319
+ }
320
+ i++;
321
+ }
322
+ else {
323
+ let end = s.end;
324
+ const vo = s.voiceover ? [s.voiceover] : [];
325
+ let j = i + 1;
326
+ while (j < spans.length &&
327
+ (end - s.start) < TARGET_SCENE_SEC &&
328
+ (spans[j].end - s.start) <= TARGET_SCENE_SEC * 1.5) {
329
+ end = spans[j].end;
330
+ if (spans[j].voiceover)
331
+ vo.push(spans[j].voiceover);
332
+ j++;
333
+ }
334
+ sceneSlots.push({ start: s.start, end, voiceover: vo.join(' ') });
335
+ i = j;
309
336
  }
310
337
  }
311
- // Duration-aware cap (was a flat 30 that silently truncated any video
312
- // longer than ~4 min). Re-number after slicing.
313
- const finalSlots = sceneSlots.slice(0, MAX_SCENES);
338
+ // slice() now only ever trims pathological >80-min inputs.
339
+ const finalSlots = sceneSlots.slice(0, HARD_CAP);
314
340
  progress(`Đang cắt ${finalSlots.length} frames và phân tích...`);
315
341
  console.log('[analyze_video] Building', finalSlots.length, 'scenes (segments:', segments.length, ', duration:', durationSec, 's)');
316
342
  // Step 1: Extract frames sequentially. Per scene we grab 3 chronological
@@ -399,6 +425,7 @@ export async function analyzeVideo(url, onProgress) {
399
425
  }
400
426
  const scenes = sceneResults.sort((a, b) => a.scene_number - b.scene_number);
401
427
  return {
428
+ source_title,
402
429
  duration_sec: Math.round(durationSec),
403
430
  language: transcript.language || 'unknown',
404
431
  transcript: transcript.text || '',
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.135",
3
+ "version": "0.1.137",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"