tuna-agent 0.1.135 → 0.1.137
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -232,6 +232,14 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
232
232
|
progress('Đang tải video...');
|
|
233
233
|
console.log('[analyze_video] Downloading:', url);
|
|
234
234
|
await run(YT_DLP, ['-f', 'best[height<=720]/best', '-o', videoPath, '--no-playlist', '--quiet', url]);
|
|
235
|
+
// Grab the original video title (metadata only, no extra download) so the
|
|
236
|
+
// clone idea gets a real name instead of "Clone: www.youtube.com".
|
|
237
|
+
let source_title = '';
|
|
238
|
+
try {
|
|
239
|
+
const t = await run(YT_DLP, ['--skip-download', '--no-warnings', '--no-playlist', '--print', '%(title)s', url]);
|
|
240
|
+
source_title = (t.out || '').trim().split('\n')[0].slice(0, 200);
|
|
241
|
+
}
|
|
242
|
+
catch { /* title is best-effort — analysis still proceeds without it */ }
|
|
235
243
|
progress('Đang tách audio...');
|
|
236
244
|
console.log('[analyze_video] Extracting audio');
|
|
237
245
|
await run(FFMPEG, ['-y', '-i', videoPath, '-vn', '-ar', '16000', '-ac', '1', '-b:a', '64k', audioPath, '-loglevel', 'error']);
|
|
@@ -261,56 +269,74 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
261
269
|
// 90s monologue becomes ~11 scenes instead of one giant clip. A hard
|
|
262
270
|
// ceiling still bounds runaway vision cost on very long videos.
|
|
263
271
|
const TARGET_SCENE_SEC = 8;
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
//
|
|
268
|
-
//
|
|
269
|
-
//
|
|
270
|
-
const
|
|
271
|
-
|
|
272
|
-
if (span <= TARGET_SCENE_SEC * 1.5) {
|
|
273
|
-
sceneSlots.push({ start, end, voiceover });
|
|
274
|
-
return;
|
|
275
|
-
}
|
|
276
|
-
const n = Math.ceil(span / TARGET_SCENE_SEC);
|
|
277
|
-
const step = span / n;
|
|
278
|
-
for (let k = 0; k < n; k++) {
|
|
279
|
-
sceneSlots.push({
|
|
280
|
-
start: start + k * step,
|
|
281
|
-
end: k === n - 1 ? end : start + (k + 1) * step,
|
|
282
|
-
voiceover: k === 0 ? voiceover : '',
|
|
283
|
-
});
|
|
284
|
-
}
|
|
285
|
-
};
|
|
272
|
+
// Safety ceiling ONLY (≈80 min @ 8s). It must NOT be derived from
|
|
273
|
+
// ceil(duration/8): Whisper emits hundreds of 2-4s segments for a talky
|
|
274
|
+
// video, so a tighter cap + slice() silently dropped the back half of
|
|
275
|
+
// the video (13-min clip → 118 slots → only first 6:21 kept). The
|
|
276
|
+
// normalise pass below already collapses tiny segments into ~8s scenes,
|
|
277
|
+
// so the natural count ≈ ceil(duration/8) and this only guards runaway.
|
|
278
|
+
const HARD_CAP = 600;
|
|
279
|
+
const spans = [];
|
|
286
280
|
if (segments.length > 0) {
|
|
287
|
-
if (segments[0].start > SILENCE_THRESHOLD)
|
|
288
|
-
|
|
289
|
-
}
|
|
281
|
+
if (segments[0].start > SILENCE_THRESHOLD)
|
|
282
|
+
spans.push({ start: 0, end: segments[0].start, voiceover: '' });
|
|
290
283
|
for (let i = 0; i < segments.length; i++) {
|
|
291
284
|
const seg = segments[i];
|
|
292
|
-
|
|
285
|
+
spans.push({ start: seg.start, end: seg.end, voiceover: seg.text?.trim() || '' });
|
|
293
286
|
if (i < segments.length - 1) {
|
|
294
287
|
const gap = segments[i + 1].start - seg.end;
|
|
295
|
-
if (gap > SILENCE_THRESHOLD)
|
|
296
|
-
|
|
297
|
-
}
|
|
288
|
+
if (gap > SILENCE_THRESHOLD)
|
|
289
|
+
spans.push({ start: seg.end, end: segments[i + 1].start, voiceover: '' });
|
|
298
290
|
}
|
|
299
291
|
}
|
|
300
292
|
const lastEnd = segments[segments.length - 1].end;
|
|
301
|
-
if (durationSec - lastEnd > SILENCE_THRESHOLD)
|
|
302
|
-
|
|
303
|
-
}
|
|
293
|
+
if (durationSec - lastEnd > SILENCE_THRESHOLD)
|
|
294
|
+
spans.push({ start: lastEnd, end: durationSec, voiceover: '' });
|
|
304
295
|
}
|
|
305
296
|
else {
|
|
306
|
-
// No transcript — split into scenes every 8s (Veo3 clip length)
|
|
307
297
|
for (let t = 0; t < durationSec; t += TARGET_SCENE_SEC) {
|
|
308
|
-
|
|
298
|
+
spans.push({ start: t, end: Math.min(t + TARGET_SCENE_SEC, durationSec), voiceover: '' });
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
// 2) Normalise every span to ~TARGET-second scenes covering the FULL
|
|
302
|
+
// timeline:
|
|
303
|
+
// - long span (> 1.5×TARGET): split into ceil(span/TARGET) equal slots
|
|
304
|
+
// - short spans: greedily MERGE consecutive ones until ≈TARGET so a
|
|
305
|
+
// talky video becomes ~ceil(duration/8) Veo3-length scenes instead
|
|
306
|
+
// of hundreds of 2s fragments — crucially WITHOUT dropping the tail.
|
|
307
|
+
for (let i = 0; i < spans.length;) {
|
|
308
|
+
const s = spans[i];
|
|
309
|
+
const span = s.end - s.start;
|
|
310
|
+
if (span > TARGET_SCENE_SEC * 1.5) {
|
|
311
|
+
const n = Math.ceil(span / TARGET_SCENE_SEC);
|
|
312
|
+
const step = span / n;
|
|
313
|
+
for (let k = 0; k < n; k++) {
|
|
314
|
+
sceneSlots.push({
|
|
315
|
+
start: s.start + k * step,
|
|
316
|
+
end: k === n - 1 ? s.end : s.start + (k + 1) * step,
|
|
317
|
+
voiceover: k === 0 ? s.voiceover : '',
|
|
318
|
+
});
|
|
319
|
+
}
|
|
320
|
+
i++;
|
|
321
|
+
}
|
|
322
|
+
else {
|
|
323
|
+
let end = s.end;
|
|
324
|
+
const vo = s.voiceover ? [s.voiceover] : [];
|
|
325
|
+
let j = i + 1;
|
|
326
|
+
while (j < spans.length &&
|
|
327
|
+
(end - s.start) < TARGET_SCENE_SEC &&
|
|
328
|
+
(spans[j].end - s.start) <= TARGET_SCENE_SEC * 1.5) {
|
|
329
|
+
end = spans[j].end;
|
|
330
|
+
if (spans[j].voiceover)
|
|
331
|
+
vo.push(spans[j].voiceover);
|
|
332
|
+
j++;
|
|
333
|
+
}
|
|
334
|
+
sceneSlots.push({ start: s.start, end, voiceover: vo.join(' ') });
|
|
335
|
+
i = j;
|
|
309
336
|
}
|
|
310
337
|
}
|
|
311
|
-
//
|
|
312
|
-
|
|
313
|
-
const finalSlots = sceneSlots.slice(0, MAX_SCENES);
|
|
338
|
+
// slice() now only ever trims pathological >80-min inputs.
|
|
339
|
+
const finalSlots = sceneSlots.slice(0, HARD_CAP);
|
|
314
340
|
progress(`Đang cắt ${finalSlots.length} frames và phân tích...`);
|
|
315
341
|
console.log('[analyze_video] Building', finalSlots.length, 'scenes (segments:', segments.length, ', duration:', durationSec, 's)');
|
|
316
342
|
// Step 1: Extract frames sequentially. Per scene we grab 3 chronological
|
|
@@ -399,6 +425,7 @@ export async function analyzeVideo(url, onProgress) {
|
|
|
399
425
|
}
|
|
400
426
|
const scenes = sceneResults.sort((a, b) => a.scene_number - b.scene_number);
|
|
401
427
|
return {
|
|
428
|
+
source_title,
|
|
402
429
|
duration_sec: Math.round(durationSec),
|
|
403
430
|
language: transcript.language || 'unknown',
|
|
404
431
|
transcript: transcript.text || '',
|