tuna-agent 0.1.119 → 0.1.120

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
/**
 * Analyze Video Handler
 * Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
 * extracts frames per segment, describes each frame via GPT-4o vision.
 *
 * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY env var.
 */
import { AgentWebSocketClient } from './ws-client.js';
/** Result payload produced by {@link analyzeVideo}. */
export interface AnalyzeVideoResult {
    /** Total video length in whole seconds; 0 when ffprobe failed. */
    duration_sec: number;
    /** Language detected by Whisper, or 'unknown'. */
    language: string;
    /** Full transcript text (AI-corrected when the correction call succeeds). */
    transcript: string;
    /** Raw Whisper segments; times are in seconds from the start of the video. */
    segments: Array<{
        start: number;
        end: number;
        text: string;
    }>;
    /** One scene per transcript segment, sampled at the segment midpoint. */
    scenes: Array<{
        /** 1-based index of the segment this scene came from. */
        scene_number: number;
        /** Segment start, rounded to 0.1 s. */
        timestamp_start: number;
        /** Segment end, rounded to 0.1 s. */
        timestamp_end: number;
        /** Base64-encoded JPEG frame (scaled to 640 px wide). */
        thumbnail_base64: string;
        /** Trimmed transcript text spoken during this segment. */
        voiceover: string;
        /** Vision-model description of the frame (empty when the call failed). */
        visual_description: string;
    }>;
    /** Set to true (with `error`) when the pipeline failed. */
    isError?: boolean;
    /** Error message accompanying isError. */
    error?: string;
}
export declare function analyzeVideo(url: string): Promise<AnalyzeVideoResult>;
/** Extension task handler — wraps analyzeVideo with WS response */
export declare function handleAnalyzeVideo(ws: AgentWebSocketClient, code: string, taskId: string, url: string): Promise<void>;
@@ -0,0 +1,180 @@
1
+ /**
2
+ * Analyze Video Handler
3
+ * Downloads YouTube video via yt-dlp, extracts audio, transcribes via Whisper,
4
+ * extracts frames per segment, describes each frame via GPT-4o vision.
5
+ *
6
+ * Requires on the host machine: yt-dlp, ffmpeg, ffprobe, OPENAI_API_KEY env var.
7
+ */
8
+ import { spawn } from 'child_process';
9
+ import { promises as fs } from 'fs';
10
+ import path from 'path';
11
+ import os from 'os';
12
+ import crypto from 'crypto';
13
// OpenAI API key; Whisper/vision helpers fail fast or degrade when unset.
const OPENAI_KEY = process.env.OPENAI_API_KEY || '';
// External tool binaries. Defaults resolve via $PATH so the published package
// works on any machine (the previous defaults hard-coded one developer's home
// directory and Linux-only /usr/bin paths); override with *_BIN env vars when
// a tool lives somewhere spawn cannot find it.
const YT_DLP = process.env.YT_DLP_BIN || 'yt-dlp';
const FFMPEG = process.env.FFMPEG_BIN || 'ffmpeg';
const FFPROBE = process.env.FFPROBE_BIN || 'ffprobe';
17
/**
 * Spawn `cmd` with `args`, capturing stdout and stderr as strings.
 *
 * Resolves with `{ out, err }` on exit code 0; rejects with a truncated
 * stderr excerpt on a non-zero exit, and rejects on spawn failure
 * (e.g. ENOENT when the binary is missing) so callers never hang.
 */
function run(cmd, args, opts = {}) {
    return new Promise((resolve, reject) => {
        const p = spawn(cmd, args, { stdio: ['ignore', 'pipe', 'pipe'], ...opts });
        let out = '', err = '';
        p.stdout.on('data', (d) => out += d);
        p.stderr.on('data', (d) => err += d);
        // Without this handler a failed spawn (missing binary, EACCES) emits
        // 'error' and never 'close', leaving the promise pending forever.
        p.on('error', reject);
        p.on('close', (code) => code === 0 ? resolve({ out, err }) : reject(new Error(`${cmd} exit ${code}: ${err.slice(0, 500)}`)));
    });
}
26
/**
 * Upload the extracted audio file to OpenAI Whisper and return the
 * verbose_json transcription (text, language, per-segment timestamps).
 * Throws when OPENAI_API_KEY is unset or the API responds non-2xx.
 */
async function whisperTranscribe(audioPath) {
    if (!OPENAI_KEY) {
        throw new Error('OPENAI_API_KEY not set');
    }
    const audio = await fs.readFile(audioPath);
    const form = new FormData();
    form.append('file', new Blob([audio], { type: 'audio/mpeg' }), path.basename(audioPath));
    const fields = [
        ['model', 'whisper-1'],
        ['response_format', 'verbose_json'],
        ['timestamp_granularities[]', 'segment'],
    ];
    for (const [key, value] of fields) {
        form.append(key, value);
    }
    const res = await fetch('https://api.openai.com/v1/audio/transcriptions', {
        method: 'POST',
        headers: { Authorization: `Bearer ${OPENAI_KEY}` },
        body: form,
    });
    if (!res.ok) {
        const detail = (await res.text()).slice(0, 300);
        throw new Error(`whisper ${res.status}: ${detail}`);
    }
    return res.json();
}
45
/**
 * Ask gpt-4o-mini to clean up a raw Whisper transcript (punctuation,
 * capitalization, obvious mis-recognitions). Best-effort: when the key is
 * missing, the input is trivially short, the API responds non-2xx, or the
 * request throws, the raw text is returned unchanged.
 */
async function correctTranscript(rawText, language) {
    const shouldSkip = !OPENAI_KEY || !rawText || rawText.length < 20;
    if (shouldSkip) {
        return rawText;
    }
    const prompt = `Below is a raw transcript from Whisper speech-to-text (language: ${language || 'unknown'}). It may contain recognition errors: wrong words, missing punctuation, or garbled phrases.

Fix any obvious errors while keeping the meaning intact. Add proper punctuation and capitalization. If a word sounds wrong in context, replace it with the most likely correct word.

Return ONLY the corrected transcript text, nothing else.

Raw transcript:
${rawText}`;
    try {
        const res = await fetch('https://api.openai.com/v1/chat/completions', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
            body: JSON.stringify({
                model: 'gpt-4o-mini',
                max_tokens: 2000,
                messages: [{ role: 'user', content: prompt }],
            }),
        });
        if (!res.ok) {
            return rawText;
        }
        const data = await res.json();
        const corrected = data.choices?.[0]?.message?.content?.trim();
        return corrected || rawText;
    }
    catch {
        // Correction is a nice-to-have; never fail the pipeline over it.
        return rawText;
    }
}
77
/**
 * Describe one video frame (base64 JPEG) via gpt-4o-mini vision, passing the
 * concurrent voiceover line as context. Best-effort: returns '' when the key
 * is unset, the API responds non-2xx, or the request itself throws — so a
 * vision failure never aborts the caller's per-segment processing (previously
 * a network error escaped and caused the whole scene, thumbnail included, to
 * be dropped by analyzeVideo's catch).
 */
async function visionDescribe(frameB64, voiceoverText) {
    if (!OPENAI_KEY)
        return '';
    try {
        const res = await fetch('https://api.openai.com/v1/chat/completions', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${OPENAI_KEY}` },
            body: JSON.stringify({
                model: 'gpt-4o-mini',
                max_tokens: 150,
                messages: [{
                        role: 'user',
                        content: [
                            { type: 'text', text: `Mô tả ngắn gọn (1-2 câu tiếng Việt) khung hình này: nhân vật, bối cảnh, góc quay, tâm trạng. Voiceover đang nói: "${voiceoverText || '(không có)'}"` },
                            { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${frameB64}` } },
                        ],
                    }],
            }),
        });
        if (!res.ok)
            return '';
        const data = await res.json();
        return data.choices?.[0]?.message?.content?.trim() || '';
    }
    catch {
        // Mirror the !res.ok path: degrade to an empty description.
        return '';
    }
}
100
/**
 * Full analysis pipeline: download the video (yt-dlp), extract mono 16 kHz
 * audio (ffmpeg), transcribe with Whisper, AI-correct the transcript, then
 * for each transcript segment grab the midpoint frame and describe it with
 * GPT-4o vision. All scratch files live in a unique temp dir that is removed
 * in `finally`, success or failure.
 *
 * @param {string} url - Video URL understood by yt-dlp.
 * @returns {Promise<object>} AnalyzeVideoResult-shaped object (duration,
 *   language, transcript, segments, scenes).
 */
export async function analyzeVideo(url) {
    const tmpDir = path.join(os.tmpdir(), 'tuna-analyze-' + crypto.randomBytes(6).toString('hex'));
    await fs.mkdir(tmpDir, { recursive: true });
    const videoPath = path.join(tmpDir, 'video.mp4');
    const audioPath = path.join(tmpDir, 'audio.mp3');
    const framesDir = path.join(tmpDir, 'frames');
    await fs.mkdir(framesDir, { recursive: true });
    try {
        console.log('[analyze_video] Downloading:', url);
        await run(YT_DLP, ['-f', 'best[height<=720]/best', '-o', videoPath, '--no-playlist', '--quiet', url]);
        console.log('[analyze_video] Extracting audio');
        await run(FFMPEG, ['-y', '-i', videoPath, '-vn', '-ar', '16000', '-ac', '1', '-b:a', '64k', audioPath, '-loglevel', 'error']);
        console.log('[analyze_video] Probing duration');
        const durationSec = await (async () => {
            try {
                const r = await run(FFPROBE, ['-v', 'error', '-show_entries', 'format=duration', '-of', 'default=nw=1:nk=1', videoPath]);
                return parseFloat(r.out.trim()) || 0;
            }
            catch {
                // Duration is informational; don't fail the pipeline over it.
                return 0;
            }
        })();
        console.log('[analyze_video] Transcribing via Whisper');
        const rawTranscript = await whisperTranscribe(audioPath);
        console.log('[analyze_video] AI correcting transcript');
        const correctedText = await correctTranscript(rawTranscript.text, rawTranscript.language);
        // Only the full text is corrected; per-segment text stays raw so it
        // remains aligned with Whisper's segment timestamps.
        const transcript = { ...rawTranscript, text: correctedText };
        const segments = transcript.segments || [];
        console.log('[analyze_video] Extracting', segments.length, 'frames (1 per segment)');
        const scenes = [];
        for (let i = 0; i < segments.length; i++) {
            const seg = segments[i];
            const midpoint = (seg.start + seg.end) / 2;
            const framePath = path.join(framesDir, `seg-${String(i).padStart(3, '0')}.jpg`);
            try {
                await run(FFMPEG, ['-y', '-ss', String(midpoint), '-i', videoPath, '-vframes', '1', '-vf', 'scale=640:-1', '-q:v', '5', framePath, '-loglevel', 'error']);
                const buf = await fs.readFile(framePath);
                // Encode once; the base64 string is reused for both the vision
                // call and the returned thumbnail (previously encoded twice).
                const frameB64 = buf.toString('base64');
                const voiceover = seg.text?.trim() || '';
                const visual_description = await visionDescribe(frameB64, voiceover);
                scenes.push({
                    scene_number: i + 1,
                    timestamp_start: Math.round(seg.start * 10) / 10,
                    timestamp_end: Math.round(seg.end * 10) / 10,
                    thumbnail_base64: frameB64,
                    voiceover,
                    visual_description,
                });
            }
            catch (err) {
                // One bad frame shouldn't sink the whole analysis.
                const msg = err instanceof Error ? err.message : String(err);
                console.warn('[analyze_video] Frame extract failed for segment', i, msg);
            }
        }
        return {
            duration_sec: Math.round(durationSec),
            language: transcript.language || 'unknown',
            transcript: transcript.text || '',
            segments: segments.map((s) => ({ start: s.start, end: s.end, text: s.text })),
            scenes,
        };
    }
    finally {
        // Best-effort cleanup of video/audio/frames scratch space.
        try {
            await fs.rm(tmpDir, { recursive: true, force: true });
        }
        catch { /* ignore */ }
    }
}
168
/** Extension task handler — wraps analyzeVideo with WS response */
export async function handleAnalyzeVideo(ws, code, taskId, url) {
    console.log(`[analyze_video] Starting for ${url}`);
    try {
        const analysis = await analyzeVideo(url);
        // Success: forward the full analysis with the error flag cleared.
        ws.sendExtensionDone(code, taskId, { ...analysis, isError: false });
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        console.error(`[analyze_video] Error: ${message}`);
        // Failure: report the message over the socket instead of rethrowing.
        ws.sendExtensionDone(code, taskId, { error: message, isError: true });
    }
}
@@ -10,6 +10,7 @@ import { chatWithPM } from '../pm/planner.js';
10
10
  import { executePlanAndReport, simplifyMarkdown, waitForInput } from '../utils/execution-helpers.js';
11
11
  import { runClaude } from '../utils/claude-cli.js';
12
12
  import { handleGetHistory, handleRetryVideo, handleGenerateIdeas, handleGenerateScript, handleGenerateScene, handleGenerateScenes, handleRenderVideo, handleListCharacters, handleCreateCharacter, handleSaveCharacterSelection, handleSyncApps, handleClaudePrompt, handleClaudePromptStream, } from './extension-handlers.js';
13
+ import { handleAnalyzeVideo } from './analyze-video-handler.js';
13
14
  import { downloadAttachments, cleanupAttachments } from '../utils/image-download.js';
14
15
  import { scanSkills } from '../utils/skill-scanner.js';
15
16
  import { setupMcpConfig } from '../mcp/setup.js';
@@ -561,19 +562,7 @@ ${skillContent.slice(0, 15000)}`;
561
562
  }
562
563
  if (extTask === 'analyze_video' && msg.url) {
563
564
  (async () => {
564
- try {
565
- // Load custom handler from ~/.tuna-agent/ if available
566
- const handlerPath = path.join(os.homedir(), '.tuna-agent', 'analyze-video-handler.mjs');
567
- const { analyzeVideo } = await import(handlerPath);
568
- console.log(`[Daemon] analyze_video: using custom handler for ${msg.url}`);
569
- const result = await analyzeVideo(msg.url);
570
- ws.sendExtensionDone(extCode, extTaskId, { ...result, isError: false });
571
- }
572
- catch (err) {
573
- const errMsg = err instanceof Error ? err.message : String(err);
574
- console.error(`[Daemon] analyze_video error: ${errMsg}`);
575
- ws.sendExtensionDone(extCode, extTaskId, { error: errMsg, isError: true });
576
- }
565
+ await handleAnalyzeVideo(ws, extCode, extTaskId, msg.url);
577
566
  })();
578
567
  break;
579
568
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.119",
3
+ "version": "0.1.120",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"