zerg-ztc 0.1.7 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/dist/App.d.ts.map +1 -1
  2. package/dist/App.js +75 -8
  3. package/dist/App.js.map +1 -1
  4. package/dist/agent/agent.d.ts +2 -0
  5. package/dist/agent/agent.d.ts.map +1 -1
  6. package/dist/agent/agent.js +111 -10
  7. package/dist/agent/agent.js.map +1 -1
  8. package/dist/agent/backends/anthropic.d.ts.map +1 -1
  9. package/dist/agent/backends/anthropic.js +15 -3
  10. package/dist/agent/backends/anthropic.js.map +1 -1
  11. package/dist/agent/backends/gemini.d.ts.map +1 -1
  12. package/dist/agent/backends/gemini.js +12 -0
  13. package/dist/agent/backends/gemini.js.map +1 -1
  14. package/dist/agent/backends/index.d.ts +1 -1
  15. package/dist/agent/backends/index.d.ts.map +1 -1
  16. package/dist/agent/backends/openai_compatible.d.ts.map +1 -1
  17. package/dist/agent/backends/openai_compatible.js +12 -0
  18. package/dist/agent/backends/openai_compatible.js.map +1 -1
  19. package/dist/agent/backends/types.d.ts +21 -1
  20. package/dist/agent/backends/types.d.ts.map +1 -1
  21. package/dist/agent/commands/dictation.d.ts +3 -0
  22. package/dist/agent/commands/dictation.d.ts.map +1 -0
  23. package/dist/agent/commands/dictation.js +10 -0
  24. package/dist/agent/commands/dictation.js.map +1 -0
  25. package/dist/agent/commands/index.d.ts.map +1 -1
  26. package/dist/agent/commands/index.js +2 -1
  27. package/dist/agent/commands/index.js.map +1 -1
  28. package/dist/agent/commands/types.d.ts +7 -0
  29. package/dist/agent/commands/types.d.ts.map +1 -1
  30. package/dist/agent/runtime/capabilities.d.ts +2 -1
  31. package/dist/agent/runtime/capabilities.d.ts.map +1 -1
  32. package/dist/agent/runtime/capabilities.js +1 -0
  33. package/dist/agent/runtime/capabilities.js.map +1 -1
  34. package/dist/agent/tools/index.d.ts +1 -0
  35. package/dist/agent/tools/index.d.ts.map +1 -1
  36. package/dist/agent/tools/index.js +6 -1
  37. package/dist/agent/tools/index.js.map +1 -1
  38. package/dist/agent/tools/screenshot.d.ts +23 -0
  39. package/dist/agent/tools/screenshot.d.ts.map +1 -0
  40. package/dist/agent/tools/screenshot.js +735 -0
  41. package/dist/agent/tools/screenshot.js.map +1 -0
  42. package/dist/components/InputArea.d.ts +1 -0
  43. package/dist/components/InputArea.d.ts.map +1 -1
  44. package/dist/components/InputArea.js +591 -43
  45. package/dist/components/InputArea.js.map +1 -1
  46. package/dist/components/SingleMessage.d.ts.map +1 -1
  47. package/dist/components/SingleMessage.js +157 -7
  48. package/dist/components/SingleMessage.js.map +1 -1
  49. package/dist/config/types.d.ts +6 -0
  50. package/dist/config/types.d.ts.map +1 -1
  51. package/dist/ui/views/status_bar.js +2 -2
  52. package/dist/ui/views/status_bar.js.map +1 -1
  53. package/dist/utils/dictation.d.ts +46 -0
  54. package/dist/utils/dictation.d.ts.map +1 -0
  55. package/dist/utils/dictation.js +409 -0
  56. package/dist/utils/dictation.js.map +1 -0
  57. package/dist/utils/dictation_native.d.ts +51 -0
  58. package/dist/utils/dictation_native.d.ts.map +1 -0
  59. package/dist/utils/dictation_native.js +216 -0
  60. package/dist/utils/dictation_native.js.map +1 -0
  61. package/dist/utils/path_complete.d.ts.map +1 -1
  62. package/dist/utils/path_complete.js +31 -6
  63. package/dist/utils/path_complete.js.map +1 -1
  64. package/dist/utils/path_format.d.ts +20 -0
  65. package/dist/utils/path_format.d.ts.map +1 -0
  66. package/dist/utils/path_format.js +90 -0
  67. package/dist/utils/path_format.js.map +1 -0
  68. package/dist/utils/table.d.ts +38 -0
  69. package/dist/utils/table.d.ts.map +1 -0
  70. package/dist/utils/table.js +133 -0
  71. package/dist/utils/table.js.map +1 -0
  72. package/dist/utils/tool_trace.d.ts +7 -2
  73. package/dist/utils/tool_trace.d.ts.map +1 -1
  74. package/dist/utils/tool_trace.js +156 -51
  75. package/dist/utils/tool_trace.js.map +1 -1
  76. package/package.json +4 -1
  77. package/packages/ztc-dictation/Cargo.toml +43 -0
  78. package/packages/ztc-dictation/README.md +65 -0
  79. package/packages/ztc-dictation/bin/.gitkeep +0 -0
  80. package/packages/ztc-dictation/index.d.ts +16 -0
  81. package/packages/ztc-dictation/index.js +74 -0
  82. package/packages/ztc-dictation/package.json +41 -0
  83. package/packages/ztc-dictation/src/main.rs +430 -0
  84. package/src/App.tsx +110 -7
  85. package/src/agent/agent.ts +116 -11
  86. package/src/agent/backends/anthropic.ts +15 -5
  87. package/src/agent/backends/gemini.ts +12 -0
  88. package/src/agent/backends/index.ts +1 -0
  89. package/src/agent/backends/openai_compatible.ts +12 -0
  90. package/src/agent/backends/types.ts +25 -1
  91. package/src/agent/commands/dictation.ts +11 -0
  92. package/src/agent/commands/index.ts +2 -0
  93. package/src/agent/commands/types.ts +8 -0
  94. package/src/agent/runtime/capabilities.ts +2 -1
  95. package/src/agent/tools/index.ts +6 -1
  96. package/src/agent/tools/screenshot.ts +821 -0
  97. package/src/components/InputArea.tsx +606 -42
  98. package/src/components/SingleMessage.tsx +248 -9
  99. package/src/config/types.ts +7 -0
  100. package/src/ui/views/status_bar.ts +2 -2
  101. package/src/utils/dictation.ts +467 -0
  102. package/src/utils/dictation_native.ts +258 -0
  103. package/src/utils/path_complete.ts +30 -4
  104. package/src/utils/path_format.ts +99 -0
  105. package/src/utils/table.ts +171 -0
  106. package/src/utils/tool_trace.ts +184 -54
@@ -0,0 +1,821 @@
1
+ import { exec } from 'child_process';
2
+ import { promisify } from 'util';
3
+ import { readFile, unlink } from 'fs/promises';
4
+ import { tmpdir } from 'os';
5
+ import { join } from 'path';
6
+ import { Tool } from './types.js';
7
+ import { ToolCapability } from '../runtime/capabilities.js';
8
+
9
+ const execAsync = promisify(exec);
10
+
11
+ // --- Types ---
12
+
13
/**
 * Payload the screenshot tools serialize with JSON.stringify and return to
 * the caller.
 */
export interface ScreenshotResult {
  /** Constant tag identifying the payload as an image. */
  type: 'image';
  /** MIME type of the encoded bytes (the tools emit image/png or image/jpeg). */
  mediaType: string;
  /** The image file contents, base64-encoded. */
  data: string;
  /** Human-readable summary of what was captured, including the byte size. */
  description: string;
}
19
+
20
/**
 * One entry in a platform window listing.
 */
export interface WindowInfo {
  /**
   * Identifier handed to the capture commands. The macOS AppleScript
   * fallback cannot obtain a real ID and stores a list index instead.
   */
  windowId: number;
  /** Owning process ID, or 0 when it could not be determined. */
  pid: number;
  /** Owning application / process name. */
  appName: string;
  /** Window title; may be an empty string. */
  title: string;
  /** Window geometry in pixels, when the platform listing provides it. */
  bounds?: {
    x: number;
    y: number;
    width: number;
    height: number;
  };
}
27
+
28
+ // --- Helper Functions ---
29
+
30
/**
 * Lists on-screen macOS windows by running a throwaway Swift script that
 * calls CGWindowListCopyWindowInfo and prints the result as JSON.
 *
 * Windows on a negative layer or smaller than 50x50 are skipped. When the
 * Swift route fails, prints invalid JSON, or yields no windows, the
 * AppleScript-based getWindowListMacFallback() is used instead.
 *
 * @returns The window list; never rejects.
 */
async function getWindowListMac(): Promise<WindowInfo[]> {
  const swiftScript = `
import Foundation
import CoreGraphics

if let windowList = CGWindowListCopyWindowInfo([.optionOnScreenOnly, .excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] {
    var results: [[String: Any]] = []

    for window in windowList {
        guard let windowId = window[kCGWindowNumber as String] as? Int,
              let pid = window[kCGWindowOwnerPID as String] as? Int,
              let appName = window[kCGWindowOwnerName as String] as? String,
              windowId > 0 else { continue }

        let title = window[kCGWindowName as String] as? String ?? ""
        let layer = window[kCGWindowLayer as String] as? Int ?? 0

        if layer < 0 { continue }

        if let bounds = window[kCGWindowBounds as String] as? [String: Any],
           let width = bounds["Width"] as? Double,
           let height = bounds["Height"] as? Double {
            // Skip tiny windows (menu bar items, etc.) but keep reasonably sized ones
            if width < 50 || height < 50 { continue }

            // Report the real origin instead of a hard-coded (0, 0)
            let x = bounds["X"] as? Double ?? 0
            let y = bounds["Y"] as? Double ?? 0

            results.append([
                "windowId": windowId,
                "pid": pid,
                "appName": appName,
                "title": title,
                "bounds": ["x": Int(x), "y": Int(y), "width": Int(width), "height": Int(height)]
            ])
        }
    }

    if let data = try? JSONSerialization.data(withJSONObject: results, options: []),
       let json = String(data: data, encoding: .utf8) {
        print(json)
    }
}
`;

  try {
    // writeFile is not part of the file-level fs/promises import, so load it here.
    const { writeFile } = await import('fs/promises');
    // Include the PID so concurrent processes cannot collide on the same name.
    const scriptPath = join(tmpdir(), `ztc-windowlist-${process.pid}-${Date.now()}.swift`);

    try {
      await writeFile(scriptPath, swiftScript);
      const { stdout } = await execAsync(`swift "${scriptPath}"`, { timeout: 15000 });

      const parsed: unknown = JSON.parse(stdout.trim());
      if (Array.isArray(parsed) && parsed.length > 0) {
        // The script above emits objects shaped exactly like WindowInfo.
        return parsed as WindowInfo[];
      }
    } finally {
      // Single cleanup point, reached on success and on every failure path.
      await unlink(scriptPath).catch(() => {});
    }
  } catch {
    // Swift missing, timed out, or printed something unparsable: use the fallback.
  }

  return getWindowListMacFallback();
}
99
+
100
/**
 * Fallback macOS window listing that asks System Events (via osascript) for
 * every window of every non-background process.
 *
 * Each AppleScript list item has the form `app|||pid|||title`, and osascript
 * prints the items joined by ", ". A title that itself contains ", " is
 * therefore split into several fragments; fragments without the `|||`
 * separators are stitched back onto the preceding window's title instead of
 * being reported as bogus windows.
 *
 * NOTE(review): AppleScript exposes no CoreGraphics window number, so
 * `windowId` is only a list index here and is not a usable capture target.
 *
 * @returns The parsed windows, or an empty array when osascript fails.
 */
async function getWindowListMacFallback(): Promise<WindowInfo[]> {
  const script = `
    tell application "System Events"
      set windowList to {}
      repeat with proc in (every process whose background only is false)
        try
          set procName to name of proc
          set procPID to unix id of proc
          repeat with win in (every window of proc)
            try
              set winName to name of win
              set end of windowList to procName & "|||" & procPID & "|||" & winName
            end try
          end repeat
        end try
      end repeat
      return windowList
    end tell
  `;

  try {
    // Close the single-quoted shell string around every embedded quote.
    const { stdout } = await execAsync(`osascript -e '${script.replace(/'/g, "'\"'\"'")}'`, {
      timeout: 10000
    });

    const windows: WindowInfo[] = [];
    for (const fragment of stdout.trim().split(', ')) {
      const fields = fragment.split('|||');

      if (fields.length < 3) {
        // Continuation of a title that contained ", ".
        const previous = windows[windows.length - 1];
        if (previous) {
          previous.title += `, ${fragment}`;
        }
        continue;
      }

      const [appName, pid, ...titleParts] = fields;
      if (!appName) continue;

      windows.push({
        windowId: windows.length, // AppleScript doesn't give us real window IDs
        pid: Number.parseInt(pid ?? '', 10) || 0,
        appName,
        title: titleParts.join('|||')
      });
    }
    return windows;
  } catch {
    return [];
  }
}
140
+
141
/**
 * Lists X11 windows on Linux, preferring `wmctrl -l -p` and falling back to
 * `xdotool` when wmctrl is unavailable or fails.
 *
 * `appName` is a best guess: the text after the last " - " in the title, or
 * the whole title when there is no such separator.
 *
 * @returns The window list, or an empty array when neither tool works.
 */
async function getWindowListLinux(): Promise<WindowInfo[]> {
  // Derives an application name from a window title.
  const appNameFromTitle = (title: string): string => title.split(' - ').pop() || title;

  try {
    // Try wmctrl first. Columns: <hex id> <desktop> <pid> <host> <title...>
    const { stdout } = await execAsync('wmctrl -l -p', { timeout: 5000 });

    const windows: WindowInfo[] = [];
    for (const rawLine of stdout.split('\n')) {
      // Blank or malformed lines are skipped rather than turned into NaN entries.
      const match = /^(0x[0-9a-f]+)\s+(-?\d+)\s+(\d+)\s+(\S+)(?:\s(.*))?$/i.exec(rawLine.trim());
      if (!match) continue;

      const [, idHex = '', , pidText = '', , rawTitle = ''] = match;
      const title = rawTitle.trim();

      windows.push({
        windowId: Number.parseInt(idHex, 16),
        pid: Number.parseInt(pidText, 10) || 0,
        appName: appNameFromTitle(title),
        title
      });
    }
    return windows;
  } catch {
    // Try xdotool as fallback
    try {
      const { stdout } = await execAsync('xdotool search --onlyvisible --name ""', { timeout: 5000 });

      // Only decimal ids are accepted because they are interpolated into shell commands.
      const windowIds = stdout
        .split('\n')
        .map(line => line.trim())
        .filter(id => /^\d+$/.test(id))
        .slice(0, 20); // Limit to 20 windows

      const windows: WindowInfo[] = [];
      for (const id of windowIds) {
        try {
          const [nameResult, pidResult] = await Promise.all([
            execAsync(`xdotool getwindowname ${id}`, { timeout: 1000 }),
            execAsync(`xdotool getwindowpid ${id}`, { timeout: 1000 })
          ]);
          const title = nameResult.stdout.trim();

          windows.push({
            windowId: Number.parseInt(id, 10),
            pid: Number.parseInt(pidResult.stdout.trim(), 10) || 0,
            appName: appNameFromTitle(title),
            title
          });
        } catch {
          // A window without a readable name or PID is left out of the list.
          continue;
        }
      }
      return windows;
    } catch {
      return [];
    }
  }
}
187
+
188
/**
 * Lists visible, titled top-level windows on Windows by compiling a small
 * C# helper inside PowerShell (EnumWindows / GetWindowText).
 *
 * The script is passed with -EncodedCommand (base64 of UTF-16LE). Flattening
 * it onto one line for -Command destroys the here-string that carries the C#
 * source, because a here-string needs its opening and closing markers on
 * their own lines.
 *
 * @returns One entry per window (windowId is the HWND), or an empty array
 *          when PowerShell fails.
 */
async function getWindowListWindows(): Promise<WindowInfo[]> {
  const psScript = [
    "Add-Type @'",
    'using System;',
    'using System.Runtime.InteropServices;',
    'using System.Collections.Generic;',
    'using System.Text;',
    'using System.Diagnostics;',
    '',
    'public class WindowHelper {',
    '    [DllImport("user32.dll")]',
    '    public static extern bool EnumWindows(EnumWindowsProc lpEnumFunc, IntPtr lParam);',
    '',
    // Unicode entry point so non-ANSI titles are not mangled.
    '    [DllImport("user32.dll", CharSet = CharSet.Unicode)]',
    '    public static extern int GetWindowText(IntPtr hWnd, StringBuilder lpString, int nMaxCount);',
    '',
    '    [DllImport("user32.dll")]',
    '    public static extern bool IsWindowVisible(IntPtr hWnd);',
    '',
    '    [DllImport("user32.dll")]',
    '    public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint lpdwProcessId);',
    '',
    '    public delegate bool EnumWindowsProc(IntPtr hWnd, IntPtr lParam);',
    '',
    '    public static List<string> GetWindows() {',
    '        var windows = new List<string>();',
    '        EnumWindows((hWnd, lParam) => {',
    '            if (IsWindowVisible(hWnd)) {',
    '                var sb = new StringBuilder(256);',
    '                GetWindowText(hWnd, sb, 256);',
    '                var title = sb.ToString();',
    '                if (!string.IsNullOrWhiteSpace(title)) {',
    '                    uint pid;',
    '                    GetWindowThreadProcessId(hWnd, out pid);',
    '                    try {',
    '                        var proc = Process.GetProcessById((int)pid);',
    '                        windows.Add(hWnd.ToInt64() + "|||" + pid + "|||" + proc.ProcessName + "|||" + title);',
    '                    } catch {}',
    '                }',
    '            }',
    '            return true;',
    '        }, IntPtr.Zero);',
    '        return windows;',
    '    }',
    '}',
    "'@",
    '[WindowHelper]::GetWindows() | ForEach-Object { $_ }'
  ].join('\n');

  try {
    const encoded = Buffer.from(psScript, 'utf16le').toString('base64');
    const { stdout } = await execAsync(
      `powershell -NoProfile -NonInteractive -EncodedCommand ${encoded}`,
      { timeout: 15000 }
    );

    const windows: WindowInfo[] = [];
    // PowerShell emits CRLF line endings; splitting on \n alone leaves \r on titles.
    for (const line of stdout.split(/\r?\n/)) {
      if (!line.trim()) continue;

      // Anything after the third separator belongs to the title.
      const [windowId, pid, appName, ...titleParts] = line.split('|||');
      windows.push({
        windowId: Number.parseInt(windowId ?? '', 10) || 0,
        pid: Number.parseInt(pid ?? '', 10) || 0,
        appName: appName || 'Unknown',
        title: titleParts.join('|||')
      });
    }
    return windows;
  } catch {
    return [];
  }
}
255
+
256
/**
 * Returns the first listed window owned by the given process.
 *
 * @param pid Process ID to look for.
 * @returns The matching window, or null when none is listed (including on
 *          platforms other than darwin, linux and win32).
 */
async function findWindowByPid(pid: number): Promise<WindowInfo | null> {
  let candidates: WindowInfo[];

  switch (process.platform) {
    case 'darwin':
      candidates = await getWindowListMac();
      break;
    case 'linux':
      candidates = await getWindowListLinux();
      break;
    case 'win32':
      candidates = await getWindowListWindows();
      break;
    default:
      candidates = [];
  }

  for (const entry of candidates) {
    if (entry.pid === pid) {
      return entry;
    }
  }
  return null;
}
270
+
271
/**
 * Returns the first listed window whose app name or title contains the
 * given text, compared case-insensitively.
 *
 * @param appName Text to search for.
 * @returns The matching window, or null when none is listed (including on
 *          platforms other than darwin, linux and win32).
 */
async function findWindowByApp(appName: string): Promise<WindowInfo | null> {
  let candidates: WindowInfo[];

  switch (process.platform) {
    case 'darwin':
      candidates = await getWindowListMac();
      break;
    case 'linux':
      candidates = await getWindowListLinux();
      break;
    case 'win32':
      candidates = await getWindowListWindows();
      break;
    default:
      candidates = [];
  }

  const needle = appName.toLowerCase();
  for (const entry of candidates) {
    if (entry.appName.toLowerCase().includes(needle)) {
      return entry;
    }
    if (entry.title.toLowerCase().includes(needle)) {
      return entry;
    }
  }
  return null;
}
289
+
290
/**
 * Probes for macOS screen-recording permission by attempting a 1x1 pixel
 * screencapture into a temp file.
 *
 * @returns true on non-macOS platforms or when the probe command succeeds;
 *          false when the probe command fails.
 */
async function checkScreenRecordingPermission(): Promise<boolean> {
  if (process.platform === 'darwin') {
    try {
      // A failing capture of this tiny region is treated as "no permission".
      const probePath = join(tmpdir(), `ztc-perm-test-${Date.now()}.png`);
      await execAsync(`screencapture -x -R0,0,1,1 "${probePath}"`, { timeout: 5000 });
      await unlink(probePath).catch(() => {});
    } catch {
      return false;
    }
  }

  return true;
}
304
+
305
// Size ceiling for returned images (4MB, leaving room for base64 overhead).
const MAX_IMAGE_SIZE = 4 * 1024 * 1024; // 4MB

/**
 * Reads an image and, when it is larger than MAX_IMAGE_SIZE, tries to shrink
 * it into a JPEG with the platform's own tooling (`sips` on macOS,
 * ImageMagick `convert` on Linux).
 *
 * On macOS up to three increasingly aggressive passes are made; the last
 * result is returned even if it is still over the limit. On other platforms,
 * or when any conversion step fails, the original bytes are returned
 * unchanged, so the caller must inspect the data to learn the final format.
 *
 * @param imagePath Path of the image to read.
 * @returns The original bytes, or the bytes of the compressed JPEG.
 * @throws If the source image itself cannot be read.
 */
async function compressImageIfNeeded(imagePath: string): Promise<Buffer> {
  const imageBuffer = await readFile(imagePath);

  // If under the limit, return as-is
  if (imageBuffer.length <= MAX_IMAGE_SIZE) {
    return imageBuffer;
  }

  const platform = process.platform;
  if (platform !== 'darwin' && platform !== 'linux') {
    // No compression available, return original
    return imageBuffer;
  }

  // Only strip a trailing extension. A plain replace('.png', ...) could hit a
  // directory name, or leave the path unchanged and overwrite the source.
  const compressedPath = `${imagePath.replace(/\.png$/i, '')}-compressed.jpg`;

  // Runs one conversion command and returns the file it produced.
  const runConversion = async (command: string): Promise<Buffer> => {
    await execAsync(command, { timeout: 30000 });
    return readFile(compressedPath);
  };

  try {
    if (platform === 'darwin') {
      // Scale the target dimension by the square root of the size ratio,
      // with a 0.7 safety factor, never going below 800px.
      const ratio = Math.sqrt(MAX_IMAGE_SIZE / imageBuffer.length) * 0.7;
      const maxDimension = Math.max(800, Math.floor(2000 * ratio));

      // Later passes never use a larger dimension than the first one did.
      const passes = [
        { quality: 70, dimension: maxDimension },
        { quality: 50, dimension: Math.min(maxDimension, 1200) },
        { quality: 40, dimension: 800 }
      ];

      let latest = imageBuffer;
      for (const { quality, dimension } of passes) {
        // sips -Z takes a pixel value (longest side), not a percentage.
        latest = await runConversion(
          `sips -s format jpeg -s formatOptions ${quality} -Z ${dimension} "${imagePath}" --out "${compressedPath}"`
        );
        if (latest.length <= MAX_IMAGE_SIZE) {
          break;
        }
      }
      return latest;
    }

    // Linux: single ImageMagick pass, scaled to between 20% and 90%.
    const ratio = Math.sqrt(MAX_IMAGE_SIZE / imageBuffer.length) * 0.8;
    const scalePercent = Math.max(20, Math.min(90, Math.floor(ratio * 100)));
    return await runConversion(
      `convert "${imagePath}" -resize ${scalePercent}% -quality 70 "${compressedPath}"`
    );
  } catch {
    // If compression fails, return original (API will error if too large)
    return imageBuffer;
  } finally {
    // Remove the intermediate file on success and on failure alike.
    await unlink(compressedPath).catch(() => {});
  }
}
392
+
393
/**
 * Captures a single window into an image file.
 *
 * - macOS: `screencapture -l <id>`; a "could not create image from window"
 *   failure is turned into either a permission hint or a minimized/other
 *   Space hint.
 * - Linux: ImageMagick `import -window`, falling back to `xwd | convert`.
 * - Windows: a PowerShell script copies the window's screen rectangle.
 *
 * @param windowId Platform window identifier (must be a non-negative integer).
 * @param tempPath Destination file path.
 * @throws If windowId is invalid, the platform is unsupported, or the
 *         capture command fails.
 */
async function captureWindow(windowId: number, tempPath: string): Promise<void> {
  // The id is interpolated into shell commands, so refuse NaN, fractions and negatives.
  if (!Number.isInteger(windowId) || windowId < 0) {
    throw new Error(`Invalid window ID: ${windowId}`);
  }

  const platform = process.platform;

  if (platform === 'darwin') {
    // macOS: screencapture -l <windowID>
    try {
      await execAsync(`screencapture -x -l ${windowId} "${tempPath}"`, { timeout: 30000 });
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      if (message.includes('could not create image from window')) {
        // Check if it's a permission issue
        const hasPermission = await checkScreenRecordingPermission();
        if (!hasPermission) {
          throw new Error(
            'Screen Recording permission required. Go to System Settings > Privacy & Security > Screen Recording and enable your terminal app (iTerm, Terminal, etc.)'
          );
        }
        throw new Error(`Window capture failed for window ID ${windowId}. The window may be minimized or on a different Space.`);
      }
      throw err;
    }
  } else if (platform === 'linux') {
    // Linux: try import with window ID
    try {
      await execAsync(`import -window ${windowId} "${tempPath}"`, { timeout: 30000 });
    } catch {
      // Fallback to xwd + convert
      await execAsync(`xwd -id ${windowId} | convert xwd:- "${tempPath}"`, { timeout: 30000 });
    }
  } else if (platform === 'win32') {
    // Single-quoted PowerShell strings are literal; only ' itself needs doubling.
    const psPath = tempPath.replace(/'/g, "''");

    // The C# helper returns plain ints so it compiles without a reference to
    // the System.Drawing assembly.
    const psScript = [
      "$ErrorActionPreference = 'Stop'",
      'Add-Type -AssemblyName System.Drawing',
      "Add-Type @'",
      'using System;',
      'using System.Runtime.InteropServices;',
      '',
      'public class WindowCapture {',
      '    [DllImport("user32.dll")]',
      '    public static extern bool GetWindowRect(IntPtr hWnd, out RECT lpRect);',
      '',
      '    [StructLayout(LayoutKind.Sequential)]',
      '    public struct RECT {',
      '        public int Left, Top, Right, Bottom;',
      '    }',
      '',
      '    public static int[] GetBounds(IntPtr hWnd) {',
      '        RECT rect;',
      '        GetWindowRect(hWnd, out rect);',
      '        return new int[] { rect.Left, rect.Top, rect.Right - rect.Left, rect.Bottom - rect.Top };',
      '    }',
      '}',
      "'@",
      `$hwnd = [IntPtr]${windowId}`,
      '$rect = [WindowCapture]::GetBounds($hwnd)',
      "if ($rect[2] -le 0 -or $rect[3] -le 0) { throw 'Window has no visible area' }",
      '$bitmap = New-Object System.Drawing.Bitmap($rect[2], $rect[3])',
      '$graphics = [System.Drawing.Graphics]::FromImage($bitmap)',
      '$graphics.CopyFromScreen($rect[0], $rect[1], 0, 0, $bitmap.Size)',
      `$bitmap.Save('${psPath}', [System.Drawing.Imaging.ImageFormat]::Png)`,
      '$graphics.Dispose()',
      '$bitmap.Dispose()'
    ].join('\n');

    // -EncodedCommand keeps the newlines the here-string depends on and
    // avoids a layer of cmd.exe quoting.
    const encoded = Buffer.from(psScript, 'utf16le').toString('base64');
    await execAsync(`powershell -NoProfile -NonInteractive -EncodedCommand ${encoded}`, { timeout: 30000 });
  } else {
    throw new Error(`Window capture not supported on platform: ${platform}`);
  }
}
462
+
463
/**
 * Captures the whole screen into an image file.
 *
 * - macOS: `screencapture`, with the delay and display passed as -T / -D.
 * - Linux: sleeps for the delay, then tries gnome-screenshot, scrot and
 *   ImageMagick `import` in that order.
 * - Windows: sleeps for the delay, then copies the primary screen via
 *   PowerShell.
 *
 * `display` is only honoured on macOS; the Linux and Windows branches do not
 * use it.
 *
 * @param tempPath Destination file path.
 * @param display  Optional display number (positive integer).
 * @param delay    Optional delay in seconds before capturing.
 * @throws If an argument is invalid, the platform is unsupported, or no
 *         capture command succeeds.
 */
async function captureFullScreen(tempPath: string, display?: number, delay?: number): Promise<void> {
  // Both values end up inside shell commands or timers; reject NaN and friends early.
  if (display !== undefined && (!Number.isInteger(display) || display < 1)) {
    throw new Error(`Invalid display number: ${display}`);
  }
  if (delay !== undefined && !Number.isFinite(delay)) {
    throw new Error(`Invalid delay: ${delay}`);
  }

  const platform = process.platform;
  const delaySeconds = delay !== undefined && delay > 0 ? delay : 0;

  // Waits out the requested delay on platforms whose tools have no delay flag.
  const waitForDelay = async (): Promise<void> => {
    if (delaySeconds > 0) {
      await new Promise<void>(resolve => setTimeout(resolve, delaySeconds * 1000));
    }
  };

  if (platform === 'darwin') {
    let cmd = 'screencapture -x';
    if (delaySeconds > 0) {
      cmd += ` -T${delaySeconds}`;
    }
    if (display !== undefined) {
      cmd += ` -D${display}`;
    }
    cmd += ` "${tempPath}"`;
    // Extend the timeout by the delay so the wait itself cannot trip it.
    await execAsync(cmd, { timeout: 30000 + delaySeconds * 1000 });
  } else if (platform === 'linux') {
    await waitForDelay();

    const commands = [
      `gnome-screenshot -f "${tempPath}"`,
      `scrot "${tempPath}"`,
      `import -window root "${tempPath}"`
    ];

    let success = false;
    for (const cmd of commands) {
      try {
        await execAsync(cmd, { timeout: 30000 });
        success = true;
        break;
      } catch {
        // Tool missing or failed; try the next one.
        continue;
      }
    }

    if (!success) {
      throw new Error('No screenshot tool available. Install gnome-screenshot, scrot, or ImageMagick.');
    }
  } else if (platform === 'win32') {
    await waitForDelay();

    // Single-quoted PowerShell strings are literal; only ' itself needs doubling.
    const psPath = tempPath.replace(/'/g, "''");
    const psScript = [
      "$ErrorActionPreference = 'Stop'",
      'Add-Type -AssemblyName System.Windows.Forms',
      'Add-Type -AssemblyName System.Drawing',
      '$screen = [System.Windows.Forms.Screen]::PrimaryScreen',
      '$bitmap = New-Object System.Drawing.Bitmap($screen.Bounds.Width, $screen.Bounds.Height)',
      '$graphics = [System.Drawing.Graphics]::FromImage($bitmap)',
      '$graphics.CopyFromScreen($screen.Bounds.Location, [System.Drawing.Point]::Empty, $screen.Bounds.Size)',
      `$bitmap.Save('${psPath}', [System.Drawing.Imaging.ImageFormat]::Png)`,
      '$graphics.Dispose()',
      '$bitmap.Dispose()'
    ].join('\n');

    // -EncodedCommand sidesteps cmd.exe quoting of the script text.
    const encoded = Buffer.from(psScript, 'utf16le').toString('base64');
    await execAsync(`powershell -NoProfile -NonInteractive -EncodedCommand ${encoded}`, { timeout: 30000 });
  } else {
    throw new Error(`Screenshot not supported on platform: ${platform}`);
  }
}
521
+
522
+ // --- Screenshot Tool ---
523
+
524
/**
 * Tool that captures the full screen or one window and returns the image as
 * a JSON-serialized ScreenshotResult.
 *
 * Target selection priority: `windowId`, then `pid`, then `app`, otherwise
 * the full screen. `delay` is applied before the capture in every mode.
 * Captures larger than the size limit go through compressImageIfNeeded, and
 * the reported media type is read from the resulting bytes.
 *
 * Every failure is rethrown as an Error prefixed with "Screenshot failed: ".
 */
export const screenshotTool: Tool = {
  capabilities: [ToolCapability.SCREEN_CAPTURE],
  definition: {
    name: 'screenshot',
    description: 'Capture a screenshot. Can capture the full screen, a specific window by ID, by PID, or by app name.',
    parameters: {
      type: 'object',
      properties: {
        windowId: {
          type: 'number',
          description: 'Specific window ID to capture (from list_windows)'
        },
        pid: {
          type: 'number',
          description: 'Process ID - captures the first window belonging to this process'
        },
        app: {
          type: 'string',
          description: 'App name to capture (partial match, e.g., "Safari", "Chrome", "Terminal")'
        },
        display: {
          type: 'number',
          description: 'Display number for full-screen capture (default: main display)'
        },
        delay: {
          type: 'number',
          description: 'Delay in seconds before capture (default: 0)'
        }
      },
      required: []
    }
  },
  execute: async (args) => {
    // Converts an optional numeric argument, rejecting values that are not numbers.
    const optionalNumber = (value: unknown, name: string): number | undefined => {
      if (value === undefined || value === null) return undefined;
      const parsed = Number(value);
      if (!Number.isFinite(parsed)) {
        throw new Error(`${name} must be a number`);
      }
      return parsed;
    };

    const tempPath = join(tmpdir(), `ztc-screenshot-${Date.now()}.png`);

    try {
      const windowId = optionalNumber(args.windowId, 'windowId');
      const pid = optionalNumber(args.pid, 'pid');
      const display = optionalNumber(args.display, 'display');
      const delay = optionalNumber(args.delay, 'delay') ?? 0;
      // A blank app name would match every window, so treat it as not provided.
      const appText = args.app === undefined || args.app === null ? '' : String(args.app);
      const app = appText.trim() !== '' ? appText : undefined;

      // Honour the delay for window captures too, not only full-screen ones.
      if (delay > 0) {
        await new Promise<void>(resolve => setTimeout(resolve, delay * 1000));
      }

      let description = 'Screenshot captured';

      if (windowId !== undefined) {
        // Capture specific window by ID
        await captureWindow(windowId, tempPath);
        description = `Window ${windowId} captured`;
      } else if (pid !== undefined) {
        // Find window by PID and capture it
        const window = await findWindowByPid(pid);
        if (!window) {
          throw new Error(`No visible window found for PID ${pid}`);
        }
        await captureWindow(window.windowId, tempPath);
        description = `Window captured: ${window.appName} - ${window.title} (PID: ${pid})`;
      } else if (app !== undefined) {
        // Find window by app name and capture it
        const window = await findWindowByApp(app);
        if (!window) {
          throw new Error(`No visible window found for app "${app}"`);
        }
        await captureWindow(window.windowId, tempPath);
        description = `Window captured: ${window.appName} - ${window.title}`;
      } else {
        // Full screen capture (the delay has already elapsed above)
        await captureFullScreen(tempPath, display);
        description = 'Full screen captured';
      }

      // Read and compress the image if needed
      const originalSize = (await readFile(tempPath)).length;
      const imageBuffer = await compressImageIfNeeded(tempPath);

      // The capture is written as PNG, so a JPEG signature (FF D8 FF) means
      // compression converted it. Checking bytes is reliable even when the
      // compressed file is not smaller than the original.
      const isJpeg =
        imageBuffer.length > 2 &&
        imageBuffer[0] === 0xff &&
        imageBuffer[1] === 0xd8 &&
        imageBuffer[2] === 0xff;

      const sizeInfo = isJpeg
        ? `${imageBuffer.length} bytes, compressed from ${originalSize} bytes`
        : `${imageBuffer.length} bytes`;

      const result: ScreenshotResult = {
        type: 'image',
        mediaType: isJpeg ? 'image/jpeg' : 'image/png',
        data: imageBuffer.toString('base64'),
        description: `${description} (${sizeInfo})`
      };

      return JSON.stringify(result);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      throw new Error(`Screenshot failed: ${message}`);
    } finally {
      // Clean up the temp file on every path; a missing file is not an error.
      await unlink(tempPath).catch(() => {});
    }
  }
};
637
+
638
+ // --- List Windows Tool ---
639
+
640
/**
 * Tool that reports the visible windows as a bulleted text list, optionally
 * narrowed by a case-insensitive match on app name or title.
 *
 * Throws on platforms other than darwin, linux and win32.
 */
export const listWindowsTool: Tool = {
  capabilities: [ToolCapability.SCREEN_CAPTURE],
  definition: {
    name: 'list_windows',
    description: 'List all visible windows with their IDs, PIDs, app names, and titles. Use this to find window IDs for targeted screenshots.',
    parameters: {
      type: 'object',
      properties: {
        filter: {
          type: 'string',
          description: 'Optional filter to match app name or title (case-insensitive)'
        }
      },
      required: []
    }
  },
  execute: async (args) => {
    const needle = args.filter ? String(args.filter).toLowerCase() : undefined;
    const os = process.platform;

    let found: WindowInfo[];
    switch (os) {
      case 'darwin':
        found = await getWindowListMac();
        break;
      case 'linux':
        found = await getWindowListLinux();
        break;
      case 'win32':
        found = await getWindowListWindows();
        break;
      default:
        throw new Error(`Window listing not supported on platform: ${os}`);
    }

    // Keep only entries whose app name or title contains the filter text.
    const shown = needle
      ? found.filter(entry =>
          [entry.appName, entry.title].some(text => text.toLowerCase().includes(needle))
        )
      : found;

    if (shown.length === 0) {
      if (needle) {
        return `No windows found matching "${needle}"`;
      }
      return 'No visible windows found';
    }

    const rows: string[] = [];
    for (const entry of shown) {
      let row = `• Window ${entry.windowId} (PID ${entry.pid}): ${entry.appName}`;
      if (entry.title) {
        row += ` - "${entry.title}"`;
      }
      if (entry.bounds) {
        row += ` [${entry.bounds.width}x${entry.bounds.height}]`;
      }
      rows.push(row);
    }

    return `Found ${shown.length} window(s):\n${rows.join('\n')}`;
  }
};
695
+
696
+ // --- Run and Monitor Tool ---
697
+
698
/**
 * Tool ("run_and_capture") that launches a command, waits, then captures the
 * window of the launched application.
 *
 * The command is started detached and is never waited on or timed out, so a
 * long-running GUI application is not killed by this tool. The target window
 * is chosen from, in order: the explicit `app` argument, the name following
 * `-a` in the command, and finally the command's last word. When no window
 * matches, a text listing of up to ten windows is returned instead of an
 * image.
 */
export const runAndMonitorTool: Tool = {
  capabilities: [ToolCapability.SCREEN_CAPTURE, ToolCapability.SHELL_EXEC],
  definition: {
    name: 'run_and_capture',
    description: 'Launch an application and capture its window after it opens. Useful for running a command and seeing its visual output.',
    parameters: {
      type: 'object',
      properties: {
        command: {
          type: 'string',
          description: 'Command to run (e.g., "open -a Safari https://example.com" on macOS, "firefox https://example.com" on Linux)'
        },
        waitMs: {
          type: 'number',
          description: 'Milliseconds to wait for app to open before capturing (default: 2000)'
        },
        app: {
          type: 'string',
          description: 'App name to capture after launch (if different from command). Will search for window by this name.'
        }
      },
      required: ['command']
    }
  },
  execute: async (args) => {
    const command = String(args.command);
    const requestedWait = args.waitMs !== undefined ? Number(args.waitMs) : 2000;
    // A non-numeric or negative wait falls back to the documented default.
    const waitMs = Number.isFinite(requestedWait) && requestedWait >= 0 ? requestedWait : 2000;
    const appName = args.app ? String(args.app) : undefined;

    // Launch the application without tying its lifetime to this call. exec()
    // with a timeout would send a kill signal to the app once the timeout
    // elapsed.
    try {
      const { spawn } = await import('child_process');
      const child = spawn(command, {
        shell: true,
        detached: true,
        stdio: 'ignore',
        windowsHide: true
      });
      // Launch problems are non-fatal: the window search below reports the outcome.
      child.on('error', () => {});
      child.unref();
    } catch {
      // Ignore - the window search below decides whether the launch worked
    }

    // Wait for app to open
    await new Promise<void>(resolve => setTimeout(resolve, waitMs));

    // Try to find and capture the window
    const platform = process.platform;
    let windows: WindowInfo[] = [];

    if (platform === 'darwin') {
      windows = await getWindowListMac();
    } else if (platform === 'linux') {
      windows = await getWindowListLinux();
    } else if (platform === 'win32') {
      windows = await getWindowListWindows();
    }

    // Finds the first window whose app name (and optionally title) contains the text.
    const findMatch = (text: string, includeTitle: boolean): WindowInfo | undefined => {
      const needle = text.toLowerCase();
      if (!needle) return undefined; // an empty needle would match everything
      return windows.find(w =>
        w.appName.toLowerCase().includes(needle) ||
        (includeTitle && w.title.toLowerCase().includes(needle))
      );
    };

    // Find the app window
    let targetWindow: WindowInfo | undefined;

    if (appName) {
      targetWindow = findMatch(appName, true);
    } else {
      // `-a Name`, `-a "Quoted Name"` or `-a 'Quoted Name'`
      const dashA = /(?:^|\s)-a\s+(?:"([^"]+)"|'([^']+)'|(\S+))/.exec(command);
      const appFromCmd = dashA ? dashA[1] ?? dashA[2] ?? dashA[3] ?? '' : '';
      if (appFromCmd) {
        targetWindow = findMatch(appFromCmd, false);
      }

      if (!targetWindow) {
        // Try matching last part of command, ignoring surrounding quotes
        const cmdParts = command.trim().split(/\s+/);
        const lastPart = (cmdParts[cmdParts.length - 1] ?? '').replace(/^["']+|["']+$/g, '');
        targetWindow = findMatch(lastPart, true);
      }
    }

    if (!targetWindow) {
      // If we can't find the specific window, return list of windows
      const windowList = windows.slice(0, 10).map(w =>
        `• ${w.appName}${w.title ? ` - "${w.title}"` : ''} (Window ${w.windowId})`
      ).join('\n');

      return `App launched but window not found. Recent windows:\n${windowList}\n\nUse screenshot with a specific windowId or app name.`;
    }

    // Capture the window
    const tempPath = join(tmpdir(), `ztc-screenshot-${Date.now()}.png`);

    try {
      await captureWindow(targetWindow.windowId, tempPath);

      // Apply the same size limit as the screenshot tool.
      const imageBuffer = await compressImageIfNeeded(tempPath);

      // The capture is written as PNG; a JPEG signature means it was converted.
      const isJpeg =
        imageBuffer.length > 2 &&
        imageBuffer[0] === 0xff &&
        imageBuffer[1] === 0xd8 &&
        imageBuffer[2] === 0xff;

      const result: ScreenshotResult = {
        type: 'image',
        mediaType: isJpeg ? 'image/jpeg' : 'image/png',
        data: imageBuffer.toString('base64'),
        description: `Captured ${targetWindow.appName}${targetWindow.title ? ` - "${targetWindow.title}"` : ''} (${imageBuffer.length} bytes)`
      };

      return JSON.stringify(result);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      throw new Error(`Failed to capture window: ${message}`);
    } finally {
      // Remove the temp file on every path; a missing file is not an error.
      await unlink(tempPath).catch(() => {});
    }
  }
};