zerg-ztc 0.1.7 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/dist/App.d.ts.map +1 -1
  2. package/dist/App.js +75 -8
  3. package/dist/App.js.map +1 -1
  4. package/dist/agent/agent.d.ts +2 -0
  5. package/dist/agent/agent.d.ts.map +1 -1
  6. package/dist/agent/agent.js +111 -10
  7. package/dist/agent/agent.js.map +1 -1
  8. package/dist/agent/backends/anthropic.d.ts.map +1 -1
  9. package/dist/agent/backends/anthropic.js +15 -3
  10. package/dist/agent/backends/anthropic.js.map +1 -1
  11. package/dist/agent/backends/gemini.d.ts.map +1 -1
  12. package/dist/agent/backends/gemini.js +12 -0
  13. package/dist/agent/backends/gemini.js.map +1 -1
  14. package/dist/agent/backends/index.d.ts +1 -1
  15. package/dist/agent/backends/index.d.ts.map +1 -1
  16. package/dist/agent/backends/openai_compatible.d.ts.map +1 -1
  17. package/dist/agent/backends/openai_compatible.js +12 -0
  18. package/dist/agent/backends/openai_compatible.js.map +1 -1
  19. package/dist/agent/backends/types.d.ts +21 -1
  20. package/dist/agent/backends/types.d.ts.map +1 -1
  21. package/dist/agent/commands/dictation.d.ts +3 -0
  22. package/dist/agent/commands/dictation.d.ts.map +1 -0
  23. package/dist/agent/commands/dictation.js +10 -0
  24. package/dist/agent/commands/dictation.js.map +1 -0
  25. package/dist/agent/commands/index.d.ts.map +1 -1
  26. package/dist/agent/commands/index.js +2 -1
  27. package/dist/agent/commands/index.js.map +1 -1
  28. package/dist/agent/commands/types.d.ts +7 -0
  29. package/dist/agent/commands/types.d.ts.map +1 -1
  30. package/dist/agent/runtime/capabilities.d.ts +2 -1
  31. package/dist/agent/runtime/capabilities.d.ts.map +1 -1
  32. package/dist/agent/runtime/capabilities.js +1 -0
  33. package/dist/agent/runtime/capabilities.js.map +1 -1
  34. package/dist/agent/tools/index.d.ts +1 -0
  35. package/dist/agent/tools/index.d.ts.map +1 -1
  36. package/dist/agent/tools/index.js +6 -1
  37. package/dist/agent/tools/index.js.map +1 -1
  38. package/dist/agent/tools/screenshot.d.ts +23 -0
  39. package/dist/agent/tools/screenshot.d.ts.map +1 -0
  40. package/dist/agent/tools/screenshot.js +735 -0
  41. package/dist/agent/tools/screenshot.js.map +1 -0
  42. package/dist/components/InputArea.d.ts +1 -0
  43. package/dist/components/InputArea.d.ts.map +1 -1
  44. package/dist/components/InputArea.js +591 -43
  45. package/dist/components/InputArea.js.map +1 -1
  46. package/dist/components/SingleMessage.d.ts.map +1 -1
  47. package/dist/components/SingleMessage.js +157 -7
  48. package/dist/components/SingleMessage.js.map +1 -1
  49. package/dist/config/types.d.ts +6 -0
  50. package/dist/config/types.d.ts.map +1 -1
  51. package/dist/ui/views/status_bar.js +2 -2
  52. package/dist/ui/views/status_bar.js.map +1 -1
  53. package/dist/utils/dictation.d.ts +46 -0
  54. package/dist/utils/dictation.d.ts.map +1 -0
  55. package/dist/utils/dictation.js +409 -0
  56. package/dist/utils/dictation.js.map +1 -0
  57. package/dist/utils/dictation_native.d.ts +51 -0
  58. package/dist/utils/dictation_native.d.ts.map +1 -0
  59. package/dist/utils/dictation_native.js +216 -0
  60. package/dist/utils/dictation_native.js.map +1 -0
  61. package/dist/utils/path_complete.d.ts.map +1 -1
  62. package/dist/utils/path_complete.js +31 -6
  63. package/dist/utils/path_complete.js.map +1 -1
  64. package/dist/utils/path_format.d.ts +20 -0
  65. package/dist/utils/path_format.d.ts.map +1 -0
  66. package/dist/utils/path_format.js +90 -0
  67. package/dist/utils/path_format.js.map +1 -0
  68. package/dist/utils/table.d.ts +38 -0
  69. package/dist/utils/table.d.ts.map +1 -0
  70. package/dist/utils/table.js +133 -0
  71. package/dist/utils/table.js.map +1 -0
  72. package/dist/utils/tool_trace.d.ts +7 -2
  73. package/dist/utils/tool_trace.d.ts.map +1 -1
  74. package/dist/utils/tool_trace.js +156 -51
  75. package/dist/utils/tool_trace.js.map +1 -1
  76. package/package.json +4 -1
  77. package/packages/ztc-dictation/Cargo.toml +43 -0
  78. package/packages/ztc-dictation/README.md +65 -0
  79. package/packages/ztc-dictation/bin/.gitkeep +0 -0
  80. package/packages/ztc-dictation/index.d.ts +16 -0
  81. package/packages/ztc-dictation/index.js +74 -0
  82. package/packages/ztc-dictation/package.json +41 -0
  83. package/packages/ztc-dictation/src/main.rs +430 -0
  84. package/src/App.tsx +110 -7
  85. package/src/agent/agent.ts +116 -11
  86. package/src/agent/backends/anthropic.ts +15 -5
  87. package/src/agent/backends/gemini.ts +12 -0
  88. package/src/agent/backends/index.ts +1 -0
  89. package/src/agent/backends/openai_compatible.ts +12 -0
  90. package/src/agent/backends/types.ts +25 -1
  91. package/src/agent/commands/dictation.ts +11 -0
  92. package/src/agent/commands/index.ts +2 -0
  93. package/src/agent/commands/types.ts +8 -0
  94. package/src/agent/runtime/capabilities.ts +2 -1
  95. package/src/agent/tools/index.ts +6 -1
  96. package/src/agent/tools/screenshot.ts +821 -0
  97. package/src/components/InputArea.tsx +606 -42
  98. package/src/components/SingleMessage.tsx +248 -9
  99. package/src/config/types.ts +7 -0
  100. package/src/ui/views/status_bar.ts +2 -2
  101. package/src/utils/dictation.ts +467 -0
  102. package/src/utils/dictation_native.ts +258 -0
  103. package/src/utils/path_complete.ts +30 -4
  104. package/src/utils/path_format.ts +99 -0
  105. package/src/utils/table.ts +171 -0
  106. package/src/utils/tool_trace.ts +184 -54
@@ -4,7 +4,7 @@ import { extname } from 'path';
4
4
 
5
5
  // Local
6
6
  import { Message, ToolCall, AgentEvent } from '../types.js';
7
- import { AnthropicBackend, AgentBackend, BackendRequest, BackendResponse, ContentBlock, LlmMessage, RequestContentBlock, TokenUsage } from './backends/index.js';
7
+ import { AnthropicBackend, AgentBackend, BackendRequest, BackendResponse, ContentBlock, LlmMessage, RequestContentBlock, ToolResultBlock, TokenUsage } from './backends/index.js';
8
8
  import { AllowAllPolicy, Policy } from './runtime/policy.js';
9
9
  import { NoopTracer, Tracer } from './runtime/tracing.js';
10
10
  import { defaultTools, executeTool, getToolDefinitions, getTool } from './tools/index.js';
@@ -85,8 +85,11 @@ export class Agent {
85
85
 
86
86
  You have access to tools for:
87
87
  - Reading and writing files
88
- - Listing directory contents
88
+ - Listing directory contents
89
89
  - Running shell commands
90
+ - Taking screenshots (full screen or specific windows by app name, PID, or window ID)
91
+ - Listing open windows to find window IDs
92
+ - Launching apps and capturing their windows
90
93
  - Querying the Zerg system
91
94
 
92
95
  Be concise and helpful. When using tools, explain what you're doing briefly. If a task requires multiple steps, proceed through them systematically.
@@ -112,29 +115,131 @@ When a user intent maps to an available slash command, invoke the command direct
112
115
  .filter((m): m is Message & { role: 'user' | 'assistant' } => {
113
116
  // Only include user and assistant messages
114
117
  if (m.role !== 'user' && m.role !== 'assistant') return false;
115
- // Filter out assistant messages with empty content (from tool-only responses)
116
- // The API rejects empty content for non-final assistant messages
117
- if (m.role === 'assistant' && (!m.content || m.content.trim() === '')) return false;
118
+ // Filter out assistant messages with empty content AND no tool calls
119
+ if (m.role === 'assistant' && (!m.content || m.content.trim() === '') && !m.toolCalls?.length) return false;
118
120
  return true;
119
121
  })
120
- .map(m => ({
121
- role: m.role,
122
- content: m.role === 'user' ? this.buildContentBlocks(m.content) : m.content
123
- }));
122
+ .map(m => {
123
+ if (m.role === 'user') {
124
+ return {
125
+ role: m.role,
126
+ content: this.buildContentBlocks(m.content)
127
+ };
128
+ }
129
+
130
+ // Assistant message - may need to include tool_use blocks
131
+ if (m.toolCalls && m.toolCalls.length > 0) {
132
+ // Build content array with text and tool_use blocks
133
+ const contentBlocks: Array<{ type: 'text'; text: string } | { type: 'tool_use'; id: string; name: string; input: Record<string, unknown> }> = [];
134
+
135
+ // Add text content if present
136
+ if (m.content && m.content.trim() && m.content !== '[Using tools...]') {
137
+ contentBlocks.push({ type: 'text', text: m.content });
138
+ }
139
+
140
+ // Add tool_use blocks
141
+ for (const tc of m.toolCalls) {
142
+ contentBlocks.push({
143
+ type: 'tool_use',
144
+ id: tc.id,
145
+ name: tc.name,
146
+ input: tc.args
147
+ });
148
+ }
149
+
150
+ return {
151
+ role: m.role,
152
+ content: contentBlocks as unknown as RequestContentBlock[]
153
+ };
154
+ }
155
+
156
+ // Plain text assistant message
157
+ return {
158
+ role: m.role,
159
+ content: m.content
160
+ };
161
+ });
124
162
  }
125
163
 
126
164
  private contentLength(content: string | RequestContentBlock[]): number {
127
165
  if (typeof content === 'string') return content.length;
128
166
  return content.reduce((sum, block) => {
129
167
  if (block.type === 'text') return sum + block.text.length;
130
- return sum + block.data.length;
168
+ if (block.type === 'image') return sum + block.data.length;
169
+ if (block.type === 'tool_result') {
170
+ // Estimate tool result content length
171
+ if (typeof block.content === 'string') return sum + block.content.length;
172
+ return sum + block.content.reduce((s, b) => {
173
+ if (b.type === 'text') return s + b.text.length;
174
+ if (b.type === 'image') return s + b.source.data.length;
175
+ return s;
176
+ }, 0);
177
+ }
178
+ return sum;
131
179
  }, 0);
132
180
  }
133
181
 
182
+ // Parse tool result string to check for image data
183
+ private parseToolResultForImages(result: string): { hasImage: boolean; imageData?: { mediaType: string; data: string }; text: string } {
184
+ try {
185
+ const parsed = JSON.parse(result);
186
+ if (parsed && parsed.type === 'image' && parsed.data && parsed.mediaType) {
187
+ return {
188
+ hasImage: true,
189
+ imageData: { mediaType: parsed.mediaType, data: parsed.data },
190
+ text: parsed.description || 'Screenshot captured'
191
+ };
192
+ }
193
+ } catch {
194
+ // Not JSON or not an image result
195
+ }
196
+ return { hasImage: false, text: result };
197
+ }
198
+
199
+ // Build tool result content blocks for the API
200
+ private buildToolResultBlocks(toolResults: Array<{ tool_use_id: string; content: string }>): ToolResultBlock[] {
201
+ return toolResults.map(result => {
202
+ const parsed = this.parseToolResultForImages(result.content);
203
+
204
+ if (parsed.hasImage && parsed.imageData) {
205
+ // Include both text and image in tool result
206
+ return {
207
+ type: 'tool_result' as const,
208
+ tool_use_id: result.tool_use_id,
209
+ content: [
210
+ { type: 'text' as const, text: parsed.text },
211
+ {
212
+ type: 'image' as const,
213
+ source: {
214
+ type: 'base64' as const,
215
+ media_type: parsed.imageData.mediaType,
216
+ data: parsed.imageData.data
217
+ }
218
+ }
219
+ ]
220
+ };
221
+ }
222
+
223
+ // Plain text result
224
+ return {
225
+ type: 'tool_result' as const,
226
+ tool_use_id: result.tool_use_id,
227
+ content: result.content
228
+ };
229
+ });
230
+ }
231
+
134
232
  private buildContentBlocks(content: string): string | RequestContentBlock[] {
135
233
  const trimmed = content.trimStart();
234
+
235
+ // Check if this is a tool results message
136
236
  if (trimmed.startsWith('[') && trimmed.includes('"tool_use_id"')) {
137
- return content;
237
+ try {
238
+ const toolResults = JSON.parse(trimmed) as Array<{ tool_use_id: string; content: string }>;
239
+ return this.buildToolResultBlocks(toolResults);
240
+ } catch {
241
+ return content;
242
+ }
138
243
  }
139
244
 
140
245
  const imageRegex = /\[image ([^\]]+)\]/g;
@@ -32,11 +32,21 @@ export class AnthropicBackend implements AgentBackend {
32
32
  role: message.role,
33
33
  content: typeof message.content === 'string'
34
34
  ? message.content
35
- : message.content.map(block => (
36
- block.type === 'text'
37
- ? { type: 'text', text: block.text }
38
- : { type: 'image', source: { type: 'base64', media_type: block.mediaType, data: block.data } }
39
- ))
35
+ : message.content.map(block => {
36
+ if (block.type === 'text') {
37
+ return { type: 'text', text: block.text };
38
+ }
39
+ if (block.type === 'tool_result') {
40
+ // Pass tool results through in Anthropic format
41
+ return block;
42
+ }
43
+ if (block.type === 'tool_use') {
44
+ // Pass tool_use blocks through for assistant messages
45
+ return block;
46
+ }
47
+ // Image block
48
+ return { type: 'image', source: { type: 'base64', media_type: block.mediaType, data: block.data } };
49
+ })
40
50
  })),
41
51
  tools: request.tools.map(t => ({
42
52
  name: t.name,
@@ -46,6 +46,18 @@ export class GeminiBackend implements AgentBackend {
46
46
  if (block.type === 'text') {
47
47
  return { text: block.text };
48
48
  }
49
+ if (block.type === 'tool_result') {
50
+ // Gemini handles function responses differently - convert to text for now
51
+ const resultText = typeof block.content === 'string'
52
+ ? block.content
53
+ : block.content.map(b => b.type === 'text' ? b.text : '[image]').join('\n');
54
+ return { text: `Function result: ${resultText}` };
55
+ }
56
+ if (block.type === 'tool_use') {
57
+ // Convert tool_use to function call format for Gemini
58
+ return { functionCall: { name: block.name, args: block.input } };
59
+ }
60
+ // Image block
49
61
  return { inlineData: { mimeType: block.mediaType, data: block.data } };
50
62
  });
51
63
  };
@@ -12,5 +12,6 @@ export type {
12
12
  ToolUseBlock,
13
13
  LlmMessage,
14
14
  RequestContentBlock,
15
+ ToolResultBlock,
15
16
  TokenUsage
16
17
  } from './types.js';
@@ -49,6 +49,18 @@ export class OpenAICompatibleBackend implements AgentBackend {
49
49
  if (block.type === 'text') {
50
50
  return { type: 'text', text: block.text };
51
51
  }
52
+ if (block.type === 'tool_result') {
53
+ // OpenAI format: convert tool result to text
54
+ const resultText = typeof block.content === 'string'
55
+ ? block.content
56
+ : block.content.map(b => b.type === 'text' ? b.text : '[image]').join('\n');
57
+ return { type: 'text', text: `Tool result (${block.tool_use_id}): ${resultText}` };
58
+ }
59
+ if (block.type === 'tool_use') {
60
+ // OpenAI handles tool calls differently - convert to text representation
61
+ return { type: 'text', text: `[Tool call: ${block.name}(${JSON.stringify(block.input)})]` };
62
+ }
63
+ // Image block
52
64
  return {
53
65
  type: 'image_url',
54
66
  image_url: { url: `data:${block.mediaType};base64,${block.data}` }
@@ -12,7 +12,31 @@ export interface RequestImageBlock {
12
12
  path?: string;
13
13
  }
14
14
 
15
- export type RequestContentBlock = RequestTextBlock | RequestImageBlock;
15
+ export interface ToolResultImageSource {
16
+ type: 'base64';
17
+ media_type: string;
18
+ data: string;
19
+ }
20
+
21
+ export interface ToolResultImageBlock {
22
+ type: 'image';
23
+ source: ToolResultImageSource;
24
+ }
25
+
26
+ export interface ToolResultBlock {
27
+ type: 'tool_result';
28
+ tool_use_id: string;
29
+ content: string | Array<RequestTextBlock | ToolResultImageBlock>;
30
+ }
31
+
32
+ export interface ToolUseRequestBlock {
33
+ type: 'tool_use';
34
+ id: string;
35
+ name: string;
36
+ input: Record<string, unknown>;
37
+ }
38
+
39
+ export type RequestContentBlock = RequestTextBlock | RequestImageBlock | ToolResultBlock | ToolUseRequestBlock;
16
40
 
17
41
  export interface LlmMessage {
18
42
  role: 'user' | 'assistant';
@@ -0,0 +1,11 @@
1
+ import { Command } from './types.js';
2
+ import { getDictationStatus } from '../../utils/dictation.js';
3
+
4
+ export const dictationStatusCommand: Command = {
5
+ name: 'dictation',
6
+ description: 'Check voice dictation status and availability',
7
+ handler: async (args, ctx) => {
8
+ const status = getDictationStatus();
9
+ ctx.addMessage({ role: 'system', content: status });
10
+ }
11
+ };
@@ -15,6 +15,7 @@ import { retryCommand } from './retry.js';
15
15
  import { inputModeCommand } from './input_mode.js';
16
16
  import { keybindingsCommand } from './keybindings.js';
17
17
  import { updateCommand } from './update.js';
18
+ import { dictationStatusCommand } from './dictation.js';
18
19
  import { Command } from './types.js';
19
20
 
20
21
  const commandList: Command[] = [];
@@ -39,6 +40,7 @@ commandList.push(
39
40
  updateCommand,
40
41
  inputModeCommand,
41
42
  retryCommand,
43
+ dictationStatusCommand,
42
44
  exitCommand
43
45
  );
44
46
 
@@ -53,6 +53,12 @@ export interface SkillsController {
53
53
  list: () => Promise<Skill[]>;
54
54
  }
55
55
 
56
+ export interface DictationController {
57
+ startRecording: () => void;
58
+ stopRecording: () => Promise<string>; // Returns transcribed text
59
+ isRecording: () => boolean;
60
+ }
61
+
56
62
  export interface CommandContext {
57
63
  addMessage: (msg: Omit<Message, 'id' | 'timestamp'>) => void;
58
64
  clearMessages: () => void;
@@ -68,8 +74,10 @@ export interface CommandContext {
68
74
  clipboard: ClipboardController;
69
75
  models: ModelsController;
70
76
  skills: SkillsController;
77
+ dictation?: DictationController;
71
78
  getInputMode: () => 'queue' | 'interrupt';
72
79
  setInputMode: (mode: 'queue' | 'interrupt') => void;
80
+ setInputText?: (text: string) => void; // Set input field text
73
81
  }
74
82
 
75
83
  export interface Command {
@@ -2,5 +2,6 @@ export enum ToolCapability {
2
2
  FILE_READ = 'file_read',
3
3
  FILE_WRITE = 'file_write',
4
4
  SHELL_EXEC = 'shell_exec',
5
- NETWORK = 'network'
5
+ NETWORK = 'network',
6
+ SCREEN_CAPTURE = 'screen_capture'
6
7
  }
@@ -5,6 +5,7 @@ import { runCommandTool } from './shell.js';
5
5
  import { zergQueryTool } from './zerg.js';
6
6
  import { searchTool } from './search.js';
7
7
  import { listSkillsTool } from './skills.js';
8
+ import { screenshotTool, listWindowsTool, runAndMonitorTool } from './screenshot.js';
8
9
 
9
10
  // --- Tool Registry ---
10
11
 
@@ -15,7 +16,10 @@ export const defaultTools: Tool[] = [
15
16
  searchTool,
16
17
  listSkillsTool,
17
18
  runCommandTool,
18
- zergQueryTool
19
+ zergQueryTool,
20
+ screenshotTool,
21
+ listWindowsTool,
22
+ runAndMonitorTool
19
23
  ];
20
24
 
21
25
  export function getToolDefinitions(tools: Tool[] = defaultTools): ToolDefinition[] {
@@ -44,4 +48,5 @@ export { searchTool } from './search.js';
44
48
  export { listSkillsTool } from './skills.js';
45
49
  export { runCommandTool } from './shell.js';
46
50
  export { zergQueryTool } from './zerg.js';
51
+ export { screenshotTool, listWindowsTool, runAndMonitorTool } from './screenshot.js';
47
52
  export type { Tool } from './types.js';