keystone-cli 0.3.2 → 0.4.0

This diff compares the contents of publicly released package versions as published to their public registry, and is provided for informational purposes only.
@@ -0,0 +1,182 @@
+ import { afterEach, beforeEach, describe, expect, it, mock, spyOn } from 'bun:test';
+ import type { ExpressionContext } from '../expression/evaluator';
+ import * as agentParser from '../parser/agent-parser';
+ import type { Config } from '../parser/config-schema';
+ import type { Agent, LlmStep, Step } from '../parser/schema';
+ import { ConfigLoader } from '../utils/config-loader';
+ import { type LLMMessage, OpenAIAdapter } from './llm-adapter';
+ import { executeLlmStep } from './llm-executor';
+
+ describe('LLM Clarification', () => {
+   const originalChat = OpenAIAdapter.prototype.chat;
+
+   beforeEach(() => {
+     spyOn(agentParser, 'resolveAgentPath').mockReturnValue('test-agent.md');
+     spyOn(agentParser, 'parseAgent').mockReturnValue({
+       name: 'test-agent',
+       systemPrompt: 'test system prompt',
+       tools: [],
+       model: 'gpt-4o',
+     } as unknown as Agent);
+
+     ConfigLoader.setConfig({
+       providers: {
+         openai: { type: 'openai', api_key_env: 'OPENAI_API_KEY' },
+       },
+       default_provider: 'openai',
+       model_mappings: {},
+       storage: { retention_days: 30 },
+       workflows_directory: 'workflows',
+       mcp_servers: {},
+     } as unknown as Config);
+   });
+
+   afterEach(() => {
+     OpenAIAdapter.prototype.chat = originalChat;
+     mock.restore();
+   });
+
+   it('should inject ask tool when allowClarification is true', async () => {
+     const step: LlmStep = {
+       id: 'test-step',
+       type: 'llm',
+       agent: 'test-agent',
+       prompt: 'test prompt',
+       allowClarification: true,
+       needs: [],
+       maxIterations: 10,
+     };
+
+     const context: ExpressionContext = {
+       inputs: {},
+       output: {},
+     };
+
+     const chatMock = mock(async () => ({
+       message: { role: 'assistant' as const, content: 'Final response' },
+       usage: { prompt_tokens: 10, completion_tokens: 10, total_tokens: 20 },
+     }));
+     OpenAIAdapter.prototype.chat = chatMock;
+
+     const executeStepFn = mock(async () => ({ output: 'ok', status: 'success' as const }));
+
+     await executeLlmStep(step, context, executeStepFn);
+
+     expect(chatMock).toHaveBeenCalled();
+     const calls = chatMock.mock.calls as unknown[][];
+     const options = calls[0][1] as { tools?: { function: { name: string } }[] };
+     expect(options.tools).toBeDefined();
+     expect(options.tools?.some((t) => t.function.name === 'ask')).toBe(true);
+   });
+
+   it('should suspend in non-TTY when ask is called', async () => {
+     const originalIsTTY = process.stdin.isTTY;
+     // Assign directly to match step-executor.test.ts pattern
+     // @ts-ignore
+     process.stdin.isTTY = false;
+
+     try {
+       const step: LlmStep = {
+         id: 'test-step',
+         type: 'llm',
+         agent: 'test-agent',
+         prompt: 'test prompt',
+         allowClarification: true,
+         needs: [],
+         maxIterations: 10,
+       };
+
+       const context: ExpressionContext = {
+         inputs: {},
+         output: {},
+       };
+
+       const chatMock = mock(async () => ({
+         message: {
+           role: 'assistant' as const,
+           content: null,
+           tool_calls: [
+             {
+               id: 'call-ask',
+               type: 'function',
+               function: { name: 'ask', arguments: '{"question": "What is your name?"}' },
+             },
+           ],
+         },
+         usage: { prompt_tokens: 10, completion_tokens: 10, total_tokens: 20 },
+       }));
+       OpenAIAdapter.prototype.chat = chatMock;
+
+       const executeStepFn = mock(async () => ({ output: 'ok', status: 'success' as const }));
+
+       const result = await executeLlmStep(step, context, executeStepFn);
+
+       expect(result.status).toBe('suspended');
+       const output = result.output as { question: string; messages: unknown[] };
+       expect(output.question).toBe('What is your name?');
+       expect(output.messages).toBeDefined();
+     } finally {
+       // @ts-ignore
+       process.stdin.isTTY = originalIsTTY;
+     }
+   });
+
+   it('should resume correctly when answer is provided', async () => {
+     const step: LlmStep = {
+       id: 'test-step',
+       type: 'llm',
+       agent: 'test-agent',
+       prompt: 'test prompt',
+       allowClarification: true,
+       needs: [],
+       maxIterations: 10,
+     };
+
+     const context: ExpressionContext = {
+       inputs: {
+         'test-step': { __answer: 'My name is Keystone' },
+       },
+       output: {
+         messages: [
+           { role: 'system', content: 'system prompt' },
+           { role: 'user', content: 'test prompt' },
+           {
+             role: 'assistant',
+             content: null,
+             tool_calls: [
+               {
+                 id: 'call-ask',
+                 type: 'function',
+                 function: { name: 'ask', arguments: '{"question": "What is your name?"}' },
+               },
+             ],
+           },
+         ] as LLMMessage[],
+       },
+     };
+
+     const chatMock = mock(async () => ({
+       message: { role: 'assistant' as const, content: 'Hello Keystone' },
+       usage: { prompt_tokens: 10, completion_tokens: 10, total_tokens: 20 },
+     }));
+     OpenAIAdapter.prototype.chat = chatMock;
+
+     const executeStepFn = mock(async () => ({ output: 'ok', status: 'success' as const }));
+
+     const result = await executeLlmStep(step, context, executeStepFn);
+
+     expect(result.output).toBe('Hello Keystone');
+     expect(chatMock).toHaveBeenCalled();
+     const calls = chatMock.mock.calls as unknown[][];
+     const messages = calls[0][0] as {
+       role: string;
+       content: string | null;
+       tool_call_id?: string;
+     }[];
+
+     const toolMsg = messages[messages.length - 2];
+     expect(toolMsg.role).toBe('tool');
+     expect(toolMsg.content).toBe('My name is Keystone');
+     expect(toolMsg.tool_call_id).toBe('call-ask');
+   });
+ });
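The three tests above pin down the clarification contract end to end: executeLlmStep injects an ask tool when allowClarification is set, returns status 'suspended' together with the pending question and the full message history when ask fires without a TTY, and resumes once the answer is supplied under the reserved __answer input key. A minimal driver for that round trip might look like the sketch below; runStep and askUser are illustrative stand-ins, not keystone-cli exports.

// Sketch only: result shapes inferred from the tests above, not exported types.
type StepResult =
  | { status: 'success'; output: unknown }
  | { status: 'suspended'; output: { question: string; messages: unknown[] } };

// Hypothetical resume loop: re-run the step with the user's answer placed
// under the reserved __answer key (keyed by step id) until it stops suspending.
async function runWithClarification(
  runStep: (inputs: Record<string, unknown>) => Promise<StepResult>,
  askUser: (question: string) => Promise<string>
): Promise<unknown> {
  let result = await runStep({});
  while (result.status === 'suspended') {
    const answer = await askUser(result.output.question);
    result = await runStep({ 'test-step': { __answer: answer } });
  }
  return result.output;
}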
@@ -3,9 +3,10 @@ import type { ExpressionContext } from '../expression/evaluator';
  import { ExpressionEvaluator } from '../expression/evaluator';
  import { parseAgent, resolveAgentPath } from '../parser/agent-parser';
  import type { AgentTool, LlmStep, Step } from '../parser/schema';
+ import { Redactor } from '../utils/redactor';
  import { type LLMMessage, getAdapter } from './llm-adapter';
  import { MCPClient } from './mcp-client';
- import type { MCPManager } from './mcp-manager';
+ import type { MCPManager, MCPServerConfig } from './mcp-manager';
  import type { StepResult } from './step-executor';
  import type { Logger } from './workflow-runner';

@@ -42,10 +43,30 @@ export async function executeLlmStep(
    systemPrompt += `\n\nIMPORTANT: You must output valid JSON that matches the following schema:\n${JSON.stringify(step.schema, null, 2)}`;
  }

- const messages: LLMMessage[] = [
-   { role: 'system', content: systemPrompt },
-   { role: 'user', content: prompt },
- ];
+ const messages: LLMMessage[] = [];
+
+ // Resume from state if provided
+ if (context.output && typeof context.output === 'object' && 'messages' in context.output) {
+   messages.push(...(context.output.messages as LLMMessage[]));
+
+   // If we have an answer in inputs, add it as a tool result for the last tool call
+   const stepInputs = context.inputs?.[step.id] as Record<string, unknown> | undefined;
+   if (stepInputs && typeof stepInputs === 'object' && '__answer' in stepInputs) {
+     const answer = stepInputs.__answer;
+     const lastMessage = messages[messages.length - 1];
+     const askCall = lastMessage?.tool_calls?.find((tc) => tc.function.name === 'ask');
+     if (askCall) {
+       messages.push({
+         role: 'tool',
+         tool_call_id: askCall.id,
+         name: 'ask',
+         content: String(answer),
+       });
+     }
+   }
+ } else {
+   messages.push({ role: 'system', content: systemPrompt }, { role: 'user', content: prompt });
+ }

  const localMcpClients: MCPClient[] = [];
  const allTools: ToolDefinition[] = [];
@@ -84,14 +105,15 @@ export async function executeLlmStep(
  }

  // 3. Add MCP tools
- const mcpServersToConnect = [...(step.mcpServers || [])];
+ const mcpServersToConnect: (string | MCPServerConfig)[] = [...(step.mcpServers || [])];
  if (step.useGlobalMcp && mcpManager) {
    const globalServers = mcpManager.getGlobalServers();
    for (const globalServer of globalServers) {
      // Only add if not already explicitly listed
-     const alreadyListed = mcpServersToConnect.some((s) =>
-       typeof s === 'string' ? s === globalServer.name : s.name === globalServer.name
-     );
+     const alreadyListed = mcpServersToConnect.some((s) => {
+       const name = typeof s === 'string' ? s : s.name;
+       return name === globalServer.name;
+     });
      if (!alreadyListed) {
        mcpServersToConnect.push(globalServer);
      }
@@ -103,7 +125,7 @@ export async function executeLlmStep(
    let client: MCPClient | undefined;

    if (mcpManager) {
-     client = await mcpManager.getClient(server, logger);
+     client = await mcpManager.getClient(server as string | MCPServerConfig, logger);
    } else {
      // Fallback if no manager (should not happen in normal workflow run)
      if (typeof server === 'string') {
@@ -113,9 +135,9 @@ export async function executeLlmStep(
        logger.log(` 🔌 Connecting to MCP server: ${server.name}`);
        try {
          client = await MCPClient.createLocal(
-           server.command,
-           server.args || [],
-           server.env || {}
+           (server as MCPServerConfig).command || 'node',
+           (server as MCPServerConfig).args || [],
+           (server as MCPServerConfig).env || {}
          );
          await client.initialize();
          localMcpClients.push(client);
@@ -123,7 +145,9 @@ export async function executeLlmStep(
          logger.error(
            ` ✗ Failed to connect to MCP server ${server.name}: ${error instanceof Error ? error.message : String(error)}`
          );
-         client.stop();
+         if (client) {
+           client.stop();
+         }
          client = undefined;
        }
      }
@@ -144,44 +168,76 @@ export async function executeLlmStep(
  }

  const llmTools = allTools.map((t) => ({
-   type: 'function',
+   type: 'function' as const,
    function: {
      name: t.name,
      description: t.description,
-     parameters: t.parameters,
+     parameters: t.parameters as Record<string, unknown>,
    },
  }));

+ if (step.allowClarification) {
+   llmTools.push({
+     type: 'function' as const,
+     function: {
+       name: 'ask',
+       description:
+         'Ask the user a clarifying question if the initial request is ambiguous or missing information.',
+       parameters: {
+         type: 'object',
+         properties: {
+           question: {
+             type: 'string',
+             description: 'The question to ask the user',
+           },
+         },
+         required: ['question'],
+       } as Record<string, unknown>,
+     },
+   });
+ }
+
  // ReAct Loop
  let iterations = 0;
  const maxIterations = step.maxIterations || 10;
+ const totalUsage = {
+   prompt_tokens: 0,
+   completion_tokens: 0,
+   total_tokens: 0,
+ };

  while (iterations < maxIterations) {
    iterations++;

+   const redactor = new Redactor(context.secrets || {});
+
    const response = await adapter.chat(messages, {
      model: resolvedModel,
      tools: llmTools.length > 0 ? llmTools : undefined,
+     onStream: (chunk) => {
+       if (!step.schema) {
+         process.stdout.write(redactor.redact(chunk));
+       }
+     },
    });

+   if (response.usage) {
+     totalUsage.prompt_tokens += response.usage.prompt_tokens;
+     totalUsage.completion_tokens += response.usage.completion_tokens;
+     totalUsage.total_tokens += response.usage.total_tokens;
+   }
+
    const { message } = response;
    messages.push(message);

-   if (message.content && !step.schema) {
-     logger.log(`\n${message.content}`);
-   }
-
    if (!message.tool_calls || message.tool_calls.length === 0) {
      let output = message.content;

      // If schema is defined, attempt to parse JSON
      if (step.schema && typeof output === 'string') {
        try {
-         // Attempt to extract JSON if wrapped in markdown code blocks or just finding the first {
-         const jsonMatch =
-           output.match(/```(?:json)?\s*([\s\S]*?)\s*```/i) || output.match(/\{[\s\S]*\}/);
-         const jsonStr = jsonMatch ? jsonMatch[1] || jsonMatch[0] : output;
-         output = JSON.parse(jsonStr);
+         const { extractJson } = await import('../utils/json-parser');
+         output = extractJson(output);
        } catch (e) {
          throw new Error(
            `Failed to parse LLM output as JSON matching schema: ${e instanceof Error ? e.message : String(e)}\nOutput: ${output}`
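Two additions inside the ReAct loop deserve a note: usage is now summed across iterations into totalUsage, and streamed chunks pass through a Redactor before reaching stdout. Only the Redactor's shape is visible in this diff (a constructor taking context.secrets and a redact(chunk) method), so the following is purely an assumed sketch of what such a class might do, treating secrets as a map of string values:

// Assumed sketch -- the real Redactor lives in ../utils/redactor and only its
// constructor and redact(chunk) signature are visible in this diff.
class RedactorSketch {
  private readonly values: string[];
  constructor(secrets: Record<string, string>) {
    this.values = Object.values(secrets).filter((v) => v.length > 0);
  }
  redact(chunk: string): string {
    // Replace every occurrence of every secret value with a placeholder.
    let out = chunk;
    for (const value of this.values) {
      out = out.split(value).join('[REDACTED]');
    }
    return out;
  }
}

Note that per-chunk replacement can miss a secret that straddles two stream chunks; a stricter design would buffer each chunk's trailing characters up to the length of the longest secret before flushing.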
@@ -192,6 +248,7 @@ export async function executeLlmStep(
      return {
        output,
        status: 'success',
+       usage: totalUsage,
      };
    }

@@ -201,6 +258,41 @@ export async function executeLlmStep(
      const toolInfo = allTools.find((t) => t.name === toolCall.function.name);

      if (!toolInfo) {
+       if (toolCall.function.name === 'ask' && step.allowClarification) {
+         const args = JSON.parse(toolCall.function.arguments) as { question: string };
+
+         if (process.stdin.isTTY) {
+           // In TTY, we can use a human step to get the answer immediately
+           logger.log(`\n🤔 Question from ${agent.name}: ${args.question}`);
+           const result = await executeStepFn(
+             {
+               id: `${step.id}-clarify`,
+               type: 'human',
+               message: args.question,
+               inputType: 'text',
+             } as Step,
+             context
+           );
+
+           messages.push({
+             role: 'tool',
+             tool_call_id: toolCall.id,
+             name: 'ask',
+             content: String(result.output),
+           });
+           continue;
+         }
+         // In non-TTY, we suspend
+         return {
+           status: 'suspended',
+           output: {
+             messages,
+             question: args.question,
+           },
+           usage: totalUsage,
+         };
+       }
+
        messages.push({
          role: 'tool',
          tool_call_id: toolCall.id,
@@ -233,7 +325,7 @@ export async function executeLlmStep(
      // Execute the tool as a step
      const toolContext: ExpressionContext = {
        ...context,
-       args,
+       item: args, // Use item to pass args to tool execution
      };

      const result = await executeStepFn(toolInfo.execution, toolContext);
@@ -135,7 +135,10 @@ export class MCPManager {
  }

  private getServerKey(config: MCPServerConfig): string {
-   return config.name;
+   if (config.type === 'remote') {
+     return `remote:${config.name}:${config.url}`;
+   }
+   return `local:${config.name}:${config.command}:${(config.args || []).join(' ')}`;
  }

  getGlobalServers(): MCPServerConfig[] {
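The getServerKey change above is a cache-correctness fix: keying clients by name alone meant two differently configured servers sharing a name would silently reuse one client, whereas the new key folds in the transport details. A hypothetical pair of configs, limited to the fields getServerKey reads, shows the difference:

// Hypothetical configs: same name, different command lines.
const a = { type: 'local', name: 'fs', command: 'bun', args: ['fs-server.ts', '--root', '/a'] };
const b = { type: 'local', name: 'fs', command: 'bun', args: ['fs-server.ts', '--root', '/b'] };
// 0.3.2 key for both: "fs" -- the second server silently reused the first client.
// 0.4.0 keys: "local:fs:bun:fs-server.ts --root /a" vs "local:fs:bun:fs-server.ts --root /b".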
@@ -38,7 +38,7 @@ describe('MCPServer', () => {
        method: 'tools/list',
      });

-     expect(response?.result?.tools).toHaveLength(5);
+     expect(response?.result?.tools).toHaveLength(7);
      // @ts-ignore
      expect(response?.result?.tools?.map((t) => t.name)).toContain('run_workflow');
    });
@@ -249,4 +249,118 @@ describe('MCPServer', () => {
      writeSpy.mockRestore();
      consoleSpy.mockRestore();
    });
+
+   it('should call start_workflow tool and return immediately', async () => {
+     spyOn(WorkflowRegistry, 'resolvePath').mockReturnValue('test.yaml');
+     // @ts-ignore
+     spyOn(WorkflowParser, 'loadWorkflow').mockReturnValue({
+       name: 'test-wf',
+       steps: [],
+     });
+
+     // Mock WorkflowRunner - simulate a slow workflow
+     const mockRun = mock(
+       () => new Promise((resolve) => setTimeout(() => resolve({ result: 'ok' }), 100))
+     );
+     // @ts-ignore
+     spyOn(WorkflowRunner.prototype, 'run').mockImplementation(mockRun);
+     spyOn(WorkflowRunner.prototype, 'getRunId').mockReturnValue('async-run-123');
+
+     const response = await handleMessage({
+       jsonrpc: '2.0',
+       id: 10,
+       method: 'tools/call',
+       params: {
+         name: 'start_workflow',
+         arguments: { workflow_name: 'test-wf', inputs: {} },
+       },
+     });
+
+     const result = JSON.parse(response?.result?.content?.[0]?.text);
+     expect(result.status).toBe('running');
+     expect(result.run_id).toBe('async-run-123');
+     expect(result.hint).toContain('get_run_status');
+   });
+
+   it('should call get_run_status tool for running workflow', async () => {
+     const runId = 'status-test-run';
+     await db.createRun(runId, 'test-wf', { foo: 'bar' });
+     await db.updateRunStatus(runId, 'running');
+
+     const response = await handleMessage({
+       jsonrpc: '2.0',
+       id: 11,
+       method: 'tools/call',
+       params: { name: 'get_run_status', arguments: { run_id: runId } },
+     });
+
+     const status = JSON.parse(response?.result?.content?.[0]?.text);
+     expect(status.run_id).toBe(runId);
+     expect(status.workflow).toBe('test-wf');
+     expect(status.status).toBe('running');
+     expect(status.hint).toContain('still running');
+   });
+
+   it('should call get_run_status tool for completed workflow', async () => {
+     const runId = 'completed-test-run';
+     await db.createRun(runId, 'test-wf', {});
+     await db.updateRunStatus(runId, 'completed', { output: 'done' });
+
+     const response = await handleMessage({
+       jsonrpc: '2.0',
+       id: 12,
+       method: 'tools/call',
+       params: { name: 'get_run_status', arguments: { run_id: runId } },
+     });
+
+     const status = JSON.parse(response?.result?.content?.[0]?.text);
+     expect(status.status).toBe('completed');
+     expect(status.outputs).toEqual({ output: 'done' });
+     expect(status.hint).toBeUndefined();
+   });
+
+   it('should call get_run_status tool for failed workflow', async () => {
+     const runId = 'failed-test-run';
+     await db.createRun(runId, 'test-wf', {});
+     await db.updateRunStatus(runId, 'failed', undefined, 'Something went wrong');
+
+     const response = await handleMessage({
+       jsonrpc: '2.0',
+       id: 13,
+       method: 'tools/call',
+       params: { name: 'get_run_status', arguments: { run_id: runId } },
+     });
+
+     const status = JSON.parse(response?.result?.content?.[0]?.text);
+     expect(status.status).toBe('failed');
+     expect(status.error).toBe('Something went wrong');
+   });
+
+   it('should call get_run_status tool for paused workflow', async () => {
+     const runId = 'paused-test-run';
+     await db.createRun(runId, 'test-wf', {});
+     await db.updateRunStatus(runId, 'paused');
+
+     const response = await handleMessage({
+       jsonrpc: '2.0',
+       id: 14,
+       method: 'tools/call',
+       params: { name: 'get_run_status', arguments: { run_id: runId } },
+     });
+
+     const status = JSON.parse(response?.result?.content?.[0]?.text);
+     expect(status.status).toBe('paused');
+     expect(status.hint).toContain('answer_human_input');
+   });
+
+   it('should return error for non-existent run in get_run_status', async () => {
+     const response = await handleMessage({
+       jsonrpc: '2.0',
+       id: 15,
+       method: 'tools/call',
+       params: { name: 'get_run_status', arguments: { run_id: 'non-existent' } },
+     });
+
+     expect(response?.error?.message).toContain('not found');
+   });
  });
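Taken together, the new MCP server tests describe an asynchronous workflow protocol: start_workflow returns immediately with a run_id and a hint, and callers poll get_run_status until the run completes, fails, or pauses for human input. A hypothetical client-side polling loop over that surface (the call parameter stands in for whatever JSON-RPC tools/call transport the client already has):

// Sketch of a client polling loop; response shapes mirror the tests above.
type ToolCallResult = { content: { text: string }[] };

async function startAndWait(
  call: (name: string, args: Record<string, unknown>) => Promise<ToolCallResult>,
  workflowName: string,
  inputs: Record<string, unknown>
): Promise<unknown> {
  const started = JSON.parse(
    (await call('start_workflow', { workflow_name: workflowName, inputs })).content[0].text
  );
  for (;;) {
    const status = JSON.parse(
      (await call('get_run_status', { run_id: started.run_id })).content[0].text
    );
    if (status.status === 'completed') return status.outputs;
    if (status.status === 'failed') throw new Error(status.error);
    if (status.status === 'paused') throw new Error('Run is waiting on answer_human_input');
    // Still running: back off briefly before polling again.
    await new Promise((resolve) => setTimeout(resolve, 1_000));
  }
}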