keystone-cli 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/package.json +1 -1
- package/src/cli.ts +233 -21
- package/src/db/memory-db.ts +6 -0
- package/src/db/sqlite-setup.test.ts +47 -0
- package/src/db/workflow-db.ts +6 -0
- package/src/expression/evaluator.ts +2 -0
- package/src/parser/schema.ts +3 -0
- package/src/runner/debug-repl.test.ts +240 -6
- package/src/runner/llm-adapter.test.ts +10 -4
- package/src/runner/llm-executor.ts +39 -3
- package/src/runner/shell-executor.ts +40 -12
- package/src/runner/standard-tools-integration.test.ts +147 -0
- package/src/runner/standard-tools.test.ts +69 -0
- package/src/runner/standard-tools.ts +270 -0
- package/src/runner/step-executor.test.ts +194 -1
- package/src/runner/step-executor.ts +46 -15
- package/src/runner/stream-utils.test.ts +113 -7
- package/src/runner/stream-utils.ts +4 -4
- package/src/runner/workflow-runner.ts +14 -20
- package/src/templates/agents/keystone-architect.md +16 -2
- package/src/templates/agents/software-engineer.md +17 -0
- package/src/templates/memory-service.yaml +54 -0
- package/src/templates/robust-automation.yaml +44 -0
- package/src/templates/scaffold-feature.yaml +1 -0
package/src/runner/stream-utils.test.ts:

```diff
@@ -4,16 +4,24 @@ import { processOpenAIStream } from './stream-utils';
 const encoder = new TextEncoder();
 
 function responseFromChunks(chunks: string[]): Response {
-
-
-
-
+  let index = 0;
+  const reader = {
+    async read(): Promise<{ done: boolean; value?: Uint8Array }> {
+      if (index >= chunks.length) {
+        return { done: true, value: undefined };
       }
-
+      const value = encoder.encode(chunks[index]);
+      index += 1;
+      return { done: false, value };
     },
-
+    async cancel(): Promise<void> {},
+  };
 
-  return
+  return {
+    body: {
+      getReader: () => reader,
+    },
+  } as Response;
 }
 
 describe('processOpenAIStream', () => {
@@ -61,5 +69,103 @@ describe('processOpenAIStream', () => {
 
     expect(result.message.content).toBe('ok');
     expect(logger.warn).toHaveBeenCalledTimes(1);
+    expect(logger.warn.mock.calls[0][0]).toContain('Malformed JSON line');
+  });
+
+  it('throws error when buffer size is exceeded', async () => {
+    const response = responseFromChunks(['a'.repeat(1024 * 1024 + 1)]);
+    await expect(processOpenAIStream(response)).rejects.toThrow(
+      'LLM stream line exceed maximum size'
+    );
+  });
+
+  it('throws error when response size limit is exceeded', async () => {
+    const response = responseFromChunks([
+      `data: {"choices":[{"delta":{"content":"${'a'.repeat(600 * 1024)}"}}]}\n`,
+      `data: {"choices":[{"delta":{"content":"${'a'.repeat(500 * 1024)}"}}]}\n`,
+    ]);
+    await expect(processOpenAIStream(response)).rejects.toThrow(
+      'LLM response exceeds maximum size'
+    );
+  });
+
+  it('throws error when tool call arguments size limit is exceeded', async () => {
+    const response = responseFromChunks([
+      `data: {"choices":[{"delta":{"tool_calls":[{"index":0,"function":{"arguments":"${'a'.repeat(600 * 1024)}"}}]}}]}\n`,
+      `data: {"choices":[{"delta":{"tool_calls":[{"index":0,"function":{"arguments":"${'a'.repeat(500 * 1024)}"}}]}}]}\n`,
+    ]);
+    await expect(processOpenAIStream(response)).rejects.toThrow(
+      'LLM tool call arguments exceed maximum size'
+    );
+  });
+
+  it('handles and logs generic errors during chunk processing', async () => {
+    const logger = {
+      log: mock(() => {}),
+      error: mock(() => {}),
+      warn: mock(() => {}),
+      info: mock(() => {}),
+    };
+    // Mocking JSON.parse to throw a non-SyntaxError
+    const originalParse = JSON.parse;
+    JSON.parse = (str: string) => {
+      if (str === '{"trigger_error":true}') throw new Error('Generic error');
+      return originalParse(str);
+    };
+
+    try {
+      const response = responseFromChunks(['data: {"trigger_error":true}\n']);
+      await processOpenAIStream(response, { logger });
+      expect(logger.warn).toHaveBeenCalledTimes(1);
+      expect(logger.warn.mock.calls[0][0]).toContain(
+        'Error processing chunk: Error: Generic error'
+      );
+    } finally {
+      JSON.parse = originalParse;
+    }
+  });
+
+  it('handles errors in the final line processing', async () => {
+    const logger = {
+      log: mock(() => {}),
+      error: mock(() => {}),
+      warn: mock(() => {}),
+      info: mock(() => {}),
+    };
+    const response = responseFromChunks(['data: {bad json}']); // No newline, triggers buffer processing
+
+    await processOpenAIStream(response, { logger });
+
+    expect(logger.warn).toHaveBeenCalledTimes(1);
+    expect(logger.warn.mock.calls[0][0]).toContain('Malformed JSON line');
+  });
+
+  it('throws size limit error in final line processing', async () => {
+    const response = responseFromChunks([
+      `data: {"choices":[{"delta":{"content":"${'a'.repeat(600 * 1024)}"}}]}\n`,
+      `data: {"choices":[{"delta":{"content":"${'a'.repeat(500 * 1024)}"}}]}`,
+    ]);
+    // The first line is ok, the second line is in the final buffer and exceeds size
+    await expect(processOpenAIStream(response)).rejects.toThrow(
+      'LLM response exceeds maximum size'
+    );
+  });
+
+  it('bubbles up reader cancel errors', async () => {
+    const reader = {
+      read: async () => {
+        throw new Error('Read error');
+      },
+      cancel: async () => {
+        throw new Error('Cancel error');
+      },
+    };
+    const response = {
+      body: {
+        getReader: () => reader,
+      },
+    } as unknown as Response;
+
+    await expect(processOpenAIStream(response)).rejects.toThrow('Read error');
   });
 });
```
package/src/runner/stream-utils.ts:

```diff
@@ -67,7 +67,7 @@ export async function processOpenAIStream(
       const toolCall = tc as ToolCallDelta;
       if (!toolCalls[toolCall.index]) {
         toolCalls[toolCall.index] = {
-          id: toolCall.id,
+          id: toolCall.id || '',
           type: 'function',
           function: { name: '', arguments: '' },
         };
@@ -93,7 +93,7 @@ export async function processOpenAIStream(
     const activeLogger = options?.logger || new ConsoleLogger();
 
     // Rethrow size limit errors so they bubble up
-    if (
+    if (e instanceof Error && e.message.toLowerCase().includes('maximum size')) {
       throw e;
     }
 
@@ -137,7 +137,7 @@ export async function processOpenAIStream(
       const toolCall = tc as ToolCallDelta;
      if (!toolCalls[toolCall.index]) {
         toolCalls[toolCall.index] = {
-          id: toolCall.id,
+          id: toolCall.id || '',
           type: 'function',
           function: { name: '', arguments: '' },
         };
@@ -161,7 +161,7 @@ export async function processOpenAIStream(
      }
    }
  } catch (e) {
-    if (
+    if (e instanceof Error && e.message.toLowerCase().includes('maximum size')) {
      throw e;
    }
    const activeLogger = options?.logger || new ConsoleLogger();
```
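With these guards, oversized stream lines, responses, and tool-call argument buffers now abort with a thrown error, while malformed JSON chunks are only logged and skipped. A minimal consumption sketch, assuming nothing beyond the `{ logger }` option and `result.message` shape exercised by the tests above (the endpoint and request body are illustrative):

```ts
// Sketch only: options bag and result shape inferred from the tests above.
import { processOpenAIStream } from './stream-utils';

async function chat(url: string, body: string): Promise<string | undefined> {
  const response = await fetch(url, { method: 'POST', body });
  // Malformed JSON lines surface via logger.warn and are skipped; any
  // "maximum size" error is rethrown to this caller instead of swallowed.
  const result = await processOpenAIStream(response, { logger: console });
  return result.message.content;
}
```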
package/src/runner/workflow-runner.ts:

```diff
@@ -630,16 +630,13 @@ export class WorkflowRunner {
     }
 
     const operation = async () => {
-      const result = await executeStep(
-
-
-        this.
-        this.
-        this.
-
-        this.options.workflowDir,
-        this.options.dryRun
-      );
+      const result = await executeStep(stepToExecute, context, this.logger, {
+        executeWorkflowFn: this.executeSubWorkflow.bind(this),
+        mcpManager: this.mcpManager,
+        memoryDb: this.memoryDb,
+        workflowDir: this.options.workflowDir,
+        dryRun: this.options.dryRun,
+      });
       if (result.status === 'failed') {
         throw new Error(result.error || 'Step failed');
       }
@@ -868,16 +865,13 @@ Do not change the 'id' or 'type' or 'auto_heal' fields.
 
     // Execute the agent step
     // We use a fresh context but share secrets/env
-    const result = await executeStep(
-
-
-      this.
-      this.
-      this.
-
-      this.options.workflowDir,
-      this.options.dryRun
-    );
+    const result = await executeStep(agentStep, context, this.logger, {
+      executeWorkflowFn: this.executeSubWorkflow.bind(this),
+      mcpManager: this.mcpManager,
+      memoryDb: this.memoryDb,
+      workflowDir: this.options.workflowDir,
+      dryRun: this.options.dryRun,
+    });
 
     if (result.status !== 'success' || !result.output) {
       throw new Error(`Healer agent failed: ${result.error || 'No output'}`);
```
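Both call sites now thread dependencies through a single named options bag instead of a long positional argument list. A rough TypeScript sketch of the signature implied by the two calls above; only the option names and the `status`/`output`/`error` result fields are confirmed by the diff, every type here is an assumption, and the real definition lives in package/src/runner/step-executor.ts:

```ts
// Hypothetical reconstruction from the call sites above; types are guesses.
interface ExecuteStepOptions {
  executeWorkflowFn?: (path: string, inputs?: unknown) => Promise<unknown>;
  mcpManager?: unknown;
  memoryDb?: unknown;
  workflowDir?: string;
  dryRun?: boolean;
}

declare function executeStep(
  step: unknown,
  context: unknown,
  logger: unknown,
  options: ExecuteStepOptions
): Promise<{ status: string; output?: unknown; error?: string }>;
```

A named bag keeps optional dependencies self-describing and lets new ones (such as `memoryDb` in this release) be added without touching every call site.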
package/src/templates/agents/keystone-architect.md:

```diff
@@ -19,9 +19,9 @@ You are the Keystone Architect. Your goal is to design and generate high-quality
 - **eval**: (Optional) Configuration for prompt optimization `{ scorer: 'llm'|'script', agent, prompt, run }`.
 - **steps**: Array of step objects. Each step MUST have an `id` and a `type`:
   - **shell**: `{ id, type: 'shell', run, dir, env, allowInsecure, transform }` (Set `allowInsecure: true` to bypass risky command checks)
-  - **llm**: `{ id, type: 'llm', agent, prompt, schema, provider, model, tools, maxIterations, useGlobalMcp, allowClarification, mcpServers }`
+  - **llm**: `{ id, type: 'llm', agent, prompt, schema, provider, model, tools, maxIterations, useGlobalMcp, allowClarification, useStandardTools, allowOutsideCwd, allowInsecure, mcpServers }`
   - **workflow**: `{ id, type: 'workflow', path, inputs }`
-  - **file**: `{ id, type: 'file', path, op: 'read'|'write'|'append', content }`
+  - **file**: `{ id, type: 'file', path, op: 'read'|'write'|'append', content, allowOutsideCwd }`
   - **request**: `{ id, type: 'request', url, method, body, headers }`
   - **human**: `{ id, type: 'human', message, inputType: 'confirm'|'text' }` (Note: 'confirm' returns boolean but automatically fallbacks to text if input is not yes/no)
   - **sleep**: `{ id, type: 'sleep', duration }` (duration can be a number or expression string)
@@ -31,6 +31,17 @@ You are the Keystone Architect. Your goal is to design and generate high-quality
 - **finally**: Optional array of steps to run at the end of the workflow, regardless of success or failure.
 - **IMPORTANT**: Steps run in **parallel** by default. To ensure sequential execution, a step must explicitly list the previous step's ID in its `needs` array.
 
+## Standard Tools
+When `useStandardTools: true` is set on an `llm` step, the agent has access to:
+- `read_file(path)`: Read file contents.
+- `read_file_lines(path, start, count)`: Read a specific range of lines.
+- `write_file(path, content)`: Write/overwrite file.
+- `list_files(path)`: List directory contents.
+- `search_files(pattern, dir)`: Search for files by pattern (glob).
+- `search_content(query, pattern, dir)`: Search for text within files.
+- `run_command(command, dir)`: Run shell commands (restricted by `allowInsecure`).
+- **Path Gating**: Restricted to CWD by default. Use `allowOutsideCwd: true` to bypass.
+
 ## Agent Schema (.md)
 Markdown files with YAML frontmatter:
 - **name**: Agent name.
```
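For reference, a minimal sketch of an `llm` step that opts into the standard tools; the field names come from the schema and tool list above, while the step id, agent, and prompt are purely illustrative:

```yaml
# Illustrative only: field names taken from the llm step schema above.
- id: fix_failing_test
  type: llm
  agent: software-engineer
  useStandardTools: true
  allowOutsideCwd: false  # default path gating: tools stay inside the CWD
  prompt: |
    Locate the failing unit test, read the relevant source files, and fix the bug.
```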
package/src/templates/agents/keystone-architect.md (continued):

```diff
@@ -45,6 +56,9 @@ Markdown files with YAML frontmatter:
 - `${{ steps.id.output }}`
 - `${{ steps.id.status }}` (e.g., `'pending'`, `'running'`, `'success'`, `'failed'`, `'skipped'`)
 - `${{ args.paramName }}` (used inside agent tools)
+- `${{ item }}` (current item in a `foreach` loop)
+- `${{ secrets.NAME }}` (access redacted secrets)
+- `${{ env.NAME }}` (access environment variables)
 - Standard JS-like expressions: `${{ steps.count > 0 ? 'yes' : 'no' }}`
 
 # Guidelines
```
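A small sketch showing the newly documented expression forms in context; the `foreach` step shape, step ids, and URL are assumptions for illustration:

```yaml
# Illustrative only: exercises the item, secrets, and env expressions.
- id: deploy_each
  type: shell
  foreach: ${{ steps.list_targets.output }}
  run: |
    echo "Deploying ${{ item }} as ${{ env.USER }}"
    curl -s -H "Authorization: Bearer ${{ secrets.DEPLOY_TOKEN }}" \
      "https://deploy.example.com/${{ item }}"
```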
package/src/templates/agents/software-engineer.md (new file):

```diff
@@ -0,0 +1,17 @@
+---
+name: software-engineer
+description: "Expert at writing and debugging code"
+model: gpt-4o
+---
+
+# Role
+You are a Software Engineer. Your goal is to implement, refactor, and debug code based on user specifications.
+
+# Guidelines
+- Use `list_files` or `search_files` to understand the project structure.
+- Use `search_content` to find where specific code or dependencies are located.
+- Use `read_file` to examine code, or `read_file_lines` for large files.
+- Use `write_file` to implement new features or fixes.
+- Use `run_command` only when necessary for testing or building (e.g., `npm test`, `bun run build`).
+- Be concise and follow best practices for the language you are writing in.
+- Always verify your changes if possible by running tests.
```
package/src/templates/memory-service.yaml (new file):

```diff
@@ -0,0 +1,54 @@
+name: memory-service
+description: "Demonstrate long-term memory capabilities"
+
+steps:
+  # Store information in memory
+  - id: remember_facts
+    type: memory
+    op: store
+    text: "Keystone CLI was initialized on 2025-01-01 by the engineering team."
+    metadata:
+      type: "fact"
+      confidence: 1.0
+
+  - id: remember_preference
+    type: memory
+    op: store
+    text: "The user prefers TypeScript over JavaScript for all projects."
+    metadata:
+      type: "preference"
+      confidence: 0.9
+    needs: [remember_facts]
+
+  # Search for information
+  - id: recall_preference
+    type: memory
+    op: search
+    query: "What language does the user like?"
+    limit: 1
+    needs: [remember_preference]
+
+  # Use recalled information in an LLM step
+  - id: confirm_memory
+    type: llm
+    agent: general
+    needs: [recall_preference]
+    prompt: |
+      Based on this memory:
+      ${{ steps.recall_preference.output[0].content }}
+
+      What programming language should I use? Answer in one word.
+    schema:
+      type: object
+      properties:
+        language:
+          type: string
+      required: [language]
+
+  - id: summary
+    type: shell
+    needs: [confirm_memory]
+    run: |
+      echo "Memory Service Demo Complete"
+      echo "Recalled: ${{ steps.recall_preference.output[0].content }}"
+      echo "Decision: ${{ steps.confirm_memory.output.language }}"
```
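The template dereferences `${{ steps.recall_preference.output[0].content }}`, which implies that `op: search` yields an array of memory records. A presumed shape, not confirmed by this diff:

```yaml
# Presumed result of the recall_preference search step above.
- content: "The user prefers TypeScript over JavaScript for all projects."
  metadata:
    type: "preference"
    confidence: 0.9
```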
package/src/templates/robust-automation.yaml (new file):

```diff
@@ -0,0 +1,44 @@
+name: robust-automation
+description: "Demonstrate auto-healing and reflexion features"
+
+steps:
+  # Demonstration of auto-healing
+  # This step attempts to run a broken command, but the agent should fix it
+  - id: auto_heal_demo
+    type: shell
+    run: |
+      # This command has a typo and should fail
+      ech "Hello World"
+    auto_heal:
+      agent: software-engineer
+      maxAttempts: 2
+      model: gpt-4o
+
+  # Demonstration of reflexion (self-correction)
+  # This step asks for JSON but provides a prompt that might lead to text
+  # Reflexion should catch the schema validation error and retry
+  - id: reflexion_demo
+    type: llm
+    agent: general
+    needs: [auto_heal_demo]
+    prompt: |
+      Generate a list of 3 random colors. Just list them.
+    schema:
+      type: object
+      properties:
+        colors:
+          type: array
+          items:
+            type: string
+      required: [colors]
+    reflexion:
+      limit: 3
+      hint: "Ensure the output is valid JSON matching the schema."
+
+  - id: summary
+    type: shell
+    needs: [reflexion_demo]
+    run: |
+      echo "Robust automation demo complete."
+      echo "Healed Command Output: ${{ steps.auto_heal_demo.output.stdout }}"
+      echo "Reflexion Output: ${{ steps.reflexion_demo.output }}"
```