clawlet 0.2.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/package.json +10 -3
- package/src/agent.eval.test.ts +218 -0
- package/src/agent.ts +52 -1004
- package/src/cli.ts +3 -1
- package/src/evals/bootstrap_trigger.yaml +14 -0
- package/src/evals/connection_auth.yaml +12 -0
- package/src/evals/create_python_file.yaml +11 -0
- package/src/evals/directory_traversal.yaml +13 -0
- package/src/evals/empty_directory.yaml +12 -0
- package/src/evals/extend_agents_md.yaml +161 -0
- package/src/evals/external_data.yaml +16 -0
- package/src/evals/file_not_found.yaml +15 -0
- package/src/evals/memory_persistence.yaml +19 -0
- package/src/evals/move_and_rename.yaml +13 -0
- package/src/evals/needle_in_haystack.yaml +16 -0
- package/src/evals/persona_tone.yaml +16 -0
- package/src/evals/rag_user.yaml +17 -0
- package/src/evals/reasoning_multi_step.yaml +13 -0
- package/src/evals/refactoring_edit.yaml +14 -0
- package/src/evals/skill_sandbox_execution.yaml +19 -0
- package/src/evals/skill_system_installation.yaml +14 -0
- package/src/evals/soft_delete.yaml +17 -0
- package/src/evals/stat_check.yaml +16 -0
- package/src/evals/workflow_cleanup.yaml +17 -0
- package/src/evals/write_complex_json.yaml +15 -0
- package/src/llm.ts +35 -0
- package/src/logger.ts +39 -0
- package/src/memory.ts +95 -27
- package/src/storage.ts +147 -95
- package/src/tools.ts +1044 -0
- package/template/AGENTS.template +1 -1
package/README.md
CHANGED
|
@@ -53,7 +53,7 @@ $ pnpm start
|
|
|
53
53
|
* features
|
|
54
54
|
- [x] handle session history
|
|
55
55
|
- [x] read/write files and trash in workspace folder
|
|
56
|
-
- [
|
|
56
|
+
- [ ] git history for workspace folder
|
|
57
57
|
- [x] <AGENTS.md> support
|
|
58
58
|
- [x] <SOUL.md> support
|
|
59
59
|
- [x] users details at USER.md
|
|
@@ -70,13 +70,24 @@ $ pnpm start
|
|
|
70
70
|
* messaging
|
|
71
71
|
- [x] chat via command line interface
|
|
72
72
|
- [x] chat via telegram bot
|
|
73
|
-
*
|
|
73
|
+
* installation
|
|
74
|
+
- [x] make available with (p)npx
|
|
75
|
+
* qa
|
|
76
|
+
- [x] add evals via vitest
|
|
74
77
|
* operating system support
|
|
75
78
|
- [x] runs on macosx
|
|
76
79
|
- [ ] run on windows / linux
|
|
77
80
|
- [ ] an *.app for mac
|
|
78
81
|
- [ ] an .exe for windows
|
|
79
82
|
|
|
83
|
+
# Similar projects
|
|
84
|
+
|
|
85
|
+
* TypeScript
|
|
86
|
+
* <https://github.com/openclaw/openclaw>
|
|
87
|
+
* Go
|
|
88
|
+
* <https://github.com/sipeed/picoclaw>
|
|
89
|
+
* <https://github.com/HKUDS/nanobot>
|
|
90
|
+
|
|
80
91
|
# License
|
|
81
92
|
|
|
82
93
|
clawlet is copyright 2026 by DracoBlue and licensed under the MIT License.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "clawlet",
|
|
3
|
-
"version": "0.2.1",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "A lightweight AI based personal assistant.",
|
|
5
5
|
"main": "src/cli.ts",
|
|
6
6
|
"type": "module",
|
|
@@ -30,17 +30,24 @@
|
|
|
30
30
|
"@ai-sdk/openai": "^1.3.22",
|
|
31
31
|
"@ai-sdk/openai-compatible": "^2.0.28",
|
|
32
32
|
"@libsql/client": "^0.17.0",
|
|
33
|
+
"@vitest/coverage-v8": "^4.0.18",
|
|
33
34
|
"ai": "^6.0.58",
|
|
34
35
|
"dotenv": "^17.2.2",
|
|
35
36
|
"grammy": "^1.39.3",
|
|
37
|
+
"pino": "^10.3.1",
|
|
36
38
|
"tsx": "^4.21.0",
|
|
37
|
-
"
|
|
39
|
+
"turndown": "^7.2.2",
|
|
40
|
+
"unstorage": "^1.17.4",
|
|
41
|
+
"vitest": "^4.0.18",
|
|
42
|
+
"yaml": "^2.8.2"
|
|
38
43
|
},
|
|
39
44
|
"devDependencies": {
|
|
40
45
|
"@types/node": "^25.2.1",
|
|
46
|
+
"@types/turndown": "^5.0.6",
|
|
41
47
|
"typescript": "^5.9.3"
|
|
42
48
|
},
|
|
43
49
|
"scripts": {
|
|
44
|
-
"start": "tsx src/cli.ts"
|
|
50
|
+
"start": "tsx src/cli.ts",
|
|
51
|
+
"test": "vitest run"
|
|
45
52
|
}
|
|
46
53
|
}
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import fs from 'node:fs/promises';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
import { fileURLToPath } from 'node:url';
|
|
5
|
+
import YAML from 'yaml';
|
|
6
|
+
import { generateText } from 'ai';
|
|
7
|
+
import { Agent } from './agent.js';
|
|
8
|
+
import { AgentMemory } from './memory.js';
|
|
9
|
+
import { model } from './llm.js';
|
|
10
|
+
|
|
11
|
+
// Derive this module's file path and directory from its module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Directory (next to this test file) that holds the YAML eval case files.
const evalDir = path.resolve(__dirname, 'evals');
|
|
15
|
+
|
|
16
|
+
/**
 * Converts a `/`-separated path, as written in the eval YAML files, into the
 * `:`-separated key form used for workspace storage access.
 *
 * @param key - Path that uses `/` as its separator.
 * @returns The same path with every `/` replaced by `:`.
 */
function normalizeStorageKey(key: string): string {
  const segments = key.split('/');
  return segments.join(':');
}
|
|
23
|
+
|
|
24
|
+
/**
 * Decides whether a judge reply is a PASS.
 *
 * Only the first line of the reply is inspected, case-insensitively. The
 * verdict is the first alphabetic word on that line, so decoration such as
 * `**PASS**` or `- PASS` is tolerated. A reply that merely echoes the prompt's
 * template line (`PASS or FAIL`) is ambiguous and is treated as a failure.
 *
 * @param reply - Raw text returned by the judge model.
 * @returns `true` only when the first line unambiguously starts with a PASS verdict.
 */
function parseJudgeVerdict(reply: string): boolean {
  const firstLine = reply.trim().split('\n')[0]?.trim().toUpperCase() ?? '';
  const words = firstLine.match(/[A-Z]+/g) ?? [];
  const verdictWord = words[0] ?? '';

  // The prompt shows "PASS or FAIL" as the format; an echo of it is not a verdict.
  const echoedTemplate = words.slice(0, 3).join(' ') === 'PASS OR FAIL';

  return verdictWord.startsWith('PASS') && !echoedTemplate;
}

/**
 * Run an LLM-as-judge evaluation using the shared `model` imported from `./llm.js`.
 *
 * The judge receives the user's input, the agent's output and the evaluation
 * criteria, and is instructed to answer `PASS` or `FAIL` on the first line
 * followed by a short reasoning.
 *
 * @param evalCriteria - Criteria the agent output has to satisfy.
 * @param userInput - The input that was given to the agent.
 * @param agentOutput - The agent's final response.
 * @returns `pass` is true if the judge considers the criteria met; `reasoning`
 *          is the judge's full (trimmed) reply, useful in failure messages.
 */
async function runLlmJudge(
  evalCriteria: string,
  userInput: string,
  agentOutput: string
): Promise<{ pass: boolean; reasoning: string }> {
  const { text } = await generateText({
    model,
    messages: [
      {
        role: 'system',
        content: `You are a strict test evaluator. You will be given:
1. The user's input to an AI agent
2. The agent's output/response
3. Evaluation criteria

Judge whether the agent's output meets ALL the evaluation criteria.

Respond with EXACTLY this format:
PASS or FAIL
Reasoning: <brief explanation>`
      },
      {
        role: 'user',
        content: `## User Input\n${userInput}\n\n## Agent Output\n${agentOutput}\n\n## Evaluation Criteria\n${evalCriteria}`
      }
    ],
    // Low temperature keeps the judge's verdict as repeatable as possible.
    temperature: 0.1,
  });

  const reasoning = text.trim();
  return { pass: parseJudgeVerdict(reasoning), reasoning };
}
|
|
61
|
+
|
|
62
|
+
// Default per-test timeout for LLM-backed evals: two minutes, in milliseconds.
const EVAL_TIMEOUT = 2 * 60 * 1_000;
|
|
64
|
+
|
|
65
|
+
/**
 * Renders a value read back from workspace storage as text for content assertions.
 *
 * Only `null`/`undefined` count as "no content". Other falsy values the storage
 * layer may hand back after auto-parsing (for example `0` or `false`) are real
 * file content and are stringified instead of being collapsed to an empty string.
 */
const storedValueToText = (value: unknown): string => {
  if (value === null || value === undefined) {
    return '';
  }
  // Storage may auto-parse JSON strings into objects; turn those back into text.
  return typeof value === 'object' ? JSON.stringify(value, null, 2) : String(value);
};

/**
 * Runs a single eval case described by a YAML file in the eval directory.
 *
 * Steps:
 *  1. Setup: create an in-memory AgentMemory and seed workspace files
 *     (`setup.files`) and the KV/secrets store (`setup.kv`).
 *  2. Execution: feed `input` to a fresh Agent and capture its final response.
 *  3. Assertions: apply whichever `validate` sections are present
 *     (`response`, `files`, `kv`, `llm_eval`).
 *
 * @param filename - Name of the YAML file inside the eval directory.
 * @throws Any error reported by the agent, or the first failing assertion.
 */
const runTestCaseFile = async (filename: string) => {
  const yamlSource = await fs.readFile(path.join(evalDir, filename), 'utf-8');
  const data = YAML.parse(yamlSource);

  // 1. SETUP
  const memory = await AgentMemory.createInMemory();

  // Seed workspace files
  if (data.setup?.files) {
    for (const [name, fileContent] of Object.entries(data.setup.files)) {
      await memory.workspace.setItem(normalizeStorageKey(name), fileContent as string);
    }
  }

  // Seed KV store
  if (data.setup?.kv) {
    for (const [key, value] of Object.entries(data.setup.kv)) {
      await memory.secrets.set(key, value as string);
    }
  }

  // 2. EXECUTION
  const agent = new Agent(memory, model);
  let output = "";
  const agentErrors: unknown[] = [];

  // Output capture
  agent.addOutput({
    onAgentStart: () => {},
    onResponseChunk: () => {},
    onResponseEnd: (full) => { output = full; },
    // Record the error before rethrowing, so the failure still surfaces below
    // even if the agent swallows exceptions thrown by its output listeners.
    onError: (e) => { agentErrors.push(e); throw e; }
  });

  // NOTE(review): drives the agent through non-public members via `as any`;
  // presumably there is no public single-shot entry point — confirm against Agent.
  (agent as any).inputQueue.push({ text: data.input, label: 'test' });
  await (agent as any).processQueue();

  if (agentErrors.length > 0) {
    throw agentErrors[0];
  }

  // 3. ASSERTIONS
  const loweredOutput = output.toLowerCase();

  // a) Response keywords (ALL must match)
  if (data.validate?.response?.contains) {
    data.validate.response.contains.forEach((keyword: string) => {
      expect(loweredOutput).toContain(keyword.toLowerCase());
    });
  }

  // b) Response keywords (ALL must not match)
  if (data.validate?.response?.must_not_contain) {
    data.validate.response.must_not_contain.forEach((keyword: string) => {
      expect(loweredOutput).not.toContain(keyword.toLowerCase());
    });
  }

  // c) Response keywords (ANY must match — at least one)
  if (data.validate?.response?.contains_any) {
    const matches = data.validate.response.contains_any.some(
      (keyword: string) => loweredOutput.includes(keyword.toLowerCase())
    );
    expect(
      matches,
      `Expected response to contain at least one of: ${data.validate.response.contains_any.join(', ')}`
    ).toBe(true);
  }

  // d) File content check
  if (data.validate?.files) {
    for (const [filepath, rules] of Object.entries(data.validate.files as Record<string, any>)) {
      const storageKey = normalizeStorageKey(filepath);
      const stored = await memory.workspace.getItem(storageKey);
      const textContent = storedValueToText(stored);

      // ALL must be present
      if (rules.contains) {
        rules.contains.forEach((str: string) => {
          expect(textContent, `File "${filepath}" should contain "${str}"`).toContain(str);
        });
      }

      // At least ONE must be present
      if (rules.contains_any) {
        const matches = rules.contains_any.some(
          (str: string) => textContent.includes(str)
        );
        expect(
          matches,
          `File "${filepath}" should contain at least one of: ${rules.contains_any.join(', ')}`
        ).toBe(true);
      }

      // NONE must be present
      if (rules.must_not_contain) {
        rules.must_not_contain.forEach((str: string) => {
          expect(textContent, `File "${filepath}" should NOT contain "${str}"`).not.toContain(str);
        });
      }

      // File must exist (non-empty)
      if (rules.exists === true) {
        expect(textContent.length, `File "${filepath}" should exist and not be empty`).toBeGreaterThan(0);
      }
    }
  }

  // e) KV store assertions
  if (data.validate?.kv) {
    for (const [key, rules] of Object.entries(data.validate.kv as Record<string, any>)) {
      const value = await memory.secrets.get(key);

      if (rules.exists === true) {
        // Treat `undefined` like `null` so a missing key fails whichever
        // "absent" value the store returns.
        expect(value ?? null, `KV key "${key}" should exist`).not.toBeNull();
      }
      if (rules.contains) {
        rules.contains.forEach((str: string) => {
          expect(value ?? '', `KV key "${key}" should contain "${str}"`).toContain(str);
        });
      }
    }
  }

  // f) LLM judge evaluation (uses the same shared model as the agent)
  if (data.validate?.llm_eval) {
    const { pass, reasoning } = await runLlmJudge(
      data.validate.llm_eval,
      data.input,
      output
    );
    expect(pass, `LLM judge failed:\n${reasoning} (eval: ${data.validate.llm_eval}, output: ${output})`).toBe(true);
  }
}
|
|
195
|
+
|
|
196
|
+
/**
 * Registers one test per eval case. Each case name maps to `<name>.yaml`,
 * which is executed through `runTestCaseFile`. Tests use `EVAL_TIMEOUT`
 * unless a longer timeout is listed for them.
 */
describe('Agent Evals (LLM)', () => {
  // Eval case names, in registration order.
  const evalNames = [
    'bootstrap_trigger',
    'connection_auth',
    'create_python_file',
    'directory_traversal',
    'empty_directory',
    'extend_agents_md',
    'external_data',
    'file_not_found',
    'memory_persistence',
    'move_and_rename',
    'needle_in_haystack',
    'persona_tone',
    'rag_user',
    'reasoning_multi_step',
    'refactoring_edit',
    'skill_sandbox_execution',
    'skill_system_installation',
    'soft_delete',
    'stat_check',
    'workflow_cleanup',
    'write_complex_json',
  ];

  // Cases that get more time than the default.
  const timeoutOverrides: Record<string, number> = {
    extend_agents_md: EVAL_TIMEOUT * 2,
  };

  for (const evalName of evalNames) {
    const timeout = timeoutOverrides[evalName] ?? EVAL_TIMEOUT;
    it(evalName, async () => runTestCaseFile(`${evalName}.yaml`), timeout);
  }
});
|