clawlet 0.2.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -53,7 +53,7 @@ $ pnpm start
53
53
  * features
54
54
  - [x] handle session history
55
55
  - [x] read/write files and trash in workspace folder
56
- - [x] git history for workspace folder
56
+ - [ ] git history for workspace folder
57
57
  - [x] <AGENTS.md> support
58
58
  - [x] <SOUL.md> support
59
59
  - [x] users details at USER.md
@@ -70,13 +70,24 @@ $ pnpm start
70
70
  * messaging
71
71
  - [x] chat via command line interface
72
72
  - [x] chat via telegram bot
73
- * make available with (p)npx
73
+ * installation
74
+ - [x] make available with (p)npx
75
+ * qa
76
+ - [x] add evals via vitest
74
77
  * operating system support
75
78
  - [x] runs on macosx
76
79
  - [ ] run on windows / linux
77
80
  - [ ] an *.app for mac
78
81
  - [ ] an .exe for windows
79
82
 
83
+ # Similar projects
84
+
85
+ * Typescript
86
+ * <https://github.com/openclaw/openclaw>
87
+ * GO
88
+ * <https://github.com/sipeed/picoclaw>
89
+ * <https://github.com/HKUDS/nanobot>
90
+
80
91
  # License
81
92
 
82
93
  clawlet is copyright 2026 by DracoBlue and licensed under the MIT License.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clawlet",
3
- "version": "0.2.1",
3
+ "version": "0.4.0",
4
4
  "description": "A lightweight AI based personal assistant.",
5
5
  "main": "src/cli.ts",
6
6
  "type": "module",
@@ -30,17 +30,24 @@
30
30
  "@ai-sdk/openai": "^1.3.22",
31
31
  "@ai-sdk/openai-compatible": "^2.0.28",
32
32
  "@libsql/client": "^0.17.0",
33
+ "@vitest/coverage-v8": "^4.0.18",
33
34
  "ai": "^6.0.58",
34
35
  "dotenv": "^17.2.2",
35
36
  "grammy": "^1.39.3",
37
+ "pino": "^10.3.1",
36
38
  "tsx": "^4.21.0",
37
- "unstorage": "^1.17.4"
39
+ "turndown": "^7.2.2",
40
+ "unstorage": "^1.17.4",
41
+ "vitest": "^4.0.18",
42
+ "yaml": "^2.8.2"
38
43
  },
39
44
  "devDependencies": {
40
45
  "@types/node": "^25.2.1",
46
+ "@types/turndown": "^5.0.6",
41
47
  "typescript": "^5.9.3"
42
48
  },
43
49
  "scripts": {
44
- "start": "tsx src/cli.ts"
50
+ "start": "tsx src/cli.ts",
51
+ "test": "vitest run"
45
52
  }
46
53
  }
@@ -0,0 +1,218 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import fs from 'node:fs/promises';
3
+ import path from 'node:path';
4
+ import { fileURLToPath } from 'node:url';
5
+ import YAML from 'yaml';
6
+ import { generateText } from 'ai';
7
+ import { Agent } from './agent.js';
8
+ import { AgentMemory } from './memory.js';
9
+ import { model } from './llm.js';
10
+
11
+ const __filename = fileURLToPath(import.meta.url);
12
+ const __dirname = path.dirname(__filename);
13
+
14
+ const evalDir = path.join(__dirname, 'evals');
15
+
16
+ /**
17
+ * Unstorage uses `:` as path separator internally.
18
+ * YAML files use `/` for readability. Normalize to `:` for workspace access.
19
+ */
20
+ function normalizeStorageKey(key: string): string {
21
+ return key.replace(/\//g, ':');
22
+ }
23
+
24
+ /**
25
+ * Run an LLM-as-judge evaluation using localModel.
26
+ * Returns true if the judge considers the eval criteria met.
27
+ */
28
+ async function runLlmJudge(
29
+ evalCriteria: string,
30
+ userInput: string,
31
+ agentOutput: string
32
+ ): Promise<{ pass: boolean; reasoning: string }> {
33
+ const { text } = await generateText({
34
+ model,
35
+ messages: [
36
+ {
37
+ role: 'system',
38
+ content: `You are a strict test evaluator. You will be given:
39
+ 1. The user's input to an AI agent
40
+ 2. The agent's output/response
41
+ 3. Evaluation criteria
42
+
43
+ Judge whether the agent's output meets ALL the evaluation criteria.
44
+
45
+ Respond with EXACTLY this format:
46
+ PASS or FAIL
47
+ Reasoning: <brief explanation>`
48
+ },
49
+ {
50
+ role: 'user',
51
+ content: `## User Input\n${userInput}\n\n## Agent Output\n${agentOutput}\n\n## Evaluation Criteria\n${evalCriteria}`
52
+ }
53
+ ],
54
+ temperature: 0.1,
55
+ });
56
+
57
+ const firstLine = text.trim().split('\n')[0]?.trim().toUpperCase() ?? '';
58
+ const pass = firstLine.startsWith('PASS');
59
+ return { pass, reasoning: text.trim() };
60
+ }
61
+
62
+ // Default timeout for LLM-backed eval tests (2 minutes)
63
+ const EVAL_TIMEOUT = 120_000;
64
+
65
+ const runTestCaseFile = async (filename: string) => {
66
+ const content = await fs.readFile(path.join(evalDir, filename), 'utf-8');
67
+ const data = YAML.parse(content);
68
+ // 1. SETUP
69
+ const memory = await AgentMemory.createInMemory();
70
+
71
+ // Seed workspace files
72
+ if (data.setup?.files) {
73
+ for (const [name, content] of Object.entries(data.setup.files)) {
74
+ await memory.workspace.setItem(normalizeStorageKey(name), content as string);
75
+ }
76
+ }
77
+
78
+ // Seed KV store
79
+ if (data.setup?.kv) {
80
+ for (const [key, value] of Object.entries(data.setup.kv)) {
81
+ await memory.secrets.set(key, value as string);
82
+ }
83
+ }
84
+
85
+ // 2. EXECUTION
86
+ const agent = new Agent(memory, model);
87
+ let output = "";
88
+
89
+ // Output capture
90
+ agent.addOutput({
91
+ onAgentStart: () => {},
92
+ onResponseChunk: () => {},
93
+ onResponseEnd: (full) => { output = full; },
94
+ onError: (e) => { throw e; }
95
+ });
96
+
97
+ (agent as any).inputQueue.push({ text: data.input, label: 'test' });
98
+ await (agent as any).processQueue();
99
+
100
+ // 3. ASSERTIONS
101
+
102
+ // a) Response keywords (ALL must match)
103
+ if (data.validate?.response?.contains) {
104
+ data.validate.response.contains.forEach((keyword: string) => {
105
+ expect(output.toLowerCase()).toContain(keyword.toLowerCase());
106
+ });
107
+ }
108
+
109
+ // b) Response keywords (ALL must not match)
110
+ if (data.validate?.response?.must_not_contain) {
111
+ data.validate.response.must_not_contain.forEach((keyword: string) => {
112
+ expect(output.toLowerCase()).not.toContain(keyword.toLowerCase());
113
+ });
114
+ }
115
+
116
+ // c) Response keywords (ANY must match — at least one)
117
+ if (data.validate?.response?.contains_any) {
118
+ const matches = data.validate.response.contains_any.some(
119
+ (keyword: string) => output.toLowerCase().includes(keyword.toLowerCase())
120
+ );
121
+ expect(
122
+ matches,
123
+ `Expected response to contain at least one of: ${data.validate.response.contains_any.join(', ')}`
124
+ ).toBe(true);
125
+ }
126
+
127
+ // d) File content check
128
+ if (data.validate?.files) {
129
+ for (const [filepath, rules] of Object.entries(data.validate.files as Record<string, any>)) {
130
+ const storageKey = normalizeStorageKey(filepath);
131
+ const content = await memory.workspace.getItem(storageKey);
132
+ // Unstorage memory driver may auto-parse JSON strings into objects
133
+ const textContent = content
134
+ ? (typeof content === 'object' ? JSON.stringify(content, null, 2) : String(content))
135
+ : "";
136
+
137
+ // ALL must be present
138
+ if (rules.contains) {
139
+ rules.contains.forEach((str: string) => {
140
+ expect(textContent, `File "${filepath}" should contain "${str}"`).toContain(str);
141
+ });
142
+ }
143
+
144
+ // At least ONE must be present
145
+ if (rules.contains_any) {
146
+ const matches = rules.contains_any.some(
147
+ (str: string) => textContent.includes(str)
148
+ );
149
+ expect(
150
+ matches,
151
+ `File "${filepath}" should contain at least one of: ${rules.contains_any.join(', ')}`
152
+ ).toBe(true);
153
+ }
154
+
155
+ // NONE must be present
156
+ if (rules.must_not_contain) {
157
+ rules.must_not_contain.forEach((str: string) => {
158
+ expect(textContent, `File "${filepath}" should NOT contain "${str}"`).not.toContain(str);
159
+ });
160
+ }
161
+
162
+ // File must exist (non-empty)
163
+ if (rules.exists === true) {
164
+ expect(textContent.length, `File "${filepath}" should exist and not be empty`).toBeGreaterThan(0);
165
+ }
166
+ }
167
+ }
168
+
169
+ // e) KV store assertions
170
+ if (data.validate?.kv) {
171
+ for (const [key, rules] of Object.entries(data.validate.kv as Record<string, any>)) {
172
+ const value = await memory.secrets.get(key);
173
+
174
+ if (rules.exists === true) {
175
+ expect(value, `KV key "${key}" should exist`).not.toBeNull();
176
+ }
177
+ if (rules.contains) {
178
+ rules.contains.forEach((str: string) => {
179
+ expect(value ?? '', `KV key "${key}" should contain "${str}"`).toContain(str);
180
+ });
181
+ }
182
+ }
183
+ }
184
+
185
+ // f) LLM judge evaluation using localModel
186
+ if (data.validate?.llm_eval) {
187
+ const { pass, reasoning } = await runLlmJudge(
188
+ data.validate.llm_eval,
189
+ data.input,
190
+ output
191
+ );
192
+ expect(pass, `LLM judge failed:\n${reasoning} (eval: ${data.validate.llm_eval}, output: ${output})`).toBe(true);
193
+ }
194
+ }
195
+
196
+ describe('Agent Evals (LLM)', () => {
197
+ it(`bootstrap_trigger`, async () => runTestCaseFile('bootstrap_trigger.yaml'), EVAL_TIMEOUT);
198
+ it(`connection_auth`, async () => runTestCaseFile('connection_auth.yaml'), EVAL_TIMEOUT);
199
+ it(`create_python_file`, async () => runTestCaseFile('create_python_file.yaml'), EVAL_TIMEOUT);
200
+ it(`directory_traversal`, async () => runTestCaseFile('directory_traversal.yaml'), EVAL_TIMEOUT);
201
+ it(`empty_directory`, async () => runTestCaseFile('empty_directory.yaml'), EVAL_TIMEOUT);
202
+ it(`extend_agents_md`, async () => runTestCaseFile('extend_agents_md.yaml'), EVAL_TIMEOUT * 2);
203
+ it(`external_data`, async () => runTestCaseFile('external_data.yaml'), EVAL_TIMEOUT);
204
+ it(`file_not_found`, async () => runTestCaseFile('file_not_found.yaml'), EVAL_TIMEOUT);
205
+ it(`memory_persistence`, async () => runTestCaseFile('memory_persistence.yaml'), EVAL_TIMEOUT);
206
+ it(`move_and_rename`, async () => runTestCaseFile('move_and_rename.yaml'), EVAL_TIMEOUT);
207
+ it(`needle_in_haystack`, async () => runTestCaseFile('needle_in_haystack.yaml'), EVAL_TIMEOUT);
208
+ it(`persona_tone`, async () => runTestCaseFile('persona_tone.yaml'), EVAL_TIMEOUT);
209
+ it(`rag_user`, async () => runTestCaseFile('rag_user.yaml'), EVAL_TIMEOUT);
210
+ it(`reasoning_multi_step`, async () => runTestCaseFile('reasoning_multi_step.yaml'), EVAL_TIMEOUT);
211
+ it(`refactoring_edit`, async () => runTestCaseFile('refactoring_edit.yaml'), EVAL_TIMEOUT);
212
+ it(`skill_sandbox_execution`, async () => runTestCaseFile('skill_sandbox_execution.yaml'), EVAL_TIMEOUT);
213
+ it(`skill_system_installation`, async () => runTestCaseFile('skill_system_installation.yaml'), EVAL_TIMEOUT);
214
+ it(`soft_delete`, async () => runTestCaseFile('soft_delete.yaml'), EVAL_TIMEOUT);
215
+ it(`stat_check`, async () => runTestCaseFile('stat_check.yaml'), EVAL_TIMEOUT);
216
+ it(`workflow_cleanup`, async () => runTestCaseFile('workflow_cleanup.yaml'), EVAL_TIMEOUT);
217
+ it(`write_complex_json`, async () => runTestCaseFile('write_complex_json.yaml'), EVAL_TIMEOUT);
218
+ });