clawlet 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agent.eval.test.ts +1 -0
- package/src/agent.ts +0 -6
- package/src/evals/rewrite_agents_md.yaml +161 -0
- package/src/llm.ts +26 -22
- package/src/logger.ts +22 -1
- package/src/storage.ts +23 -57
- package/src/tools.ts +0 -6
package/package.json
CHANGED
package/src/agent.eval.test.ts
CHANGED
|
@@ -217,6 +217,7 @@ describe('Agent Evals (LLM)', () => {
|
|
|
217
217
|
it(`reasoning_multi_step`, async () => runTestCaseFile('reasoning_multi_step.yaml'), EVAL_TIMEOUT);
|
|
218
218
|
it(`refactoring_edit`, async () => runTestCaseFile('refactoring_edit.yaml'), EVAL_TIMEOUT);
|
|
219
219
|
it(`skill_sandbox_execution`, async () => runTestCaseFile('skill_sandbox_execution.yaml'), EVAL_TIMEOUT);
|
|
220
|
+
it(`rewrite_agents_md`, async () => runTestCaseFile('rewrite_agents_md.yaml'), EVAL_TIMEOUT);
|
|
220
221
|
it(`skill_system_installation`, async () => runTestCaseFile('skill_system_installation.yaml'), EVAL_TIMEOUT);
|
|
221
222
|
it(`soft_delete`, async () => runTestCaseFile('soft_delete.yaml'), EVAL_TIMEOUT);
|
|
222
223
|
it(`stat_check`, async () => runTestCaseFile('stat_check.yaml'), EVAL_TIMEOUT);
|
package/src/agent.ts
CHANGED
|
@@ -16,9 +16,6 @@ import { createTools } from './tools.js';
|
|
|
16
16
|
const __filename = fileURLToPath(import.meta.url);
|
|
17
17
|
const __dirname = path.dirname(__filename);
|
|
18
18
|
const PACKAGE_ROOT = path.resolve(__dirname, '..');
|
|
19
|
-
const GENERATE_TEXT_TEMPERATURE = 0.6;
|
|
20
|
-
const GENERATE_TEXT_TOP_P = 0.95;
|
|
21
|
-
const GENERATE_TEXT_MAX_OUTPUT_TOKENS = 16384;
|
|
22
19
|
const GENERATE_TEXT_MAX_STEPS = 30;
|
|
23
20
|
|
|
24
21
|
// --- ADAPTER INTERFACES ---
|
|
@@ -138,9 +135,6 @@ async function runAgent(
|
|
|
138
135
|
system: await buildSystemPrompt(memory),
|
|
139
136
|
messages,
|
|
140
137
|
tools,
|
|
141
|
-
temperature: GENERATE_TEXT_TEMPERATURE,
|
|
142
|
-
topP: GENERATE_TEXT_TOP_P,
|
|
143
|
-
maxOutputTokens: GENERATE_TEXT_MAX_OUTPUT_TOKENS,
|
|
144
138
|
stopWhen: stepCountIs(GENERATE_TEXT_MAX_STEPS),
|
|
145
139
|
|
|
146
140
|
onStepFinish: (step) => {
|
|
package/src/evals/rewrite_agents_md.yaml
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
name: "Extend AGENTS.md with New Section"
|
|
2
|
+
description: "Tests whether the agent can read a large AGENTS.md, append a new section, and preserve the existing content. Stresses the model's ability to handle long text read+write cycles."
|
|
3
|
+
|
|
4
|
+
timeout: 240000
|
|
5
|
+
|
|
6
|
+
setup:
|
|
7
|
+
files:
|
|
8
|
+
SOUL.md: |
|
|
9
|
+
# SOUL
|
|
10
|
+
I amlike what I do.
|
|
11
|
+
USER.md: |
|
|
12
|
+
# USER
|
|
13
|
+
name: Mr. X.
|
|
14
|
+
IDENTITY: |
|
|
15
|
+
# IDENTITY
|
|
16
|
+
name: Bob
|
|
17
|
+
AGENTS.md: |
|
|
18
|
+
# System Identity & Architecture
|
|
19
|
+
|
|
20
|
+
You are an AI agent running on **Qwen3-4B-Instruct**.
|
|
21
|
+
- **Environment:** `mlx_lm.server` (local Apple Silicon execution).
|
|
22
|
+
- **Strengths:** Speed, code generation, logical instruction following.
|
|
23
|
+
- **Constraints:** You have a smaller parameter count than massive frontier models. You must compensate by being **explicit, structured, and deliberate** in your reasoning.
|
|
24
|
+
|
|
25
|
+
# Every Session
|
|
26
|
+
|
|
27
|
+
Before doing anything else:
|
|
28
|
+
1. Read `SOUL.md` — Who you are.
|
|
29
|
+
2. Read `USER.md` — Who you're helping.
|
|
30
|
+
3. Read `memory/YYYY-MM-DD.md` (today + yesterday) — Recent context.
|
|
31
|
+
4. **If in MAIN SESSION:** Read `MEMORY.md`.
|
|
32
|
+
|
|
33
|
+
## 🧠 Reasoning Protocol (Crucial)
|
|
34
|
+
|
|
35
|
+
Because you are a highly efficient 4B model, you **MUST** pause and think to ensure accuracy.
|
|
36
|
+
|
|
37
|
+
For any request that involves multiple steps, ambiguity, or tool use, you must output a **Thinking Process** before your final response:
|
|
38
|
+
|
|
39
|
+
1. **Analyze:** What is the user actually asking?
|
|
40
|
+
2. **Plan:** What steps/tools are needed?
|
|
41
|
+
3. **Execute:** Generate the response or tool call.
|
|
42
|
+
|
|
43
|
+
*Example:*
|
|
44
|
+
> **Thinking Process:**
|
|
45
|
+
> User wants to search for colors. I need to check if the 'tavily' skill is installed. It is. I will construct the skill.prompt command.
|
|
46
|
+
|
|
47
|
+
## Memory Management
|
|
48
|
+
|
|
49
|
+
You wake up fresh each session. Files are your only continuity.
|
|
50
|
+
|
|
51
|
+
- **Daily logs:** `memory/YYYY-MM-DD.md` (Raw logs of events/actions).
|
|
52
|
+
- **Long-term:** `MEMORY.md` (Curated insights, User preferences, Major decisions).
|
|
53
|
+
|
|
54
|
+
### 📝 Write It Down or It Didn't Happen
|
|
55
|
+
**Memory is limited.** "Mental notes" die when the session ends.
|
|
56
|
+
- **Action:** When you learn something, **immediately** write it to `memory/YYYY-MM-DD.md` or `MEMORY.md` using `fs.writeFile`.
|
|
57
|
+
- **Method:** You cannot "remember" things between sessions unless they are saved to a file.
|
|
58
|
+
|
|
59
|
+
### 🚨 Error Transparency Protocol
|
|
60
|
+
If an action fails:
|
|
61
|
+
1. **Log it:** Write the error to the daily memory file.
|
|
62
|
+
2. **Include:** Exact error message, action attempted, and the fix you tried.
|
|
63
|
+
3. **No Hallucinations:** Do not invent successful outcomes. If it failed, say it failed.
|
|
64
|
+
|
|
65
|
+
## Safety & Permissions
|
|
66
|
+
|
|
67
|
+
**Safe to do freely:**
|
|
68
|
+
- Read files, organize folders, search web (if enabled), check calendars.
|
|
69
|
+
- Internal workspace operations.
|
|
70
|
+
|
|
71
|
+
**Ask first:**
|
|
72
|
+
- sending emails, tweets, or public posts.
|
|
73
|
+
- Destructive commands (always use `trash` over `rm`).
|
|
74
|
+
|
|
75
|
+
## Group Chat Behavior
|
|
76
|
+
|
|
77
|
+
**Role:** Participant, not a proxy.
|
|
78
|
+
**Rule:** Quality > Quantity.
|
|
79
|
+
|
|
80
|
+
**When to Speak:**
|
|
81
|
+
- Directly mentioned.
|
|
82
|
+
- You can fix a factual error or provide a specific answer.
|
|
83
|
+
|
|
84
|
+
**When to Stay Silent (`HEARTBEAT_OK`):**
|
|
85
|
+
- Casual banter.
|
|
86
|
+
- Question already answered.
|
|
87
|
+
- Your reply would just be "lol" or "agree".
|
|
88
|
+
|
|
89
|
+
**Reactions:** Use emoji reactions to acknowledge messages without cluttering the chat.
|
|
90
|
+
|
|
91
|
+
## Heartbeats
|
|
92
|
+
|
|
93
|
+
When receiving a heartbeat prompt:
|
|
94
|
+
1. **Read:** Check `HEARTBEAT.md` (if exists).
|
|
95
|
+
2. **Evaluate:** Do I *actually* need to do something? (Check email, calendar, etc.)
|
|
96
|
+
3. **Action:**
|
|
97
|
+
* **If Yes:** Perform the task.
|
|
98
|
+
* **If No:** Reply exactly: `HEARTBEAT_OK` (Do not add extra text).
|
|
99
|
+
|
|
100
|
+
## Tool & Skill Execution
|
|
101
|
+
|
|
102
|
+
You interact with the outside world via **Skills**.
|
|
103
|
+
|
|
104
|
+
### Execution Syntax
|
|
105
|
+
Use `skill.prompt` to invoke a skill.
|
|
106
|
+
|
|
107
|
+
**Format:**
|
|
108
|
+
`skill.prompt <skill_name> "<prompt_for_skill>"`
|
|
109
|
+
|
|
110
|
+
### Installation
|
|
111
|
+
Use `skills.install <name> "<url>"` to add new capabilities.
|
|
112
|
+
|
|
113
|
+
## File Operations
|
|
114
|
+
|
|
115
|
+
**1. File Writing Protocol:**
|
|
116
|
+
You must use `fs.writeFile` to persist **ALL** critical updates.
|
|
117
|
+
- Updating user preferences? -> `fs.writeFile` to `USER.md`.
|
|
118
|
+
- Logging an event? -> `fs.writeFile` to `memory/YYYY-MM-DD.md`.
|
|
119
|
+
- **Never** assume stating "I have updated the memory" is enough. You must execute the write.
|
|
120
|
+
|
|
121
|
+
**2. Message History Persistence:**
|
|
122
|
+
- Message history is **not** stored in RAM.
|
|
123
|
+
- Any decision or context you need for the future must be written to a file using `fs.writeFile`.
|
|
124
|
+
|
|
125
|
+
## Security
|
|
126
|
+
- **Moltbook API Key:** Access by using `connection.request({ name: "moltbook", "url": "..." })`.
|
|
127
|
+
- **Secrets:** Never print API keys in plain text logs.
|
|
128
|
+
|
|
129
|
+
## Make It Yours
|
|
130
|
+
Refine this `AGENTS.md` as you learn. If a rule isn't working for your specific model version, change it by using `fs.writeFile`.
|
|
131
|
+
|
|
132
|
+
input: "Add a new section at the end called '## Daily Reflection Protocol' to the file AGENTS.md (use the tool fs.writeFile). The section should contain these rules: 1) At the end of every session, write a 3-sentence summary to the daily memory file. 2) Include what was accomplished, what failed, and what to prioritize next. 3) Tag entries with #reflection for easy searching. Make sure you preserve ALL existing content in AGENTS.md when writing the updated version."
|
|
133
|
+
|
|
134
|
+
validate:
|
|
135
|
+
files:
|
|
136
|
+
AGENTS.md:
|
|
137
|
+
contains:
|
|
138
|
+
- "System Identity"
|
|
139
|
+
- "Every Session"
|
|
140
|
+
- "Reasoning Protocol"
|
|
141
|
+
- "Memory Management"
|
|
142
|
+
- "Safety & Permissions"
|
|
143
|
+
- "Daily Reflection Protocol"
|
|
144
|
+
- "#reflection"
|
|
145
|
+
contains_any:
|
|
146
|
+
- "3-sentence"
|
|
147
|
+
- "three-sentence"
|
|
148
|
+
- "summary"
|
|
149
|
+
- "reflection"
|
|
150
|
+
must_not_contain:
|
|
151
|
+
- "[object Object]"
|
|
152
|
+
response:
|
|
153
|
+
must_not_contain:
|
|
154
|
+
- "<tool_call>"
|
|
155
|
+
contains_any:
|
|
156
|
+
- "AGENTS.md"
|
|
157
|
+
- "added"
|
|
158
|
+
- "updated"
|
|
159
|
+
- "section"
|
|
160
|
+
- "reflection"
|
|
161
|
+
- "Daily Reflection"
|
package/src/llm.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import { hermesToolMiddleware, xmlToolMiddleware, yamlToolMiddleware } from "@ai-sdk-tool/parser";
|
|
2
2
|
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
3
|
-
import { addToolInputExamplesMiddleware, extractReasoningMiddleware, wrapLanguageModel, type LanguageModel, gateway } from "ai";
|
|
3
|
+
import { addToolInputExamplesMiddleware, extractReasoningMiddleware, wrapLanguageModel, type LanguageModel, gateway, defaultSettingsMiddleware, type LanguageModelMiddleware } from "ai";
|
|
4
4
|
|
|
5
5
|
const OPENAI_COMPATIBLE_MODEL_ID = process.env.OPENAI_COMPATIBLE_MODEL_ID ?? 'qwen-local';
|
|
6
6
|
const OPENAI_COMPATIBLE_BASE_URL = process.env.OPENAI_COMPATIBLE_BASE_URL ?? 'http://localhost:8000/v1';
|
|
7
|
-
const AI_GATEWAY_USE_QWEN_MIDDLEWARE = process.env.AI_GATEWAY_USE_QWEN_MIDDLEWARE ?? '';
|
|
7
|
+
const AI_GATEWAY_USE_QWEN_MIDDLEWARE = process.env.AI_GATEWAY_USE_QWEN_MIDDLEWARE ?? (!process.env.AI_GATEWAY_MODEL_ID ? '1' : '');
|
|
8
8
|
|
|
9
9
|
// --- MODEL SETUP ---
|
|
10
10
|
const localProvider = createOpenAICompatible({
|
|
@@ -12,24 +12,28 @@ const localProvider = createOpenAICompatible({
|
|
|
12
12
|
baseURL: OPENAI_COMPATIBLE_BASE_URL,
|
|
13
13
|
});
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
})
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
15
|
+
const unwrappedModel : LanguageModel = process.env.AI_GATEWAY_MODEL_ID ? gateway.languageModel(process.env.AI_GATEWAY_MODEL_ID) : localProvider.languageModel(OPENAI_COMPATIBLE_MODEL_ID);
|
|
16
|
+
|
|
17
|
+
const middleware : LanguageModelMiddleware[] = [
|
|
18
|
+
defaultSettingsMiddleware({
|
|
19
|
+
settings: {
|
|
20
|
+
// tool calls: temperature: 0.0, maxOutputTokens: 2048
|
|
21
|
+
// normal chat: topP: 0.8, maxOutputTokens: 2048
|
|
22
|
+
// no tools:
|
|
23
|
+
topP: 0.9, maxOutputTokens: 8192
|
|
24
|
+
},
|
|
25
|
+
})
|
|
26
|
+
];
|
|
27
|
+
|
|
28
|
+
if (AI_GATEWAY_USE_QWEN_MIDDLEWARE) {
|
|
29
|
+
middleware.push(hermesToolMiddleware);
|
|
30
|
+
middleware.push(addToolInputExamplesMiddleware({ prefix: 'Input Examples:', }));
|
|
31
|
+
middleware.push(extractReasoningMiddleware({
|
|
32
|
+
tagName: "think"
|
|
33
|
+
}));
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
export const model : LanguageModel = wrapLanguageModel({
|
|
37
|
+
model: unwrappedModel,
|
|
38
|
+
middleware
|
|
35
39
|
});
|
package/src/logger.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
|
+
import 'dotenv/config';
|
|
1
2
|
import pino from "pino";
|
|
2
3
|
|
|
4
|
+
const isTest = process.env.NODE_ENV === "test";
|
|
3
5
|
const isProd = process.env.NODE_ENV === "production";
|
|
4
6
|
const logFile = process.env.LOG_FILE_PATH ?? `${process.cwd()}/logs/clawlet.jsonl`;
|
|
5
7
|
|
|
@@ -18,7 +20,26 @@ const transport = pino.transport({
|
|
|
18
20
|
]
|
|
19
21
|
});
|
|
20
22
|
|
|
21
|
-
export const logger = pino(
|
|
23
|
+
export const logger = isTest ? pino(
|
|
24
|
+
{
|
|
25
|
+
base: {
|
|
26
|
+
service: process.env.SERVICE_NAME ?? "clawlet",
|
|
27
|
+
env: process.env.NODE_ENV ?? "development",
|
|
28
|
+
version: process.env.APP_VERSION,
|
|
29
|
+
},
|
|
30
|
+
timestamp: () => `,"ts":"${new Date().toISOString()}"`,
|
|
31
|
+
formatters: {
|
|
32
|
+
level(label, number) {
|
|
33
|
+
return { level: number, level_label: label };
|
|
34
|
+
},
|
|
35
|
+
},
|
|
36
|
+
level: 'debug',
|
|
37
|
+
},
|
|
38
|
+
pino.destination({
|
|
39
|
+
dest: 1,
|
|
40
|
+
sync: true
|
|
41
|
+
})
|
|
42
|
+
) : pino({
|
|
22
43
|
level: process.env.LOG_LEVEL ?? (isProd ? "info" : "debug"),
|
|
23
44
|
base: {
|
|
24
45
|
service: process.env.SERVICE_NAME ?? "clawlet",
|
package/src/storage.ts
CHANGED
|
@@ -104,23 +104,8 @@ export class LibSqlListStorage<T = any> {
|
|
|
104
104
|
}
|
|
105
105
|
|
|
106
106
|
async replaceAll(name: string, items: T[]): Promise<void> {
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
await tx.execute({
|
|
110
|
-
sql: `DELETE FROM ${this.tableName} WHERE name = ?`,
|
|
111
|
-
args: [name],
|
|
112
|
-
});
|
|
113
|
-
for (const item of items) {
|
|
114
|
-
await tx.execute({
|
|
115
|
-
sql: `INSERT INTO ${this.tableName} (name, item) VALUES (?, ?)`,
|
|
116
|
-
args: [name, JSON.stringify(item)]
|
|
117
|
-
});
|
|
118
|
-
}
|
|
119
|
-
await tx.commit();
|
|
120
|
-
} catch (e) {
|
|
121
|
-
await tx.rollback();
|
|
122
|
-
throw e;
|
|
123
|
-
}
|
|
107
|
+
this.clear(name);
|
|
108
|
+
this.pushMany(name, items);
|
|
124
109
|
}
|
|
125
110
|
|
|
126
111
|
async getAll(name: string): Promise<T[]> {
|
|
@@ -181,19 +166,9 @@ export class LibSqlFiFoStorage<T> {
|
|
|
181
166
|
}
|
|
182
167
|
|
|
183
168
|
public async pushMany(queue: string, items: T[]): Promise<void> {
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
await tx.execute({
|
|
188
|
-
sql: `INSERT INTO ${this.tableName} (queue_name, value) VALUES (?, ?)`,
|
|
189
|
-
args: [queue, JSON.stringify(item)],
|
|
190
|
-
});
|
|
191
|
-
}
|
|
192
|
-
await tx.commit();
|
|
193
|
-
} catch (e) {
|
|
194
|
-
await tx.rollback();
|
|
195
|
-
throw e;
|
|
196
|
-
}
|
|
169
|
+
for (const item of items) {
|
|
170
|
+
this.push(queue, item);
|
|
171
|
+
}
|
|
197
172
|
}
|
|
198
173
|
|
|
199
174
|
public async empty(queue: string): Promise<boolean> {
|
|
@@ -201,33 +176,24 @@ export class LibSqlFiFoStorage<T> {
|
|
|
201
176
|
}
|
|
202
177
|
|
|
203
178
|
public async pop(queue: string): Promise<T | null> {
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
});
|
|
223
|
-
|
|
224
|
-
await tx.commit();
|
|
225
|
-
|
|
226
|
-
return JSON.parse(row.value as string) as T;
|
|
227
|
-
} catch (e) {
|
|
228
|
-
await tx.rollback();
|
|
229
|
-
throw e;
|
|
230
|
-
}
|
|
179
|
+
const rs = await this.client.execute({
|
|
180
|
+
sql: `SELECT id, value FROM ${this.tableName} WHERE queue_name = ? ORDER BY id ASC LIMIT 1`,
|
|
181
|
+
args: [queue],
|
|
182
|
+
});
|
|
183
|
+
|
|
184
|
+
if (rs.rows.length === 0) {
|
|
185
|
+
return null;
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
const row = rs.rows[0] as any;
|
|
189
|
+
const id = row.id;
|
|
190
|
+
|
|
191
|
+
await this.client.execute({
|
|
192
|
+
sql: `DELETE FROM ${this.tableName} WHERE id = ?`,
|
|
193
|
+
args: [id!],
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
return JSON.parse(row.value as string) as T;
|
|
231
197
|
}
|
|
232
198
|
|
|
233
199
|
public async count(queue: string): Promise<number> {
|
package/src/tools.ts
CHANGED
|
@@ -14,9 +14,6 @@ import TurndownService from 'turndown';
|
|
|
14
14
|
import { logger } from './logger.js';
|
|
15
15
|
|
|
16
16
|
// Resolve the package root directory (where template/ lives), independent of cwd
|
|
17
|
-
const GENERATE_TEXT_TEMPERATURE = 0.6;
|
|
18
|
-
const GENERATE_TEXT_TOP_P = 0.95;
|
|
19
|
-
const GENERATE_TEXT_MAX_OUTPUT_TOKENS = 16384;
|
|
20
17
|
const GENERATE_TEXT_MAX_STEPS = 30;
|
|
21
18
|
|
|
22
19
|
const turndownService = new TurndownService()
|
|
@@ -972,9 +969,6 @@ Return ONLY a JSON object mapping tool names to arrays of permission rules. Exam
|
|
|
972
969
|
system: await buildSkillSystemPrompt(name, memory, skillPermissions),
|
|
973
970
|
messages,
|
|
974
971
|
tools: Object.keys(sandboxed).length > 0 ? sandboxed : {},
|
|
975
|
-
temperature: GENERATE_TEXT_TEMPERATURE,
|
|
976
|
-
topP: GENERATE_TEXT_TOP_P,
|
|
977
|
-
maxOutputTokens: GENERATE_TEXT_MAX_OUTPUT_TOKENS,
|
|
978
972
|
stopWhen: stepCountIs(GENERATE_TEXT_MAX_STEPS),
|
|
979
973
|
onStepFinish: (step) => {
|
|
980
974
|
if (step.toolCalls.length > 0) {
|