clawmem 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -332,6 +332,7 @@ If your GPU lives on a separate machine, point the env vars at it:
332
332
  ```bash
333
333
  export CLAWMEM_EMBED_URL=http://gpu-host:8088
334
334
  export CLAWMEM_LLM_URL=http://gpu-host:8089
335
+ export CLAWMEM_LLM_MODEL=qwen3
335
336
  export CLAWMEM_RERANK_URL=http://gpu-host:8090
336
337
  ```
337
338
 
@@ -944,6 +945,9 @@ Notes referenced by the agent during a session get boosted (`access_count++`). U
944
945
  | `CLAWMEM_EMBED_TPM_LIMIT` | `100000` | Tokens-per-minute limit for cloud embedding pacing. Match to your provider tier. |
945
946
  | `CLAWMEM_EMBED_DIMENSIONS` | (none) | Output dimensions for OpenAI `text-embedding-3-*` Matryoshka models (e.g. `512`, `1024`). |
946
947
  | `CLAWMEM_LLM_URL` | `http://localhost:8089` | LLM server URL for intent/query/A-MEM. Without it, falls back to `node-llama-cpp` (if allowed). |
948
+ | `CLAWMEM_LLM_MODEL` | `qwen3` | Model name sent to the configured LLM endpoint. Override this when targeting an OpenAI-compatible proxy (e.g. set it to `gpt-5.4-mini`). |
949
+ | `CLAWMEM_LLM_REASONING_EFFORT` | (none) | Optional top-level `reasoning_effort` field for Chat Completions endpoints that support it (for example OpenAI reasoning models). Leave unset for llama-server/vLLM unless your serving stack explicitly accepts that field. |
950
+ | `CLAWMEM_LLM_NO_THINK` | `true` | Append `/no_think` to remote LLM prompts. Set to `false` for standard OpenAI models and other endpoints that reject the Qwen-style suffix or treat it as literal prompt text. |
947
951
  | `CLAWMEM_RERANK_URL` | `http://localhost:8090` | Reranker server URL. Without it, falls back to `node-llama-cpp` (if allowed). |
948
952
  | `CLAWMEM_NO_LOCAL_MODELS` | `false` | Block `node-llama-cpp` from auto-downloading GGUF models. Set `true` for remote-only setups where you want fail-fast on unreachable endpoints. |
949
953
  | `CLAWMEM_MERGE_SCORE_NORMAL` | `0.93` | **v0.7.1.** Phase 2 consolidation merge-safety threshold when candidate and existing anchors align. Merges above this normalized 3-gram cosine score are allowed. |
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clawmem",
3
- "version": "0.10.1",
3
+ "version": "0.10.2",
4
4
  "description": "On-device memory layer for AI agents. Claude Code, OpenClaw, and Hermes. Hooks + MCP server + hybrid RAG search.",
5
5
  "type": "module",
6
6
  "bin": {
package/src/clawmem.ts CHANGED
@@ -1491,6 +1491,7 @@ async function cmdSetupOpenClaw(args: string[]) {
1491
1491
  console.log(` 3. Configure GPU endpoints (if not using defaults):`);
1492
1492
  console.log(` ${c.cyan}openclaw config set plugins.entries.clawmem.config.gpuEmbed http://YOUR_GPU:8088${c.reset}`);
1493
1493
  console.log(` ${c.cyan}openclaw config set plugins.entries.clawmem.config.gpuLlm http://YOUR_GPU:8089${c.reset}`);
1494
+ console.log(` ${c.cyan}openclaw config set plugins.entries.clawmem.config.gpuLlmModel qwen3${c.reset}`);
1494
1495
  console.log(` ${c.cyan}openclaw config set plugins.entries.clawmem.config.gpuRerank http://YOUR_GPU:8090${c.reset}`);
1495
1496
  console.log();
1496
1497
  console.log(` 4. Start the REST API (for agent tools):`);
@@ -15,6 +15,9 @@ Config via environment variables:
15
15
  CLAWMEM_PROFILE — Retrieval profile: speed, balanced, deep (default: balanced)
16
16
  CLAWMEM_EMBED_URL — GPU embedding server URL (optional)
17
17
  CLAWMEM_LLM_URL — GPU LLM server URL (optional)
18
+ CLAWMEM_LLM_MODEL — Model name sent to the GPU/cloud LLM endpoint (optional)
19
+ CLAWMEM_LLM_REASONING_EFFORT — Top-level reasoning_effort for supporting Chat Completions endpoints (optional)
20
+ CLAWMEM_LLM_NO_THINK — Append /no_think to remote prompts; false disables it for standard OpenAI models (optional)
18
21
  CLAWMEM_RERANK_URL — GPU reranker server URL (optional)
19
22
 
20
23
  Agent-context isolation:
@@ -295,6 +298,24 @@ class ClawMemProvider(MemoryProvider):
295
298
  "secret": False,
296
299
  "env_var": "CLAWMEM_LLM_URL",
297
300
  },
301
+ {
302
+ "key": "llm_model",
303
+ "description": "Model name sent to the GPU LLM server (e.g., qwen3, gpt-5.4-mini)",
304
+ "secret": False,
305
+ "env_var": "CLAWMEM_LLM_MODEL",
306
+ },
307
+ {
308
+ "key": "llm_reasoning_effort",
309
+ "description": "Optional top-level reasoning_effort for Chat Completions endpoints that support it",
310
+ "secret": False,
311
+ "env_var": "CLAWMEM_LLM_REASONING_EFFORT",
312
+ },
313
+ {
314
+ "key": "llm_no_think",
315
+ "description": "Append /no_think to remote LLM prompts; disable for standard OpenAI models",
316
+ "secret": False,
317
+ "env_var": "CLAWMEM_LLM_NO_THINK",
318
+ },
298
319
  ]
299
320
 
300
321
  # -- Core lifecycle --------------------------------------------------------
@@ -324,7 +345,15 @@ class ClawMemProvider(MemoryProvider):
324
345
  )
325
346
 
326
347
  # Build env for hook shell-outs (GPU endpoints, profile)
327
- for var in ("CLAWMEM_EMBED_URL", "CLAWMEM_LLM_URL", "CLAWMEM_RERANK_URL", "CLAWMEM_PROFILE"):
348
+ for var in (
349
+ "CLAWMEM_EMBED_URL",
350
+ "CLAWMEM_LLM_URL",
351
+ "CLAWMEM_LLM_MODEL",
352
+ "CLAWMEM_LLM_REASONING_EFFORT",
353
+ "CLAWMEM_LLM_NO_THINK",
354
+ "CLAWMEM_RERANK_URL",
355
+ "CLAWMEM_PROFILE",
356
+ ):
328
357
  val = os.environ.get(var)
329
358
  if val:
330
359
  self._env_extra[var] = val
package/src/llm.ts CHANGED
@@ -237,6 +237,23 @@ export type LlamaCppConfig = {
237
237
  * When set, generate() calls /v1/chat/completions instead of local node-llama-cpp.
238
238
  */
239
239
  remoteLlmUrl?: string;
240
+ /**
241
+ * Remote LLM model name to send with chat completion requests.
242
+ * Env: CLAWMEM_LLM_MODEL
243
+ */
244
+ remoteLlmModel?: string;
245
+ /**
246
+ * Optional top-level reasoning_effort field for Chat Completions endpoints that support it.
247
+ * Example values: none, minimal, low, medium, high, xhigh.
248
+ * Env: CLAWMEM_LLM_REASONING_EFFORT
249
+ */
250
+ remoteLlmReasoningEffort?: string;
251
+ /**
252
+ * Whether to append /no_think to remote LLM prompts.
253
+ * Defaults to true to preserve current behavior with Qwen3-compatible endpoints.
254
+ * Env: CLAWMEM_LLM_NO_THINK
255
+ */
256
+ remoteLlmNoThink?: boolean;
240
257
  /**
241
258
  * Inactivity timeout in ms before unloading contexts (default: 2 minutes, 0 to disable).
242
259
  *
@@ -259,6 +276,23 @@ export type LlamaCppConfig = {
259
276
  */
260
277
  // Default inactivity timeout: 2 minutes
261
278
  const DEFAULT_INACTIVITY_TIMEOUT_MS = 2 * 60 * 1000;
279
+ const ALLOWED_REMOTE_LLM_REASONING_EFFORTS = new Set(["none", "minimal", "low", "medium", "high", "xhigh"]);
280
+
281
+ function normalizeRemoteLlmReasoningEffort(value?: string): string | null {
282
+ const raw = (value || "").trim().toLowerCase();
283
+ if (!raw) return null;
284
+ if (!ALLOWED_REMOTE_LLM_REASONING_EFFORTS.has(raw)) {
285
+ console.warn(`[clawmem] Ignoring unsupported remoteLlmReasoningEffort=${raw}`);
286
+ return null;
287
+ }
288
+ return raw;
289
+ }
290
+
291
+ function buildRemoteChatCompletionsUrl(remoteLlmUrl: string): string {
292
+ const baseUrl = remoteLlmUrl.replace(/\/+$/, "");
293
+ const endpoint = baseUrl.endsWith("/v1") ? "/chat/completions" : "/v1/chat/completions";
294
+ return `${baseUrl}${endpoint}`;
295
+ }
262
296
 
263
297
  export class LlamaCpp implements LLM {
264
298
  private llama: Llama | null = null;
@@ -276,6 +310,9 @@ export class LlamaCpp implements LLM {
276
310
  private remoteEmbedApiKey: string | null;
277
311
  private remoteEmbedModel: string;
278
312
  private remoteLlmUrl: string | null;
313
+ private remoteLlmModel: string;
314
+ private remoteLlmReasoningEffort: string | null;
315
+ private remoteLlmNoThink: boolean;
279
316
 
280
317
  // Ensure we don't load the same model concurrently (which can allocate duplicate VRAM).
281
318
  private embedModelLoadPromise: Promise<LlamaModel> | null = null;
@@ -306,6 +343,10 @@ export class LlamaCpp implements LLM {
306
343
  this.remoteEmbedApiKey = config.remoteEmbedApiKey || null;
307
344
  this.remoteEmbedModel = config.remoteEmbedModel || "embedding";
308
345
  this.remoteLlmUrl = config.remoteLlmUrl || null;
346
+ const normalizedRemoteLlmModel = config.remoteLlmModel?.trim();
347
+ this.remoteLlmModel = normalizedRemoteLlmModel || "qwen3";
348
+ this.remoteLlmReasoningEffort = normalizeRemoteLlmReasoningEffort(config.remoteLlmReasoningEffort);
349
+ this.remoteLlmNoThink = config.remoteLlmNoThink ?? true;
309
350
  this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
310
351
  this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
311
352
  }
@@ -921,15 +962,19 @@ export class LlamaCpp implements LLM {
921
962
  // Re-check: concurrent call may have set cooldown while we were awaited
922
963
  if (this.isRemoteLlmDown()) return null;
923
964
  try {
924
- const resp = await fetch(`${this.remoteLlmUrl}/v1/chat/completions`, {
965
+ const body: Record<string, unknown> = {
966
+ model: this.remoteLlmModel,
967
+ messages: [{ role: "user", content: this.remoteLlmNoThink ? `${prompt} /no_think` : prompt }],
968
+ max_tokens: maxTokens,
969
+ temperature,
970
+ };
971
+ if (this.remoteLlmReasoningEffort) {
972
+ body.reasoning_effort = this.remoteLlmReasoningEffort;
973
+ }
974
+ const resp = await fetch(buildRemoteChatCompletionsUrl(this.remoteLlmUrl!), {
925
975
  method: "POST",
926
976
  headers: { "Content-Type": "application/json" },
927
- body: JSON.stringify({
928
- model: "qwen3",
929
- messages: [{ role: "user", content: `${prompt} /no_think` }],
930
- max_tokens: maxTokens,
931
- temperature,
932
- }),
977
+ body: JSON.stringify(body),
933
978
  signal,
934
979
  });
935
980
 
@@ -1254,6 +1299,13 @@ export function getDefaultLlamaCpp(): LlamaCpp {
1254
1299
  remoteEmbedApiKey: embedApiKey,
1255
1300
  remoteEmbedModel: process.env.CLAWMEM_EMBED_MODEL || undefined,
1256
1301
  remoteLlmUrl: process.env.CLAWMEM_LLM_URL || undefined,
1302
+ remoteLlmModel: process.env.CLAWMEM_LLM_MODEL?.trim() || undefined,
1303
+ remoteLlmReasoningEffort: process.env.CLAWMEM_LLM_REASONING_EFFORT || undefined,
1304
+ remoteLlmNoThink: (() => {
1305
+ const raw = (process.env.CLAWMEM_LLM_NO_THINK || "").trim().toLowerCase();
1306
+ if (!raw) return undefined;
1307
+ return !["0", "false", "no", "off"].includes(raw);
1308
+ })(),
1257
1309
  });
1258
1310
  }
1259
1311
  return defaultLlamaCpp;
@@ -1276,4 +1328,3 @@ export async function disposeDefaultLlamaCpp(): Promise<void> {
1276
1328
  defaultLlamaCpp = null;
1277
1329
  }
1278
1330
  }
1279
-
@@ -37,8 +37,8 @@
37
37
  * 4. REST API service (`clawmem serve`) lifecycle — unchanged.
38
38
  *
39
39
  * §14.3 critical correctness contract: `agent_end` is fire-and-forget at
40
- * `attempt.ts:2198-2224`. Precompact-extract MUST run inside
41
- * `handleBeforePromptBuild` (which IS awaited at `attempt.ts:1642`), gated
40
+ * `attempt.ts:2470-2496`. Precompact-extract MUST run inside
41
+ * `handleBeforePromptBuild` (which IS awaited at `attempt.ts:1873`), gated
42
42
  * by the proximity heuristic in `compaction-threshold.ts`. See `engine.ts`
43
43
  * top-of-file comment for the full rationale.
44
44
  */
@@ -107,6 +107,13 @@ const clawmemPlugin = {
107
107
  env: {
108
108
  ...(pluginCfg.gpuEmbed ? { CLAWMEM_EMBED_URL: pluginCfg.gpuEmbed as string } : {}),
109
109
  ...(pluginCfg.gpuLlm ? { CLAWMEM_LLM_URL: pluginCfg.gpuLlm as string } : {}),
110
+ ...(pluginCfg.gpuLlmModel ? { CLAWMEM_LLM_MODEL: pluginCfg.gpuLlmModel as string } : {}),
111
+ ...(pluginCfg.gpuLlmReasoningEffort
112
+ ? { CLAWMEM_LLM_REASONING_EFFORT: pluginCfg.gpuLlmReasoningEffort as string }
113
+ : {}),
114
+ ...(pluginCfg.gpuLlmNoThink !== undefined
115
+ ? { CLAWMEM_LLM_NO_THINK: String(pluginCfg.gpuLlmNoThink) }
116
+ : {}),
110
117
  ...(pluginCfg.gpuRerank ? { CLAWMEM_RERANK_URL: pluginCfg.gpuRerank as string } : {}),
111
118
  CLAWMEM_PROFILE: profile,
112
119
  },
@@ -154,7 +161,7 @@ const clawmemPlugin = {
154
161
  // ----- Plugin Hook: before_prompt_build (AWAITED — load-bearing path) -----
155
162
  // Both context-surfacing retrieval injection and pre-emptive precompact
156
163
  // extraction live here. handleBeforePromptBuild is async and the OpenClaw
157
- // attempt path awaits the result at attempt.ts:1642 before building the
164
+ // attempt path awaits the result at attempt.ts:1873 before building the
158
165
  // effective prompt. precompact-extract therefore runs strictly before
159
166
  // the LLM call that could trigger compaction on this turn.
160
167
  api.on(
@@ -168,7 +175,7 @@ const clawmemPlugin = {
168
175
  // ----- Plugin Hook: agent_end (FIRE-AND-FORGET in core) -----
169
176
  // Decision-extractor, handoff-generator, and feedback-loop run here.
170
177
  // These writes are eventually-consistent (saveMemory dedupes), so the
171
- // fire-and-forget context at attempt.ts:2198-2224 is acceptable.
178
+ // fire-and-forget context at attempt.ts:2470-2496 is acceptable.
172
179
  // precompact-extract is intentionally NOT in this handler — it lives
173
180
  // in handleBeforePromptBuild for correctness reasons.
174
181
  api.on("agent_end", async (event: AgentEndEvent, ctx: AgentEndContext) => {
@@ -41,6 +41,23 @@
41
41
  "help": "URL for ClawMem LLM (query expansion, extraction)",
42
42
  "advanced": true
43
43
  },
44
+ "gpuLlmModel": {
45
+ "label": "LLM Model",
46
+ "placeholder": "qwen3",
47
+ "help": "Model name sent to the configured LLM endpoint",
48
+ "advanced": true
49
+ },
50
+ "gpuLlmReasoningEffort": {
51
+ "label": "Reasoning Effort",
52
+ "placeholder": "(unset)",
53
+ "help": "Optional top-level reasoning_effort for Chat Completions endpoints that support it. Unset omits the field.",
54
+ "advanced": true
55
+ },
56
+ "gpuLlmNoThink": {
57
+ "label": "Append /no_think",
58
+ "help": "Append /no_think to remote LLM prompts (default: true). Disable for standard OpenAI models.",
59
+ "advanced": true
60
+ },
44
61
  "gpuRerank": {
45
62
  "label": "Reranker Endpoint",
46
63
  "placeholder": "http://localhost:8090",
@@ -78,6 +95,16 @@
78
95
  "gpuLlm": {
79
96
  "type": "string"
80
97
  },
98
+ "gpuLlmModel": {
99
+ "type": "string"
100
+ },
101
+ "gpuLlmReasoningEffort": {
102
+ "type": "string",
103
+ "enum": ["none", "minimal", "low", "medium", "high", "xhigh"]
104
+ },
105
+ "gpuLlmNoThink": {
106
+ "type": "boolean"
107
+ },
81
108
  "gpuRerank": {
82
109
  "type": "string"
83
110
  }
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clawmem-openclaw-plugin",
3
- "version": "0.10.1",
3
+ "version": "0.10.2",
4
4
  "description": "OpenClaw plugin adapter for ClawMem — on-device hybrid memory layer",
5
5
  "type": "module",
6
6
  "openclaw": {