clawmem 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -0
- package/package.json +1 -1
- package/src/clawmem.ts +1 -0
- package/src/hermes/__init__.py +30 -1
- package/src/llm.ts +59 -8
- package/src/openclaw/index.ts +11 -4
- package/src/openclaw/openclaw.plugin.json +27 -0
- package/src/openclaw/package.json +1 -1
package/README.md
CHANGED
@@ -332,6 +332,7 @@ If your GPU lives on a separate machine, point the env vars at it:
 ```bash
 export CLAWMEM_EMBED_URL=http://gpu-host:8088
 export CLAWMEM_LLM_URL=http://gpu-host:8089
+export CLAWMEM_LLM_MODEL=qwen3
 export CLAWMEM_RERANK_URL=http://gpu-host:8090
 ```

@@ -944,6 +945,9 @@ Notes referenced by the agent during a session get boosted (`access_count++`). U
 | `CLAWMEM_EMBED_TPM_LIMIT` | `100000` | Tokens-per-minute limit for cloud embedding pacing. Match to your provider tier. |
 | `CLAWMEM_EMBED_DIMENSIONS` | (none) | Output dimensions for OpenAI `text-embedding-3-*` Matryoshka models (e.g. `512`, `1024`). |
 | `CLAWMEM_LLM_URL` | `http://localhost:8089` | LLM server URL for intent/query/A-MEM. Without it, falls to `node-llama-cpp` (if allowed). |
+| `CLAWMEM_LLM_MODEL` | `qwen3` | Model name sent to the configured LLM endpoint. Override this for OpenAI-compatible proxies such as `gpt-5.4-mini`. |
+| `CLAWMEM_LLM_REASONING_EFFORT` | (none) | Optional top-level `reasoning_effort` field for Chat Completions endpoints that support it (for example OpenAI reasoning models). Leave unset for llama-server/vLLM unless your serving stack explicitly accepts that field. |
+| `CLAWMEM_LLM_NO_THINK` | `true` | Append `/no_think` to remote LLM prompts. Set to `false` for standard OpenAI models and other endpoints that reject or treat the Qwen-style suffix as literal prompt text. |
 | `CLAWMEM_RERANK_URL` | `http://localhost:8090` | Reranker server URL. Without it, falls to `node-llama-cpp` (if allowed). |
 | `CLAWMEM_NO_LOCAL_MODELS` | `false` | Block `node-llama-cpp` from auto-downloading GGUF models. Set `true` for remote-only setups where you want fail-fast on unreachable endpoints. |
 | `CLAWMEM_MERGE_SCORE_NORMAL` | `0.93` | **v0.7.1.** Phase 2 consolidation merge-safety threshold when candidate and existing anchors align. Merges above this normalized 3-gram cosine score are allowed. |
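For orientation, a minimal sketch of the chat-completions payload these variables end up shaping (it mirrors the `package/src/llm.ts` change later in this diff; the `buildChatBody` helper, the env values, and the token/temperature numbers are illustrative assumptions, not part of the package API):

```ts
// Illustrative only: mirrors the request body built in package/src/llm.ts below.
// Assumed example settings: CLAWMEM_LLM_MODEL=gpt-5.4-mini,
// CLAWMEM_LLM_REASONING_EFFORT=low, CLAWMEM_LLM_NO_THINK=false.
type RemoteLlmSettings = {
  model: string;             // CLAWMEM_LLM_MODEL (package default: "qwen3")
  reasoningEffort?: string;  // CLAWMEM_LLM_REASONING_EFFORT (field omitted when unset)
  noThink: boolean;          // CLAWMEM_LLM_NO_THINK (package default: true)
};

function buildChatBody(prompt: string, s: RemoteLlmSettings, maxTokens: number, temperature: number) {
  const body: Record<string, unknown> = {
    model: s.model,
    // /no_think is appended only when noThink is enabled (Qwen-style endpoints).
    messages: [{ role: "user", content: s.noThink ? `${prompt} /no_think` : prompt }],
    max_tokens: maxTokens,
    temperature,
  };
  if (s.reasoningEffort) body.reasoning_effort = s.reasoningEffort;
  return body;
}

buildChatBody("Summarize the session", { model: "gpt-5.4-mini", reasoningEffort: "low", noThink: false }, 256, 0);
// -> { model: "gpt-5.4-mini",
//      messages: [{ role: "user", content: "Summarize the session" }],
//      max_tokens: 256, temperature: 0, reasoning_effort: "low" }
```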
package/package.json
CHANGED
package/src/clawmem.ts
CHANGED
@@ -1491,6 +1491,7 @@ async function cmdSetupOpenClaw(args: string[]) {
   console.log(` 3. Configure GPU endpoints (if not using defaults):`);
   console.log(` ${c.cyan}openclaw config set plugins.entries.clawmem.config.gpuEmbed http://YOUR_GPU:8088${c.reset}`);
   console.log(` ${c.cyan}openclaw config set plugins.entries.clawmem.config.gpuLlm http://YOUR_GPU:8089${c.reset}`);
+  console.log(` ${c.cyan}openclaw config set plugins.entries.clawmem.config.gpuLlmModel qwen3${c.reset}`);
   console.log(` ${c.cyan}openclaw config set plugins.entries.clawmem.config.gpuRerank http://YOUR_GPU:8090${c.reset}`);
   console.log();
   console.log(` 4. Start the REST API (for agent tools):`);
package/src/hermes/__init__.py
CHANGED
@@ -15,6 +15,9 @@ Config via environment variables:
     CLAWMEM_PROFILE — Retrieval profile: speed, balanced, deep (default: balanced)
     CLAWMEM_EMBED_URL — GPU embedding server URL (optional)
     CLAWMEM_LLM_URL — GPU LLM server URL (optional)
+    CLAWMEM_LLM_MODEL — Model name sent to the GPU/cloud LLM endpoint (optional)
+    CLAWMEM_LLM_REASONING_EFFORT — Top-level reasoning_effort for supporting Chat Completions endpoints (optional)
+    CLAWMEM_LLM_NO_THINK — Append /no_think to remote prompts; false disables it for standard OpenAI models (optional)
     CLAWMEM_RERANK_URL — GPU reranker server URL (optional)
 
 Agent-context isolation:

@@ -295,6 +298,24 @@ class ClawMemProvider(MemoryProvider):
             "secret": False,
             "env_var": "CLAWMEM_LLM_URL",
         },
+        {
+            "key": "llm_model",
+            "description": "Model name sent to the GPU LLM server (e.g., qwen3, gpt-5.4-mini)",
+            "secret": False,
+            "env_var": "CLAWMEM_LLM_MODEL",
+        },
+        {
+            "key": "llm_reasoning_effort",
+            "description": "Optional top-level reasoning_effort for Chat Completions endpoints that support it",
+            "secret": False,
+            "env_var": "CLAWMEM_LLM_REASONING_EFFORT",
+        },
+        {
+            "key": "llm_no_think",
+            "description": "Append /no_think to remote LLM prompts; disable for standard OpenAI models",
+            "secret": False,
+            "env_var": "CLAWMEM_LLM_NO_THINK",
+        },
     ]
 
     # -- Core lifecycle --------------------------------------------------------

@@ -324,7 +345,15 @@ class ClawMemProvider(MemoryProvider):
         )
 
         # Build env for hook shell-outs (GPU endpoints, profile)
-        for var in (
+        for var in (
+            "CLAWMEM_EMBED_URL",
+            "CLAWMEM_LLM_URL",
+            "CLAWMEM_LLM_MODEL",
+            "CLAWMEM_LLM_REASONING_EFFORT",
+            "CLAWMEM_LLM_NO_THINK",
+            "CLAWMEM_RERANK_URL",
+            "CLAWMEM_PROFILE",
+        ):
             val = os.environ.get(var)
             if val:
                 self._env_extra[var] = val
package/src/llm.ts
CHANGED
@@ -237,6 +237,23 @@ export type LlamaCppConfig = {
    * When set, generate() calls /v1/chat/completions instead of local node-llama-cpp.
    */
   remoteLlmUrl?: string;
+  /**
+   * Remote LLM model name to send with chat completion requests.
+   * Env: CLAWMEM_LLM_MODEL
+   */
+  remoteLlmModel?: string;
+  /**
+   * Optional top-level reasoning_effort field for Chat Completions endpoints that support it.
+   * Example values: none, minimal, low, medium, high, xhigh.
+   * Env: CLAWMEM_LLM_REASONING_EFFORT
+   */
+  remoteLlmReasoningEffort?: string;
+  /**
+   * Whether to append /no_think to remote LLM prompts.
+   * Defaults to true to preserve current behavior with Qwen3-compatible endpoints.
+   * Env: CLAWMEM_LLM_NO_THINK
+   */
+  remoteLlmNoThink?: boolean;
   /**
    * Inactivity timeout in ms before unloading contexts (default: 2 minutes, 0 to disable).
    *
@@ -259,6 +276,23 @@ export type LlamaCppConfig = {
  */
 // Default inactivity timeout: 2 minutes
 const DEFAULT_INACTIVITY_TIMEOUT_MS = 2 * 60 * 1000;
+const ALLOWED_REMOTE_LLM_REASONING_EFFORTS = new Set(["none", "minimal", "low", "medium", "high", "xhigh"]);
+
+function normalizeRemoteLlmReasoningEffort(value?: string): string | null {
+  const raw = (value || "").trim().toLowerCase();
+  if (!raw) return null;
+  if (!ALLOWED_REMOTE_LLM_REASONING_EFFORTS.has(raw)) {
+    console.warn(`[clawmem] Ignoring unsupported remoteLlmReasoningEffort=${raw}`);
+    return null;
+  }
+  return raw;
+}
+
+function buildRemoteChatCompletionsUrl(remoteLlmUrl: string): string {
+  const baseUrl = remoteLlmUrl.replace(/\/+$/, "");
+  const endpoint = baseUrl.endsWith("/v1") ? "/chat/completions" : "/v1/chat/completions";
+  return `${baseUrl}${endpoint}`;
+}
 
 export class LlamaCpp implements LLM {
   private llama: Llama | null = null;
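A few illustrative calls to the two helpers added above, with the results implied by this hunk (inputs are assumed examples, shown only to make the URL normalization and validation behavior concrete):

```ts
// Endpoint normalization: trailing slashes are stripped, then /v1/chat/completions is
// appended unless the configured base URL already ends in /v1 (OpenAI-style bases).
buildRemoteChatCompletionsUrl("http://gpu-host:8089/");
// -> "http://gpu-host:8089/v1/chat/completions"
buildRemoteChatCompletionsUrl("https://api.openai.com/v1");
// -> "https://api.openai.com/v1/chat/completions"

// reasoning_effort values are trimmed, lower-cased, and checked against the allow-list;
// anything else logs a warning and is dropped, so the field is simply omitted from requests.
normalizeRemoteLlmReasoningEffort(" High ");  // -> "high"
normalizeRemoteLlmReasoningEffort("extreme"); // -> null (warns, field omitted)
normalizeRemoteLlmReasoningEffort(undefined); // -> null
```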
@@ -276,6 +310,9 @@ export class LlamaCpp implements LLM {
   private remoteEmbedApiKey: string | null;
   private remoteEmbedModel: string;
   private remoteLlmUrl: string | null;
+  private remoteLlmModel: string;
+  private remoteLlmReasoningEffort: string | null;
+  private remoteLlmNoThink: boolean;
 
   // Ensure we don't load the same model concurrently (which can allocate duplicate VRAM).
   private embedModelLoadPromise: Promise<LlamaModel> | null = null;
@@ -306,6 +343,10 @@ export class LlamaCpp implements LLM {
     this.remoteEmbedApiKey = config.remoteEmbedApiKey || null;
     this.remoteEmbedModel = config.remoteEmbedModel || "embedding";
     this.remoteLlmUrl = config.remoteLlmUrl || null;
+    const normalizedRemoteLlmModel = config.remoteLlmModel?.trim();
+    this.remoteLlmModel = normalizedRemoteLlmModel || "qwen3";
+    this.remoteLlmReasoningEffort = normalizeRemoteLlmReasoningEffort(config.remoteLlmReasoningEffort);
+    this.remoteLlmNoThink = config.remoteLlmNoThink ?? true;
     this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
     this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
   }
@@ -921,15 +962,19 @@ export class LlamaCpp implements LLM {
     // Re-check: concurrent call may have set cooldown while we were awaited
     if (this.isRemoteLlmDown()) return null;
     try {
-      const
+      const body: Record<string, unknown> = {
+        model: this.remoteLlmModel,
+        messages: [{ role: "user", content: this.remoteLlmNoThink ? `${prompt} /no_think` : prompt }],
+        max_tokens: maxTokens,
+        temperature,
+      };
+      if (this.remoteLlmReasoningEffort) {
+        body.reasoning_effort = this.remoteLlmReasoningEffort;
+      }
+      const resp = await fetch(buildRemoteChatCompletionsUrl(this.remoteLlmUrl!), {
         method: "POST",
         headers: { "Content-Type": "application/json" },
-        body: JSON.stringify(
-          model: "qwen3",
-          messages: [{ role: "user", content: `${prompt} /no_think` }],
-          max_tokens: maxTokens,
-          temperature,
-        }),
+        body: JSON.stringify(body),
         signal,
       });
 
@@ -1254,6 +1299,13 @@ export function getDefaultLlamaCpp(): LlamaCpp {
       remoteEmbedApiKey: embedApiKey,
       remoteEmbedModel: process.env.CLAWMEM_EMBED_MODEL || undefined,
       remoteLlmUrl: process.env.CLAWMEM_LLM_URL || undefined,
+      remoteLlmModel: process.env.CLAWMEM_LLM_MODEL?.trim() || undefined,
+      remoteLlmReasoningEffort: process.env.CLAWMEM_LLM_REASONING_EFFORT || undefined,
+      remoteLlmNoThink: (() => {
+        const raw = (process.env.CLAWMEM_LLM_NO_THINK || "").trim().toLowerCase();
+        if (!raw) return undefined;
+        return !["0", "false", "no", "off"].includes(raw);
+      })(),
     });
   }
   return defaultLlamaCpp;
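For reference, the same boolean parsing as the inline IIFE above, pulled out as a standalone sketch (`parseNoThink` is a hypothetical name, not a function exported by the package):

```ts
// Unset or empty -> undefined, so the LlamaCpp constructor default (true) still applies.
// "0", "false", "no", "off" (any casing, whitespace ignored) -> false; anything else -> true.
function parseNoThink(raw: string | undefined): boolean | undefined {
  const v = (raw || "").trim().toLowerCase();
  if (!v) return undefined;
  return !["0", "false", "no", "off"].includes(v);
}

parseNoThink(undefined); // undefined -> remoteLlmNoThink stays true
parseNoThink("FALSE");   // false     -> /no_think suffix suppressed
parseNoThink("off");     // false
parseNoThink("1");       // true
```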
@@ -1276,4 +1328,3 @@ export async function disposeDefaultLlamaCpp(): Promise<void> {
     defaultLlamaCpp = null;
   }
 }
-
package/src/openclaw/index.ts
CHANGED
@@ -37,8 +37,8 @@
  * 4. REST API service (`clawmem serve`) lifecycle — unchanged.
  *
  * §14.3 critical correctness contract: `agent_end` is fire-and-forget at
- * `attempt.ts:
- * `handleBeforePromptBuild` (which IS awaited at `attempt.ts:
+ * `attempt.ts:2470-2496`. Precompact-extract MUST run inside
+ * `handleBeforePromptBuild` (which IS awaited at `attempt.ts:1873`), gated
  * by the proximity heuristic in `compaction-threshold.ts`. See `engine.ts`
  * top-of-file comment for the full rationale.
  */
@@ -107,6 +107,13 @@ const clawmemPlugin = {
       env: {
         ...(pluginCfg.gpuEmbed ? { CLAWMEM_EMBED_URL: pluginCfg.gpuEmbed as string } : {}),
         ...(pluginCfg.gpuLlm ? { CLAWMEM_LLM_URL: pluginCfg.gpuLlm as string } : {}),
+        ...(pluginCfg.gpuLlmModel ? { CLAWMEM_LLM_MODEL: pluginCfg.gpuLlmModel as string } : {}),
+        ...(pluginCfg.gpuLlmReasoningEffort
+          ? { CLAWMEM_LLM_REASONING_EFFORT: pluginCfg.gpuLlmReasoningEffort as string }
+          : {}),
+        ...(pluginCfg.gpuLlmNoThink !== undefined
+          ? { CLAWMEM_LLM_NO_THINK: String(pluginCfg.gpuLlmNoThink) }
+          : {}),
         ...(pluginCfg.gpuRerank ? { CLAWMEM_RERANK_URL: pluginCfg.gpuRerank as string } : {}),
         CLAWMEM_PROFILE: profile,
       },
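As a sketch of what this conditional spread produces, assuming a hypothetical plugin config pointed at an OpenAI-compatible endpoint (values are examples, not defaults):

```ts
// Example OpenClaw plugin config (illustrative values only).
const pluginCfg = {
  gpuLlm: "http://gpu-host:8089",
  gpuLlmModel: "gpt-5.4-mini",
  gpuLlmNoThink: false,
  // gpuLlmReasoningEffort left unset, so CLAWMEM_LLM_REASONING_EFFORT is omitted entirely.
};

// Same pattern as the hunk above: only configured options become env vars for the hook shell-outs.
const env = {
  ...(pluginCfg.gpuLlm ? { CLAWMEM_LLM_URL: pluginCfg.gpuLlm } : {}),
  ...(pluginCfg.gpuLlmModel ? { CLAWMEM_LLM_MODEL: pluginCfg.gpuLlmModel } : {}),
  ...(pluginCfg.gpuLlmNoThink !== undefined
    ? { CLAWMEM_LLM_NO_THINK: String(pluginCfg.gpuLlmNoThink) }
    : {}),
};
// env -> { CLAWMEM_LLM_URL: "http://gpu-host:8089",
//          CLAWMEM_LLM_MODEL: "gpt-5.4-mini",
//          CLAWMEM_LLM_NO_THINK: "false" }
```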
@@ -154,7 +161,7 @@ const clawmemPlugin = {
   // ----- Plugin Hook: before_prompt_build (AWAITED — load-bearing path) -----
   // Both context-surfacing retrieval injection and pre-emptive precompact
   // extraction live here. handleBeforePromptBuild is async and the OpenClaw
-  // attempt path awaits the result at attempt.ts:
+  // attempt path awaits the result at attempt.ts:1873 before building the
   // effective prompt. precompact-extract therefore runs strictly before
   // the LLM call that could trigger compaction on this turn.
   api.on(
@@ -168,7 +175,7 @@ const clawmemPlugin = {
   // ----- Plugin Hook: agent_end (FIRE-AND-FORGET in core) -----
   // Decision-extractor, handoff-generator, and feedback-loop run here.
   // These writes are eventually-consistent (saveMemory dedupes), so the
-  // fire-and-forget context at attempt.ts:
+  // fire-and-forget context at attempt.ts:2470-2496 is acceptable.
   // precompact-extract is intentionally NOT in this handler — it lives
   // in handleBeforePromptBuild for correctness reasons.
   api.on("agent_end", async (event: AgentEndEvent, ctx: AgentEndContext) => {
package/src/openclaw/openclaw.plugin.json
CHANGED
@@ -41,6 +41,23 @@
       "help": "URL for ClawMem LLM (query expansion, extraction)",
       "advanced": true
     },
+    "gpuLlmModel": {
+      "label": "LLM Model",
+      "placeholder": "qwen3",
+      "help": "Model name sent to the configured LLM endpoint",
+      "advanced": true
+    },
+    "gpuLlmReasoningEffort": {
+      "label": "Reasoning Effort",
+      "placeholder": "(unset)",
+      "help": "Optional top-level reasoning_effort for Chat Completions endpoints that support it. Unset omits the field.",
+      "advanced": true
+    },
+    "gpuLlmNoThink": {
+      "label": "Append /no_think",
+      "help": "Append /no_think to remote LLM prompts (default: true). Disable for standard OpenAI models.",
+      "advanced": true
+    },
     "gpuRerank": {
       "label": "Reranker Endpoint",
       "placeholder": "http://localhost:8090",

@@ -78,6 +95,16 @@
     "gpuLlm": {
       "type": "string"
     },
+    "gpuLlmModel": {
+      "type": "string"
+    },
+    "gpuLlmReasoningEffort": {
+      "type": "string",
+      "enum": ["none", "minimal", "low", "medium", "high", "xhigh"]
+    },
+    "gpuLlmNoThink": {
+      "type": "boolean"
+    },
     "gpuRerank": {
       "type": "string"
     }