@simbimbo/memory-ocmemog 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,11 +1,33 @@
  # Changelog
 
+ ## 0.1.7 — 2026-03-19
+
+ llama.cpp-first cleanup after the 0.1.6 runtime cutover.
+
+ ### Highlights
+ - made llama.cpp / local OpenAI-compatible endpoints the primary documented and scripted local runtime path
+ - reduced misleading Ollama-first defaults in installers, sidecar scripts, docs, and helper tooling
+ - aligned context/distill/runtime helpers with the fixed local model architecture (`17890` gateway, `17891` sidecar, `18080` text, `18081` embeddings)
+ - kept compatibility hooks only where still useful for rollback or mixed environments
+
+ ## 0.1.6 — 2026-03-19
+
+ Port-separation and publish-solid follow-up.
+
+ ### Highlights
+ - Split ocmemog sidecar onto dedicated loopback port `17891` to avoid collision with the OpenClaw gateway/dashboard on `17890`
+ - Restored the plain realtime dashboard on `/dashboard` and fixed the `local_html` template crash
+ - Updated plugin/runtime defaults, scripts, and documentation to use the dedicated sidecar endpoint on `17891`
+ - Switched repo-facing local-runtime defaults to llama.cpp-first endpoints on `18080`/`18081` with Qwen2.5 text and `nomic-embed-text-v1.5` embeddings, while keeping Ollama as explicit legacy fallback only
+ - Added governance retrieval/governance-policy hardening plus expanded regression coverage for duplicate, contradiction, supersession, queue, audit, rollback, and auto-resolve flows
+ - Aligned package/version metadata across npm, Python, and FastAPI surfaces
+
  ## 0.1.5 — 2026-03-18
 
  Repair and hardening follow-up after the 0.1.4 publish.
 
  ### Highlights
- - Fixed vector reindex defaults so repair scripts use provider-backed Ollama embeddings instead of silently rebuilding weak local/hash vectors
+ - Fixed vector reindex defaults so repair scripts use provider-backed local embeddings instead of silently rebuilding weak local/hash vectors
  - Added battery-aware sidecar defaults for macOS laptops (`OCMEMOG_LAPTOP_MODE=auto|ac|battery`)
  - Fixed `record_reinforcement()` so new experiences preserve `memory_reference`, and added integrity repair to backfill legacy missing references
  - Added incremental vector backfill tooling (`scripts/ocmemog-backfill-vectors.py`) for non-destructive backlog repair
package/README.md CHANGED
@@ -32,7 +32,7 @@ pip install -r requirements.txt
  ./scripts/ocmemog-sidecar.sh
 
  # then open
- # http://127.0.0.1:17890/dashboard
+ # http://127.0.0.1:17891/dashboard
  ```
 
  ## Optional: transcript watcher (auto-ingest)
@@ -45,8 +45,8 @@ export OCMEMOG_TRANSCRIPT_DIR="$HOME/.openclaw/workspace/memory/transcripts"
 
  Default bind:
 
- - endpoint: `http://127.0.0.1:17890`
- - health: `http://127.0.0.1:17890/healthz`
+ - endpoint: `http://127.0.0.1:17891`
+ - health: `http://127.0.0.1:17891/healthz`
 
  ## Continuity proof / benchmark harness
 
@@ -78,20 +78,24 @@ Optional environment variables:
  - `OCMEMOG_OPENAI_API_BASE` (default: `https://api.openai.com/v1`)
  - `OCMEMOG_OPENAI_EMBED_MODEL` (default: `text-embedding-3-small`)
  - `BRAIN_EMBED_MODEL_LOCAL` (`simple` by default)
- - `BRAIN_EMBED_MODEL_PROVIDER` (`openai` to enable provider embeddings)
+ - `BRAIN_EMBED_MODEL_PROVIDER` (`local-openai` to use the local llama.cpp embedding endpoint; `openai` remains available for hosted embeddings)
  - `OCMEMOG_TRANSCRIPT_WATCHER` (`true` to auto-start transcript watcher inside the sidecar)
  - `OCMEMOG_TRANSCRIPT_ROOTS` (comma-separated allowed roots for transcript context retrieval; default: `~/.openclaw/workspace/memory`)
  - `OCMEMOG_API_TOKEN` (optional; if set, requests must include `x-ocmemog-token` or `Authorization: Bearer ...`)
  - `OCMEMOG_AUTO_HYDRATION` (`true` to re-enable prompt-time continuity prepending; defaults to `false` as a safety guard until the host runtime is verified not to persist prepended context into session history)
  - `OCMEMOG_LAPTOP_MODE` (`auto` by default; on macOS battery power this slows watcher polling, reduces ingest batch size, and disables sentiment reinforcement unless explicitly overridden)
- - `OCMEMOG_USE_OLLAMA` (`true` to use Ollama for distill/inference)
- - `OCMEMOG_OLLAMA_HOST` (default: `http://127.0.0.1:11434`)
- - `OCMEMOG_OLLAMA_MODEL` (default: `phi3:latest`; lightweight local fallback / cheap cognition)
- - `OCMEMOG_OLLAMA_EMBED_MODEL` (default: `nomic-embed-text:latest`)
+ - `OCMEMOG_LOCAL_LLM_BASE_URL` (default: `http://127.0.0.1:18080/v1`; local OpenAI-compatible text endpoint, e.g. llama.cpp)
+ - `OCMEMOG_LOCAL_LLM_MODEL` (default: `qwen2.5-7b-instruct`; matches the active Qwen2.5-7B-Instruct GGUF runtime)
+ - `OCMEMOG_LOCAL_EMBED_BASE_URL` (default: `http://127.0.0.1:18081/v1`; local OpenAI-compatible embedding endpoint)
+ - `OCMEMOG_LOCAL_EMBED_MODEL` (default: `nomic-embed-text-v1.5`)
+ - `OCMEMOG_USE_OLLAMA` (`true` to force legacy Ollama local inference path)
+ - `OCMEMOG_OLLAMA_HOST` (default: `http://127.0.0.1:11434`; legacy fallback)
+ - `OCMEMOG_OLLAMA_MODEL` (default: `qwen2.5:7b`; legacy fallback for machines that still use Ollama)
+ - `OCMEMOG_OLLAMA_EMBED_MODEL` (default: `nomic-embed-text:latest`; legacy embedding fallback)
  - `OCMEMOG_PROMOTION_THRESHOLD` (default: `0.5`)
  - `OCMEMOG_DEMOTION_THRESHOLD` (default: `0.2`)
  - `OCMEMOG_PONDER_ENABLED` (default: `true`)
- - `OCMEMOG_PONDER_MODEL` (default via launcher: `qwen2.5:7b`; recommended for structured local memory refinement)
+ - `OCMEMOG_PONDER_MODEL` (default via launcher: `local-openai:qwen2.5-7b-instruct`; recommended for structured local memory refinement)
  - `OCMEMOG_LESSON_MINING_ENABLED` (default: `true`)
 
  ## Security
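The `OCMEMOG_LOCAL_LLM_BASE_URL` / `OCMEMOG_LOCAL_LLM_MODEL` variables introduced in the hunk above point at an OpenAI-compatible chat endpoint. A minimal sketch of the request shape such an endpoint expects, assuming a llama.cpp server is already listening on the default `18080` port (`build_chat_request` is an illustrative helper, not part of the package):

```python
import json
import urllib.request


def build_chat_request(base_url: str, model: str, prompt: str) -> urllib.request.Request:
    # Mirrors the payload shape used by the sidecar's inference path:
    # POST {base_url}/chat/completions with an OpenAI-style messages array.
    url = f"{base_url.rstrip('/')}/chat/completions"
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
    }
    req = urllib.request.Request(url, data=json.dumps(payload).encode("utf-8"), method="POST")
    req.add_header("Content-Type", "application/json")
    return req


req = build_chat_request("http://127.0.0.1:18080/v1", "qwen2.5-7b-instruct", "Say hi")
# To actually send it (requires the llama.cpp server to be running):
# with urllib.request.urlopen(req, timeout=30) as resp:
#     print(json.loads(resp.read())["choices"][0]["message"]["content"])
```

Because the endpoint is OpenAI-compatible, the same request works against the `18081` embedding server by swapping `/chat/completions` for `/embeddings` and the payload accordingly.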
@@ -129,12 +133,13 @@ This installer will try to:
  - install Python requirements
  - install/enable the OpenClaw plugin when the `openclaw` CLI is available
  - install/load LaunchAgents via `scripts/ocmemog-install.sh`
- - pull required local Ollama models when Ollama is already installed
+ - verify the local llama.cpp runtime and expected text/embed endpoints
  - validate `/healthz`
 
  Notes:
- - If `OCMEMOG_INSTALL_PREREQS=true` and Homebrew is present, the installer will try to install missing `ollama` and `ffmpeg` automatically.
- - If Ollama is not installed and prereq auto-install is off or unavailable, the installer warns and continues; local model support will remain unavailable until Ollama is installed.
+ - If `OCMEMOG_INSTALL_PREREQS=true` and Homebrew is present, the installer will try to install missing `llama.cpp` and `ffmpeg` automatically.
+ - The installer no longer pulls local models. It assumes your llama.cpp text endpoint is on `127.0.0.1:18080` and your embedding endpoint is on `127.0.0.1:18081`.
+ - Legacy Ollama compatibility remains available only when you explicitly opt into it with `OCMEMOG_USE_OLLAMA=true`.
  - If package install is unavailable in the local OpenClaw build, the installer falls back to local-path plugin install.
  - Advanced flags are available for local debugging/CI (`--skip-plugin-install`, `--skip-launchagents`, `--skip-model-pulls`, `--endpoint`, `--repo-url`).
 
@@ -154,7 +159,7 @@ launchctl bootstrap gui/$UID scripts/launchagents/com.openclaw.ocmemog.guard.pli
 
  ## Recent changes
 
- ### 0.1.5 (current main)
+ ### 0.1.6 (current main)
 
  Package ownership + runtime safety release:
  - Publish package under `@simbimbo/memory-ocmemog` instead of the unauthorized `@openclaw` scope
@@ -193,7 +198,7 @@ plugins:
    memory-ocmemog:
      enabled: true
      config:
-       endpoint: http://127.0.0.1:17890
+       endpoint: http://127.0.0.1:17891
        timeoutMs: 30000
  ```
 
@@ -9,8 +9,13 @@ OCMEMOG_MEMORY_MODEL = os.environ.get("OCMEMOG_MEMORY_MODEL", "gpt-4o-mini")
  OCMEMOG_OPENAI_API_BASE = os.environ.get("OCMEMOG_OPENAI_API_BASE", "https://api.openai.com/v1")
  OCMEMOG_OPENAI_EMBED_MODEL = os.environ.get("OCMEMOG_OPENAI_EMBED_MODEL", "text-embedding-3-small")
 
+ OCMEMOG_LOCAL_LLM_BASE_URL = os.environ.get("OCMEMOG_LOCAL_LLM_BASE_URL", "http://127.0.0.1:18080/v1")
+ OCMEMOG_LOCAL_LLM_MODEL = os.environ.get("OCMEMOG_LOCAL_LLM_MODEL", "qwen2.5-7b-instruct")
+ OCMEMOG_LOCAL_EMBED_BASE_URL = os.environ.get("OCMEMOG_LOCAL_EMBED_BASE_URL", "http://127.0.0.1:18081/v1")
+ OCMEMOG_LOCAL_EMBED_MODEL = os.environ.get("OCMEMOG_LOCAL_EMBED_MODEL", "nomic-embed-text-v1.5")
+
  OCMEMOG_OLLAMA_HOST = os.environ.get("OCMEMOG_OLLAMA_HOST", "http://127.0.0.1:11434")
- OCMEMOG_OLLAMA_MODEL = os.environ.get("OCMEMOG_OLLAMA_MODEL", "phi3:latest")
+ OCMEMOG_OLLAMA_MODEL = os.environ.get("OCMEMOG_OLLAMA_MODEL", "qwen2.5:7b")
  OCMEMOG_OLLAMA_EMBED_MODEL = os.environ.get("OCMEMOG_OLLAMA_EMBED_MODEL", "nomic-embed-text:latest")
 
  OCMEMOG_PROMOTION_THRESHOLD = float(os.environ.get("OCMEMOG_PROMOTION_THRESHOLD", "0.5"))
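Every config default in the hunk above uses the same `os.environ.get(key, default)` pattern, so any setting can be overridden per process without code changes. A small sketch of that precedence:

```python
import os


def resolve_setting(key: str, default: str) -> str:
    # An exported environment variable wins; otherwise the baked-in default
    # applies — exactly the pattern the config module uses for each key.
    return os.environ.get(key, default)


# Default applies when the variable is unset.
os.environ.pop("OCMEMOG_LOCAL_LLM_MODEL", None)
default_model = resolve_setting("OCMEMOG_LOCAL_LLM_MODEL", "qwen2.5-7b-instruct")

# An exported value overrides the default ("my-custom-gguf" is illustrative).
os.environ["OCMEMOG_LOCAL_LLM_MODEL"] = "my-custom-gguf"
override_model = resolve_setting("OCMEMOG_LOCAL_LLM_MODEL", "qwen2.5-7b-instruct")
```

Note that the overrides are read once at module import, so they must be exported before the sidecar process starts.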
@@ -11,6 +11,35 @@ from brain.runtime.instrumentation import emit_event
  LOGFILE = state_store.reports_dir() / "brain_memory.log.jsonl"
 
 
+ def _infer_openai_compatible(prompt: str, *, base_url: str, model: str, api_key: str | None = None, provider_label: str = "openai-compatible") -> dict[str, str]:
+     url = f"{base_url.rstrip('/')}/chat/completions"
+     payload = {
+         "model": model,
+         "messages": [{"role": "user", "content": prompt}],
+         "temperature": 0.2,
+     }
+     data = json.dumps(payload).encode("utf-8")
+     req = urllib.request.Request(url, data=data, method="POST")
+     if api_key:
+         req.add_header("Authorization", f"Bearer {api_key}")
+     req.add_header("Content-Type", "application/json")
+
+     try:
+         with urllib.request.urlopen(req, timeout=30) as resp:
+             response = json.loads(resp.read().decode("utf-8"))
+     except Exception as exc:
+         emit_event(LOGFILE, "brain_infer_error", status="error", provider=provider_label, error=str(exc))
+         return {"status": "error", "error": f"request_failed:{exc}"}
+
+     try:
+         output = response["choices"][0]["message"]["content"]
+     except Exception as exc:
+         emit_event(LOGFILE, "brain_infer_error", status="error", provider=provider_label, error=str(exc))
+         return {"status": "error", "error": "invalid_response"}
+
+     return {"status": "ok", "output": str(output).strip()}
+
+
  def _infer_ollama(prompt: str, model: str | None = None) -> dict[str, str]:
      payload = {
          "model": model or config.OCMEMOG_OLLAMA_MODEL,
@@ -33,47 +62,88 @@ def _infer_ollama(prompt: str, model: str | None = None) -> dict[str, str]:
      return {"status": "ok", "output": str(output).strip()}
 
 
+ def _looks_like_local_openai_model(name: str) -> bool:
+     if not name:
+         return False
+     lowered = name.strip().lower()
+     return lowered.startswith("local-openai:") or lowered.startswith("local_openai:") or lowered.startswith("llamacpp:")
+
+
+ def _normalize_local_model_name(name: str) -> str:
+     lowered = (name or "").strip()
+     for prefix in ("local-openai:", "local_openai:", "llamacpp:"):
+         if lowered.lower().startswith(prefix):
+             return lowered[len(prefix):]
+     return lowered
+
+
+ def _looks_like_ollama_model(name: str) -> bool:
+     if not name:
+         return False
+     lowered = name.strip().lower()
+     if lowered.startswith("ollama:"):
+         return True
+     if "/" in lowered:
+         return False
+     return ":" in lowered
+
+
+ def stats() -> dict[str, object]:
+     materialized_local = int(_LOCAL_INFER_STATS.get("local_success", 0)) + int(_LOCAL_INFER_STATS.get("cache_hits", 0))
+     est_prompt_tokens_saved = materialized_local * _AVG_PROMPT_TOKENS_SAVED
+     est_completion_tokens_saved = materialized_local * _AVG_COMPLETION_TOKENS_SAVED
+     est_cost_saved = (
+         (est_prompt_tokens_saved / 1000.0) * _EST_FRONTIER_INPUT_COST_PER_1K
+         + (est_completion_tokens_saved / 1000.0) * _EST_FRONTIER_OUTPUT_COST_PER_1K
+     )
+     return {
+         "cache_entries": len(_LOCAL_INFER_CACHE),
+         "warm_models": sorted(_MODEL_WARM_STATE.keys()),
+         "frontier_calls_avoided_est": materialized_local,
+         "prompt_tokens_saved_est": est_prompt_tokens_saved,
+         "completion_tokens_saved_est": est_completion_tokens_saved,
+         "cost_saved_usd_est": round(est_cost_saved, 4),
+         **{k: int(v) for k, v in _LOCAL_INFER_STATS.items()},
+     }
+
+
  def infer(prompt: str, provider_name: str | None = None) -> dict[str, str]:
      if not isinstance(prompt, str) or not prompt.strip():
          return {"status": "error", "error": "empty_prompt"}
 
      use_ollama = os.environ.get("OCMEMOG_USE_OLLAMA", "").lower() in {"1", "true", "yes"}
      model_override = provider_name or config.OCMEMOG_MEMORY_MODEL
-     if use_ollama or model_override.startswith("ollama:"):
+     if _looks_like_local_openai_model(model_override):
+         model = _normalize_local_model_name(model_override) or config.OCMEMOG_LOCAL_LLM_MODEL
+         return _infer_openai_compatible(
+             prompt,
+             base_url=config.OCMEMOG_LOCAL_LLM_BASE_URL,
+             model=model,
+             api_key=os.environ.get("OCMEMOG_LOCAL_LLM_API_KEY") or os.environ.get("LOCAL_LLM_API_KEY"),
+             provider_label="local-openai",
+         )
+     if use_ollama or _looks_like_ollama_model(model_override):
          model = model_override.split(":", 1)[-1] if model_override.startswith("ollama:") else model_override
          return _infer_ollama(prompt, model)
 
      api_key = os.environ.get("OCMEMOG_OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
      if not api_key:
-         # fall back to local ollama if configured
-         return _infer_ollama(prompt, config.OCMEMOG_OLLAMA_MODEL)
+         return _infer_openai_compatible(
+             prompt,
+             base_url=config.OCMEMOG_LOCAL_LLM_BASE_URL,
+             model=config.OCMEMOG_LOCAL_LLM_MODEL,
+             api_key=os.environ.get("OCMEMOG_LOCAL_LLM_API_KEY") or os.environ.get("LOCAL_LLM_API_KEY"),
+             provider_label="local-openai",
+         )
 
      model = model_override
-     url = f"{config.OCMEMOG_OPENAI_API_BASE.rstrip('/')}/chat/completions"
-     payload = {
-         "model": model,
-         "messages": [{"role": "user", "content": prompt}],
-         "temperature": 0.2,
-     }
-     data = json.dumps(payload).encode("utf-8")
-     req = urllib.request.Request(url, data=data, method="POST")
-     req.add_header("Authorization", f"Bearer {api_key}")
-     req.add_header("Content-Type", "application/json")
-
-     try:
-         with urllib.request.urlopen(req, timeout=30) as resp:
-             response = json.loads(resp.read().decode("utf-8"))
-     except Exception as exc:
-         emit_event(LOGFILE, "brain_infer_error", status="error", provider="openai", error=str(exc))
-         return {"status": "error", "error": f"request_failed:{exc}"}
-
-     try:
-         output = response["choices"][0]["message"]["content"]
-     except Exception as exc:
-         emit_event(LOGFILE, "brain_infer_error", status="error", provider="openai", error=str(exc))
-         return {"status": "error", "error": "invalid_response"}
-
-     return {"status": "ok", "output": str(output).strip()}
+     return _infer_openai_compatible(
+         prompt,
+         base_url=config.OCMEMOG_OPENAI_API_BASE,
+         model=model,
+         api_key=api_key,
+         provider_label="openai",
+     )
 
 
  def parse_operator_name(text: str) -> dict[str, str] | None:
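The prefix-routing helpers added in the hunk above decide which backend `infer` selects. A standalone sketch of that routing order (simplified reimplementations for illustration, not the package module itself; note that the local-openai check must run before the Ollama heuristic, since `local-openai:*` names also contain a colon):

```python
def looks_like_local_openai(name: str) -> bool:
    # "local-openai:", "local_openai:", or "llamacpp:" prefixes route to the
    # local OpenAI-compatible endpoint, as in _looks_like_local_openai_model.
    lowered = (name or "").strip().lower()
    return lowered.startswith(("local-openai:", "local_openai:", "llamacpp:"))


def looks_like_ollama(name: str) -> bool:
    # "ollama:" routes to Ollama explicitly; a bare "model:tag" name with no
    # slash also counts, mirroring _looks_like_ollama_model in the diff.
    lowered = (name or "").strip().lower()
    if lowered.startswith("ollama:"):
        return True
    if "/" in lowered:
        return False
    return ":" in lowered


def route(model_name: str) -> str:
    # Same precedence as infer(): local-openai first, then Ollama,
    # then the hosted OpenAI path as the default.
    if looks_like_local_openai(model_name):
        return "local-openai"
    if looks_like_ollama(model_name):
        return "ollama"
    return "openai"
```

Under this ordering, `local-openai:qwen2.5-7b-instruct` routes locally, `qwen2.5:7b` still hits the legacy Ollama path, and prefix-less names like `gpt-4o-mini` fall through to the hosted provider.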