superlocalmemory 3.3.29 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ATTRIBUTION.md +1 -1
- package/CHANGELOG.md +3 -0
- package/LICENSE +633 -70
- package/README.md +14 -11
- package/docs/screenshots/01-dashboard-main.png +0 -0
- package/docs/screenshots/02-knowledge-graph.png +0 -0
- package/docs/screenshots/03-patterns-learning.png +0 -0
- package/docs/screenshots/04-learning-dashboard.png +0 -0
- package/docs/screenshots/05-behavioral-analysis.png +0 -0
- package/docs/screenshots/06-graph-communities.png +0 -0
- package/docs/v2-archive/ACCESSIBILITY.md +1 -1
- package/docs/v2-archive/FRAMEWORK-INTEGRATIONS.md +1 -1
- package/docs/v2-archive/MCP-MANUAL-SETUP.md +1 -1
- package/docs/v2-archive/SEARCH-ENGINE-V2.2.0.md +2 -2
- package/docs/v2-archive/SEARCH-INTEGRATION-GUIDE.md +1 -1
- package/docs/v2-archive/UNIVERSAL-INTEGRATION.md +1 -1
- package/docs/v2-archive/V2.2.0-OPTIONAL-SEARCH.md +1 -1
- package/docs/v2-archive/example_graph_usage.py +1 -1
- package/ide/configs/codex-mcp.toml +1 -1
- package/ide/integrations/langchain/README.md +1 -1
- package/ide/integrations/langchain/langchain_superlocalmemory/__init__.py +1 -1
- package/ide/integrations/langchain/langchain_superlocalmemory/chat_message_history.py +1 -1
- package/ide/integrations/langchain/pyproject.toml +2 -2
- package/ide/integrations/langchain/tests/__init__.py +1 -1
- package/ide/integrations/langchain/tests/test_chat_message_history.py +1 -1
- package/ide/integrations/langchain/tests/test_security.py +1 -1
- package/ide/integrations/llamaindex/llama_index/storage/chat_store/superlocalmemory/__init__.py +1 -1
- package/ide/integrations/llamaindex/llama_index/storage/chat_store/superlocalmemory/base.py +1 -1
- package/ide/integrations/llamaindex/pyproject.toml +2 -2
- package/ide/integrations/llamaindex/tests/__init__.py +1 -1
- package/ide/integrations/llamaindex/tests/test_chat_store.py +1 -1
- package/ide/integrations/llamaindex/tests/test_security.py +1 -1
- package/ide/skills/slm-build-graph/SKILL.md +3 -3
- package/ide/skills/slm-list-recent/SKILL.md +3 -3
- package/ide/skills/slm-recall/SKILL.md +3 -3
- package/ide/skills/slm-remember/SKILL.md +3 -3
- package/ide/skills/slm-show-patterns/SKILL.md +3 -3
- package/ide/skills/slm-status/SKILL.md +3 -3
- package/ide/skills/slm-switch-profile/SKILL.md +3 -3
- package/package.json +3 -3
- package/pyproject.toml +3 -3
- package/src/superlocalmemory/core/engine_wiring.py +5 -1
- package/src/superlocalmemory/core/graph_analyzer.py +254 -12
- package/src/superlocalmemory/learning/consolidation_worker.py +240 -52
- package/src/superlocalmemory/retrieval/entity_channel.py +135 -4
- package/src/superlocalmemory/retrieval/spreading_activation.py +45 -0
- package/src/superlocalmemory/server/api.py +9 -1
- package/src/superlocalmemory/server/routes/behavioral.py +8 -4
- package/src/superlocalmemory/server/routes/chat.py +320 -0
- package/src/superlocalmemory/server/routes/insights.py +368 -0
- package/src/superlocalmemory/server/routes/learning.py +106 -6
- package/src/superlocalmemory/server/routes/memories.py +20 -9
- package/src/superlocalmemory/server/routes/stats.py +25 -3
- package/src/superlocalmemory/server/routes/timeline.py +252 -0
- package/src/superlocalmemory/server/routes/v3_api.py +161 -0
- package/src/superlocalmemory/server/ui.py +8 -0
- package/src/superlocalmemory/ui/index.html +168 -58
- package/src/superlocalmemory/ui/js/graph-event-bus.js +83 -0
- package/src/superlocalmemory/ui/js/graph-filters.js +1 -1
- package/src/superlocalmemory/ui/js/knowledge-graph.js +942 -0
- package/src/superlocalmemory/ui/js/memory-chat.js +344 -0
- package/src/superlocalmemory/ui/js/memory-timeline.js +265 -0
- package/src/superlocalmemory/ui/js/quick-actions.js +334 -0
- package/src/superlocalmemory.egg-info/PKG-INFO +597 -0
- package/src/superlocalmemory.egg-info/SOURCES.txt +287 -0
- package/src/superlocalmemory.egg-info/dependency_links.txt +1 -0
- package/src/superlocalmemory.egg-info/entry_points.txt +2 -0
- package/src/superlocalmemory.egg-info/requires.txt +47 -0
- package/src/superlocalmemory.egg-info/top_level.txt +1 -0
|
@@ -25,8 +25,8 @@ try:
|
|
|
25
25
|
from superlocalmemory.learning.behavioral import BehavioralPatternStore
|
|
26
26
|
from superlocalmemory.learning.outcomes import OutcomeTracker
|
|
27
27
|
BEHAVIORAL_AVAILABLE = True
|
|
28
|
-
except ImportError:
|
|
29
|
-
logger.
|
|
28
|
+
except ImportError as e:
|
|
29
|
+
logger.warning("V3 behavioral engine import failed: %s", e)
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
@router.get("/api/behavioral/status")
|
|
@@ -66,9 +66,13 @@ async def behavioral_status():
|
|
|
66
66
|
try:
|
|
67
67
|
store = BehavioralPatternStore(db_path)
|
|
68
68
|
patterns = store.get_patterns(profile_id=profile)
|
|
69
|
-
|
|
69
|
+
# Count patterns spanning multiple projects
|
|
70
|
+
cross_project_transfers = len([
|
|
71
|
+
p for p in patterns
|
|
72
|
+
if isinstance(p, dict) and p.get("project_count", 1) > 1
|
|
73
|
+
])
|
|
70
74
|
except Exception as exc:
|
|
71
|
-
logger.
|
|
75
|
+
logger.warning("pattern store error: %s", exc)
|
|
72
76
|
|
|
73
77
|
return {
|
|
74
78
|
"available": True,
|
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under AGPL-3.0-or-later — see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory v3.4.1 | https://qualixar.com
|
|
4
|
+
|
|
5
|
+
"""Ask My Memory — SSE chat endpoint.
|
|
6
|
+
|
|
7
|
+
Flow: query → 6-channel retrieval → format context → LLM stream → SSE
|
|
8
|
+
Mode A: No LLM, returns formatted retrieval results.
|
|
9
|
+
Mode B: Ollama local streaming via /api/chat.
|
|
10
|
+
Mode C: Cloud LLM streaming (OpenAI-compatible).
|
|
11
|
+
|
|
12
|
+
Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import json
|
|
19
|
+
import logging
|
|
20
|
+
import re
|
|
21
|
+
from typing import AsyncGenerator
|
|
22
|
+
|
|
23
|
+
import httpx
|
|
24
|
+
from fastapi import APIRouter, Request
|
|
25
|
+
from fastapi.responses import StreamingResponse
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
router = APIRouter(tags=["chat"])
|
|
30
|
+
|
|
31
|
+
# Citation marker pattern: [MEM-1], [MEM-2], etc.
|
|
32
|
+
_CITATION_RE = re.compile(r"\[MEM-(\d+)\]")
|
|
33
|
+
|
|
34
|
+
# System prompt for LLM — instructs citation usage
|
|
35
|
+
_SYSTEM_PROMPT = (
|
|
36
|
+
"You are a memory assistant. Answer the user's question using ONLY the "
|
|
37
|
+
"provided memories. When you use information from a memory, include its "
|
|
38
|
+
"marker inline, e.g. [MEM-1]. If no memories are relevant, say so. "
|
|
39
|
+
"Be concise and factual."
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ── SSE Stream Endpoint ─────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
@router.post("/api/v3/chat/stream")
async def chat_stream(request: Request):
    """Stream a memory-grounded chat response via SSE.

    Body: {"query": "...", "mode": "a"|"b"|"c", "limit": 10}
    Response: text/event-stream with events: token, citation, done, error

    Fix: ``limit`` is coerced to int and clamped to [1, 20]. Previously a
    non-numeric JSON value (e.g. ``"limit": "ten"``) raised a TypeError
    inside ``min()``, and zero/negative limits were passed through.
    """
    try:
        body = await request.json()
    except Exception:
        # Malformed body: answer with a one-shot SSE error stream so the
        # client-side EventSource handling stays uniform.
        return StreamingResponse(
            _sse_error("Invalid JSON body"),
            media_type="text/event-stream",
        )

    query = (body.get("query") or "").strip()
    if not query:
        return StreamingResponse(
            _sse_error("Query is required"),
            media_type="text/event-stream",
        )

    mode = (body.get("mode") or "a").lower()

    # Coerce and clamp the requested result count; fall back to the
    # default on garbage input instead of raising.
    try:
        limit = int(body.get("limit", 10))
    except (TypeError, ValueError):
        limit = 10
    limit = max(1, min(limit, 20))

    return StreamingResponse(
        _stream_chat(query, mode, limit),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",  # nginx: disable proxy buffering
            "Content-Encoding": "identity",  # bypass GZipMiddleware
        },
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ── Core Chat Logic ──────────────────────────────────────────────
|
|
82
|
+
|
|
83
|
+
async def _stream_chat(
    query: str, mode: str, limit: int,
) -> AsyncGenerator[str, None]:
    """Retrieve memories, then stream the answer with citations.

    Pipeline: retrieval (executor) → citation events → mode dispatch →
    terminal "done" event. All failures are reported as SSE events; this
    generator never raises into the response machinery.

    Fix: use ``asyncio.get_running_loop()`` — ``get_event_loop()`` inside
    a coroutine is deprecated since Python 3.10.
    """
    # Step 1: retrieval is synchronous (WorkerPool), so run it in the
    # default executor to keep the event loop responsive.
    memories = []
    try:
        loop = asyncio.get_running_loop()
        memories = await loop.run_in_executor(None, _recall_memories, query, limit)
    except Exception as exc:
        yield _sse_event("error", json.dumps({"message": f"Retrieval failed: {exc}"}))
        yield _sse_event("done", "")
        return

    if not memories:
        yield _sse_event("token", "No relevant memories found for your query.")
        yield _sse_event("done", "")
        return

    # Step 2: emit one citation event per memory. Indices are 1-based to
    # match the [MEM-n] markers used in the LLM prompt/context.
    for i, mem in enumerate(memories):
        citation_data = {
            "index": i + 1,
            "fact_id": mem.get("fact_id", ""),
            "content_preview": (mem.get("content") or "")[:80],
            "trust_score": mem.get("trust_score", 0),
            "score": mem.get("score", 0),
        }
        yield _sse_event("citation", json.dumps(citation_data))

    # Step 3: route to the selected answer mode.
    if mode == "a":
        # Mode A: no LLM — formatted retrieval results only.
        async for event in _stream_mode_a(query, memories):
            yield event
    elif mode in ("b", "c"):
        # Mode B/C: LLM streaming (Ollama or cloud).
        async for event in _stream_mode_bc(query, memories, mode):
            yield event
    else:
        yield _sse_event("token", "Unknown mode. Use a, b, or c.")

    yield _sse_event("done", "")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# ── Mode A: Raw Retrieval Results ────────────────────────────────
|
|
130
|
+
|
|
131
|
+
async def _stream_mode_a(
    query: str, memories: list,
) -> AsyncGenerator[str, None]:
    """Render retrieval results directly as SSE tokens (Mode A, no LLM).

    Mode A is zero-cloud: there is no model to summarize, so the raw
    retrieval hits are shown in a structured markdown layout. Users who
    want conversational answers switch to Mode B (Ollama) or Mode C
    (Cloud) in Settings.
    """
    intro = (
        "**Mode A — Raw Memory Retrieval** (no LLM connected)\n",
        "For AI-powered answers, switch to Mode B or C in Settings.\n",
        f"Found **{len(memories)}** relevant memories for: *{query}*\n\n",
    )
    for line in intro:
        yield _sse_event("token", line)
    await asyncio.sleep(0.03)

    # One token event per memory, with a short pause to give the UI a
    # streaming feel even without an LLM.
    for idx, mem in enumerate(memories, start=1):
        body_text = mem.get("content") or mem.get("source_content") or ""
        relevance = mem.get("score", 0)
        trust = mem.get("trust_score", 0)
        yield _sse_event(
            "token",
            f"**[MEM-{idx}]** (relevance: {relevance:.2f}, trust: {trust:.2f})\n"
            f"{body_text}\n\n",
        )
        await asyncio.sleep(0.03)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# ── Mode B/C: LLM Streaming ─────────────────────────────────────
|
|
158
|
+
|
|
159
|
+
async def _stream_mode_bc(
    query: str, memories: list, mode: str,
) -> AsyncGenerator[str, None]:
    """Stream an LLM response grounded in the retrieved memories.

    Builds a [MEM-n]-annotated context block, loads the LLM config, then
    streams tokens from the configured provider. Degrades gracefully:
    missing config yields an advisory token; an unset provider falls back
    to Mode A output; connection/LLM errors become inline tokens rather
    than exceptions.
    """

    # Build context with citation markers. Indices are 1-based so they
    # line up with the citation events already sent by _stream_chat.
    context_parts = []
    for i, mem in enumerate(memories):
        content = mem.get("content") or mem.get("source_content") or ""
        trust = mem.get("trust_score", 0)
        context_parts.append(f"[MEM-{i+1}] {content} (trust: {trust:.2f})")
    context = "\n".join(context_parts)

    messages = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": f"Memories:\n{context}\n\nQuestion: {query}"},
    ]

    # Load LLM config (lazy import keeps module import light and tolerates
    # a missing/broken config module at request time).
    try:
        from superlocalmemory.core.config import SLMConfig
        config = SLMConfig.load()
        provider = config.llm.provider or ""
        model = config.llm.model or ""
        api_key = config.llm.api_key or ""
        api_base = config.llm.api_base or ""
    except Exception:
        yield _sse_event("token", "LLM not configured. Use Mode A or configure a provider in Settings.")
        return

    if not provider:
        # No provider configured: fall back to raw retrieval output.
        yield _sse_event("token", "No LLM provider configured. Showing raw results instead.\n\n")
        async for event in _stream_mode_a(query, memories):
            yield event
        return

    # Stream from provider.
    # NOTE(review): every non-ollama provider — including "anthropic" —
    # is routed through _stream_openai_compat, which parses OpenAI-style
    # `choices[0].delta.content` chunks; confirm Anthropic's SSE shape
    # actually works through that path.
    try:
        if provider == "ollama":
            async for token in _stream_ollama(messages, model, api_base):
                yield _sse_event("token", token)
        else:
            async for token in _stream_openai_compat(
                messages, model, api_key, api_base, provider,
            ):
                yield _sse_event("token", token)
    except httpx.ConnectError:
        yield _sse_event("token", f"\n\n[Connection failed — is {provider} running?]")
    except Exception as exc:
        yield _sse_event("token", f"\n\n[LLM error: {exc}]")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# ── Ollama Streaming (/api/chat with messages) ───────────────────
|
|
212
|
+
|
|
213
|
+
async def _stream_ollama(
    messages: list, model: str, api_base: str,
) -> AsyncGenerator[str, None]:
    """Stream tokens from the Ollama /api/chat endpoint.

    Yields each content fragment as it arrives; stops at the terminal
    ``done`` chunk. Non-JSON lines are skipped silently.
    """
    import os

    host = api_base or os.environ.get("OLLAMA_HOST", "http://localhost:11434")
    endpoint = host.rstrip("/") + "/api/chat"

    request_body = {
        "model": model or "llama3.2",
        "messages": messages,
        "stream": True,
        "options": {"num_predict": 1024, "temperature": 0.3, "num_ctx": 4096},
    }

    async with httpx.AsyncClient(timeout=httpx.Timeout(120.0)) as client:
        async with client.stream("POST", endpoint, json=request_body) as resp:
            resp.raise_for_status()
            # Ollama streams one JSON object per line.
            async for raw_line in resp.aiter_lines():
                if not raw_line:
                    continue
                try:
                    parsed = json.loads(raw_line)
                except json.JSONDecodeError:
                    continue
                if parsed.get("done"):
                    break
                fragment = parsed.get("message", {}).get("content", "")
                if fragment:
                    yield fragment
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
# ── OpenAI-Compatible Streaming ──────────────────────────────────
|
|
246
|
+
|
|
247
|
+
async def _stream_openai_compat(
    messages: list, model: str, api_key: str,
    api_base: str, provider: str,
) -> AsyncGenerator[str, None]:
    """Stream tokens from an OpenAI-compatible API (OpenAI, Azure, OpenRouter).

    Resolves the endpoint URL and auth headers per provider, then parses
    ``data:``-prefixed SSE lines, yielding ``choices[0].delta.content``
    fragments until the ``[DONE]`` sentinel.
    """
    if provider == "azure":
        # Azure uses the full deployment URL and an "api-key" header.
        endpoint = api_base
        headers = {"api-key": api_key, "Content-Type": "application/json"}
    elif provider == "openrouter":
        endpoint = api_base or "https://openrouter.ai/api/v1/chat/completions"
        headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    elif provider == "anthropic":
        # Anthropic uses a different streaming format — simplified here
        endpoint = api_base or "https://api.anthropic.com/v1/messages"
        headers = {
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json",
        }
    else:
        endpoint = api_base or "https://api.openai.com/v1/chat/completions"
        headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

    request_body = {
        "model": model,
        "messages": messages,
        "stream": True,
        "max_tokens": 1024,
        "temperature": 0.3,
    }

    async with httpx.AsyncClient(timeout=httpx.Timeout(120.0)) as client:
        async with client.stream("POST", endpoint, json=request_body, headers=headers) as resp:
            resp.raise_for_status()
            async for raw_line in resp.aiter_lines():
                if not raw_line.startswith("data: "):
                    continue
                payload_text = raw_line[6:]  # strip the "data: " prefix
                if payload_text == "[DONE]":
                    break
                try:
                    parsed = json.loads(payload_text)
                except json.JSONDecodeError:
                    continue
                fragment = parsed.get("choices", [{}])[0].get("delta", {}).get("content", "")
                if fragment:
                    yield fragment
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
# ── Retrieval Helper ─────────────────────────────────────────────
|
|
299
|
+
|
|
300
|
+
def _recall_memories(query: str, limit: int) -> list:
    """Run 6-channel retrieval via WorkerPool (synchronous, runs in executor)."""
    # Lazy import: avoids pulling the worker pool in at module import time.
    from superlocalmemory.core.worker_pool import WorkerPool

    response = WorkerPool.shared().recall(query, limit=limit)
    # Only trust results when the pool reports success.
    return response.get("results", []) if response.get("ok") else []
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# ── SSE Formatting ───────────────────────────────────────────────
|
|
311
|
+
|
|
312
|
+
def _sse_event(event_type: str, data: str) -> str:
|
|
313
|
+
"""Format a single SSE event."""
|
|
314
|
+
return f"event: {event_type}\ndata: {data}\n\n"
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
async def _sse_error(message: str) -> AsyncGenerator[str, None]:
    """Yield one SSE error event, then the terminal done event."""
    payload = json.dumps({"message": message})
    yield _sse_event("error", payload)
    yield _sse_event("done", "")
|