npm - synapse-orch-ai - Versions diffs - 1.5.6 → 1.6.0 - Mend

synapse-orch-ai 1.5.6 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

package/backend/core/cache/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""
+Caching layer for the orchestration and agent system.
+Three sub-modules:
+- prompt_cache: provider-payload decorators (Anthropic cache_control, etc.)
+- tool_cache:   memoization for deterministic MCP/builder/custom tool results
+- response_cache: exact + semantic cache for LLM responses (skips AGENT steps)
+All caches are opt-in per step via StepConfig.cache_* and globally via settings.
+"""
+from core.cache import prompt_cache, tool_cache, response_cache, store
+__all__ = ["prompt_cache", "tool_cache", "response_cache", "store"]

package/backend/core/cache/prompt_cache.py ADDED Viewed

@@ -0,0 +1,166 @@
+"""
+Provider-payload decorators that turn on prompt caching.
+Caching pricing is asymmetric and provider-specific:
+- Anthropic: cache writes cost ~1.25x base input; cache reads cost ~0.1x base input.
+- OpenAI:    automatic for >=1024-token stable prefixes; reads are ~0.5x base input.
+- DeepSeek:  server-side automatic; reads ~0.1x base input. Reported via
+             `prompt_cache_hit_tokens` / `prompt_cache_miss_tokens`.
+- Gemini:    requires an explicit `cached_content` handle via client.caches.create().
+             Reads ~0.25x base input. Minimum TTL: 5 min.
+- Bedrock:   Anthropic-on-Bedrock supports `cachePoint` content blocks in Converse.
+Caching is gated by:
+1. global settings.prompt_cache_enabled
+2. system prompt length (Anthropic minimum ~1024 tokens for Sonnet/Opus,
+   ~2048 for Haiku; we use an approximate 4000-char (~1000-token) floor, so
+   prompts just above the floor may still fall below the provider minimum).
+When unsure or when a provider isn't supported, these helpers no-op so the
+caller's payload is unchanged.
+"""
+from typing import Any
+# Anthropic charges for cache writes; only worth it when the prefix is meaningful.
+# 4000 chars ≈ 1000 tokens — under Anthropic's minimum, the cache_control marker
+# is silently ignored, so this floor avoids paying for ineligible writes.
+# The floor is compared against len(system), i.e. characters, not tokens.
+MIN_CACHEABLE_CHARS: int = 4000
+# Separator emitted by core.tools.build_system_prompt between the stable section
+# (cacheable) and the volatile section (turn budget, current time, RAG context).
+# Splitting here keeps the cache prefix byte-stable across turns.
+# Only the first occurrence in a prompt is used as the split point.
+VOLATILE_SEPARATOR: str = "\n---\n"
+def is_cacheable_system(system: str | None) -> bool:
+    """Tell whether `system` is long enough to be worth a cache marker.
+    A missing or empty prompt is never cacheable; otherwise the prompt must be
+    at least MIN_CACHEABLE_CHARS characters long.
+    """
+    if not system:
+        return False
+    return len(system) >= MIN_CACHEABLE_CHARS
+def split_stable_volatile(system: str | None) -> tuple[str, str]:
+    """Split a system prompt at the first VOLATILE_SEPARATOR.
+    Returns (stable_prefix, volatile_suffix). The suffix keeps the separator
+    itself, so prefix + suffix reproduces the input. Without a separator the
+    whole prompt is the prefix; a missing/empty prompt yields ("", "").
+    """
+    if not system:
+        return "", ""
+    head, sep, tail = system.partition(VOLATILE_SEPARATOR)
+    if not sep:
+        # No separator present: nothing volatile to peel off.
+        return system, ""
+    return head, sep + tail
+# ── Anthropic ────────────────────────────────────────────────────────────────
+def decorate_anthropic_kwargs(kwargs: dict, system: str | None) -> dict:
+    """Mutate `kwargs` so the system prompt + tool block become cache breakpoints.
+    Anthropic supports up to 4 cache_control markers per request; we use 2:
+      - end of stable section of system prompt (1 marker)
+      - end of tools array (1 marker)
+    The system prompt is split on the VOLATILE_SEPARATOR ("\\n---\\n"). The
+    stable prefix is marked as cacheable; the volatile suffix (turn budget,
+    current time, RAG context) goes into a second uncached text block so
+    cache reads stay valid across turns even when those values change.
+    Args:
+        kwargs: request kwargs for the Anthropic client; updated in place.
+        system: the full system prompt text, or None.
+    Returns:
+        The same `kwargs` object. It is returned untouched when `system` is
+        not cacheable. When the prompt has no stable prefix (it starts with
+        the separator), `kwargs["system"]` is left as the caller set it and
+        only the tools marker is applied.
+    """
+    if not is_cacheable_system(system):
+        return kwargs
+    stable, volatile = split_stable_volatile(str(system))
+    # The API rejects empty / whitespace-only text blocks, and a cache marker on
+    # one is meaningless, so only rewrite `system` when there is a real prefix.
+    if stable.strip():
+        blocks: list[dict] = [{
+            "type": "text",
+            "text": stable,
+            "cache_control": {"type": "ephemeral"},
+        }]
+        if volatile:
+            blocks.append({"type": "text", "text": volatile})
+        kwargs["system"] = blocks
+    # Mark the last tool definition so the whole tools array is part of the prefix.
+    tools = kwargs.get("tools")
+    if isinstance(tools, list) and tools:
+        last = dict(tools[-1])  # shallow copy — don't mutate caller's list
+        last["cache_control"] = {"type": "ephemeral"}
+        kwargs["tools"] = tools[:-1] + [last]
+    return kwargs
+def extract_anthropic_cache_tokens(response) -> tuple[int, int]:
+    """Read (cache_read_tokens, cache_write_tokens) off an Anthropic SDK response.
+    Looks at `response.usage.cache_read_input_tokens` and
+    `response.usage.cache_creation_input_tokens`; a missing usage object or a
+    missing/None counter is reported as 0.
+    """
+    usage = getattr(response, "usage", None)
+    if not usage:
+        return 0, 0
+    counts = [
+        int(getattr(usage, field, 0) or 0)
+        for field in ("cache_read_input_tokens", "cache_creation_input_tokens")
+    ]
+    return counts[0], counts[1]
+# ── OpenAI / Grok / v1-compatible ────────────────────────────────────────────
+def extract_openai_cache_tokens(usage: dict) -> tuple[int, int]:
+    """Read (cache_read_tokens, cache_write_tokens) from an OpenAI-style usage dict.
+    Only reads are reported (`prompt_tokens_details.cached_tokens`); the write
+    count is always 0 because the first call just pays the normal input rate.
+    Anything that is not a dict yields (0, 0).
+    """
+    if isinstance(usage, dict):
+        details = usage.get("prompt_tokens_details") or {}
+        return int(details.get("cached_tokens") or 0), 0
+    return 0, 0
+# ── DeepSeek ─────────────────────────────────────────────────────────────────
+def extract_deepseek_cache_tokens(usage: dict) -> tuple[int, int]:
+    """Read (cache_read_tokens, cache_write_tokens) from a DeepSeek usage dict.
+    The read count comes from `prompt_cache_hit_tokens`. The write count is
+    always 0: DeepSeek has no explicit write tier — misses are billed at the
+    normal rate. Anything that is not a dict yields (0, 0).
+    """
+    if isinstance(usage, dict):
+        return int(usage.get("prompt_cache_hit_tokens") or 0), 0
+    return 0, 0
+# ── Gemini ───────────────────────────────────────────────────────────────────
+def extract_gemini_cache_tokens(response) -> tuple[int, int]:
+    """Read (cache_read_tokens, cache_write_tokens) from a Gemini response.
+    The read count is `response.usage_metadata.cached_content_token_count`;
+    the write count is always 0. Missing metadata or a missing/None counter
+    is reported as 0.
+    """
+    metadata = getattr(response, "usage_metadata", None)
+    if metadata:
+        cached = getattr(metadata, "cached_content_token_count", 0) or 0
+        return int(cached), 0
+    return 0, 0
+# ── Bedrock ──────────────────────────────────────────────────────────────────
+def decorate_bedrock_system_blocks(system_blocks: list[dict], system: str | None) -> list[dict]:
+    """Return the system blocks with a trailing cachePoint marker when eligible.
+    Bedrock's Converse API uses `{"cachePoint": {"type": "default"}}` instead
+    of inline cache_control. Only supported on a subset of models (Anthropic
+    Claude on Bedrock, Nova). Unsupported models silently ignore the marker.
+    The input list is never modified: either it is returned as-is (prompt not
+    cacheable, or no blocks to follow) or a new list is returned.
+    """
+    if not (is_cacheable_system(system) and system_blocks):
+        return system_blocks
+    # The marker goes after the existing text blocks.
+    return [*system_blocks, {"cachePoint": {"type": "default"}}]
+def extract_bedrock_cache_tokens(resp: dict) -> tuple[int, int]:
+    """Read (cache_read_tokens, cache_write_tokens) from a Bedrock response dict.
+    Counters live under `resp["usage"]`: reads in `cacheReadInputTokens`,
+    writes in `cacheWriteInputTokens` with `cacheCreationInputTokens` as the
+    fallback name. Anything that is not a dict yields (0, 0).
+    """
+    if not isinstance(resp, dict):
+        return 0, 0
+    usage = resp.get("usage") or {}
+    read_tokens = usage.get("cacheReadInputTokens") or 0
+    write_tokens = (
+        usage.get("cacheWriteInputTokens")
+        or usage.get("cacheCreationInputTokens")
+        or 0
+    )
+    return int(read_tokens), int(write_tokens)
+# ── Helper for callers ───────────────────────────────────────────────────────
+def cache_enabled(settings: dict | None) -> bool:
+    """Report the global prompt-cache toggle.
+    Reads `settings["prompt_cache_enabled"]`. Missing settings, empty
+    settings, or a missing key all count as enabled.
+    """
+    if settings:
+        return bool(settings.get("prompt_cache_enabled", True))
+    return True

package/backend/core/cache/response_cache.py ADDED Viewed

@@ -0,0 +1,204 @@
+"""
+LLM response cache — exact-match + optional semantic-match.
+Exact match: SHA256 of (model, system_prompt, messages, tools_json). O(1) lookup.
+Semantic match: embed the last user message, compare against prior cached entries
+                for the same (model, system_prompt) family.
+By design, this cache is OFF unless a caller explicitly opts in. AGENT steps in
+orchestration must NEVER consult it (their behaviour is state-dependent and the
+shared_state mutations from skipping the LLM call would diverge silently).
+LLM / EVALUATOR / EXTRACT_JSON steps can opt in safely.
+"""
+import json
+from typing import Any, Optional
+from core.cache import store
+NAMESPACE_EXACT = "responses_exact"
+# Semantic cache is opt-in per step; entries are scoped by step_id to keep
+# behaviour comparable to exact match (similar prompts on the same step only).
+NAMESPACE_SEMANTIC_PREFIX = "responses_semantic_"
+def _build_exact_key(
+    model: str,
+    system: str | None,
+    messages: list[dict] | None,
+    tools: list[dict] | None,
+) -> str:
+    """Build the exact-match cache key for one LLM request.
+    The key is `store.make_key` over the model, system prompt, messages and a
+    normalised view of the tools (name + parameter schema per tool).
+    Tool specs are read in two shapes:
+      - wrapped:  {"function": {"name": ..., "parameters": ...}}
+      - flat:     {"name": ..., "input_schema": ...} (or "parameters")
+    Without the flat fallback every un-wrapped tool would normalise to the
+    same empty entry, so requests with different tool sets would share a key.
+    Keys for wrapped tools are unchanged.
+    """
+    tools_norm: list[dict] = []
+    for t in tools or []:
+        spec = t if isinstance(t, dict) else {}
+        fn = spec.get("function")
+        if not isinstance(fn, dict):
+            # No usable "function" wrapper (absent or None): read the tool's
+            # name and schema from the top level instead.
+            fn = spec
+        tools_norm.append({
+            "name": fn.get("name", ""),
+            "params": fn.get("parameters", fn.get("input_schema", {})),
+        })
+    return store.make_key(
+        "resp",
+        model or "",
+        system or "",
+        messages or [],
+        tools_norm,
+    )
+def get_exact(
+    model: str,
+    system: str | None,
+    messages: list[dict] | None,
+    tools: list[dict] | None,
+) -> Optional[dict]:
+    """Look up an exact-match response for this request.
+    Returns the stored value — {"text", "input_tokens", "output_tokens"} as
+    written by set_exact — or None when the store has no live entry.
+    """
+    cached = store.get(NAMESPACE_EXACT, _build_exact_key(model, system, messages, tools))
+    return None if cached is None else cached.get("value")
+def set_exact(
+    model: str,
+    system: str | None,
+    messages: list[dict] | None,
+    tools: list[dict] | None,
+    *,
+    text: str,
+    input_tokens: int,
+    output_tokens: int,
+    ttl_seconds: int = 3600,
+    step_id: str | None = None,
+) -> None:
+    """Store a response under the exact-match key for this request.
+    The value saved is {"text", "input_tokens", "output_tokens"}; `model` and
+    `step_id` are recorded as entry metadata. `ttl_seconds` is passed through
+    to the store.
+    """
+    payload = {
+        "text": text,
+        "input_tokens": input_tokens,
+        "output_tokens": output_tokens,
+    }
+    entry_meta = {"model": model, "step_id": step_id}
+    cache_key = _build_exact_key(model, system, messages, tools)
+    store.set(NAMESPACE_EXACT, cache_key, payload, ttl_seconds=ttl_seconds, meta=entry_meta)
+# ── Semantic cache (optional, ChromaDB-backed via memory.MemoryStore) ─────────
+#
+# Implementation is intentionally light. We reuse the same embedding pipeline
+# the chat memory layer uses, store the (system+user) text in a per-step Chroma
+# collection, and persist the response text in our flat-file store keyed by
+# the document's ID. A high similarity threshold (0.95 by default) keeps
+# semantic hits limited to nearly-identical prompts.
+_semantic_collections: dict[str, Any] = {}
+def _get_memory_store():
+    """Fetch `core.server.memory_store`, or None when it cannot be resolved.
+    The import happens at call time; any failure while importing or reading
+    the attribute is swallowed and reported as None.
+    """
+    try:
+        from core import server as server_module
+        resolved = getattr(server_module, "memory_store", None)
+    except Exception:
+        resolved = None
+    return resolved
+def _get_semantic_collection(step_id: str):
+    """Lazy ChromaDB collection per step. Returns None on failure (cache disabled).
+    The result — including a None failure — is memoised per `step_id` in
+    `_semantic_collections`, so the lookup is attempted once per step.
+    The collection is created in cosine space because get_semantic() turns
+    the returned distance into a similarity with `1 - distance`, which is only
+    meaningful for cosine distance (Chroma's default is squared L2).
+    NOTE(review): a collection that already exists keeps the space it was
+    created with — confirm no pre-existing collections need migrating.
+    """
+    if step_id in _semantic_collections:
+        return _semantic_collections[step_id]
+    mem = _get_memory_store()
+    if mem is None or not getattr(mem, "client", None):
+        _semantic_collections[step_id] = None
+        return None
+    try:
+        coll = mem.client.get_or_create_collection(
+            name=f"{NAMESPACE_SEMANTIC_PREFIX}{step_id}",
+            metadata={"hnsw:space": "cosine"},
+        )
+        _semantic_collections[step_id] = coll
+        return coll
+    except Exception as e:
+        print(f"DEBUG cache: semantic cache unavailable ({e}); falling back to exact only")
+        _semantic_collections[step_id] = None
+        return None
+def _embed(text: str) -> Optional[list[float]]:
+    """Embed `text` with the memory store's `get_embedding`.
+    Returns None when no memory store is available or the embedding call
+    raises.
+    """
+    mem = _get_memory_store()
+    if mem is None:
+        return None
+    try:
+        vector = mem.get_embedding(text)
+    except Exception:
+        vector = None
+    return vector
+def get_semantic(
+    step_id: str,
+    model: str,
+    system: str | None,
+    user_message: str,
+    threshold: float = 0.95,
+) -> Optional[dict]:
+    """Return the response from the closest semantic neighbour, if any beat threshold.
+    Embeds `system + "\\n\\n" + user_message`, asks the step's collection for
+    its single nearest neighbour, and returns that neighbour's stored value
+    when its similarity is at least `threshold` and it was produced by the
+    same `model`. Every failure mode (no collection, no embedding, query
+    error, incomplete result, expired store entry) is a plain miss (None).
+    """
+    coll = _get_semantic_collection(step_id)
+    if coll is None:
+        return None
+    emb = _embed((system or "") + "\n\n" + user_message)
+    if emb is None:
+        return None
+    try:
+        res = coll.query(query_embeddings=[emb], n_results=1)
+    except Exception:
+        return None
+    ids = (res.get("ids") or [[]])[0]
+    distances = (res.get("distances") or [[]])[0]
+    metas = (res.get("metadatas") or [[]])[0]
+    # A hit needs both an id and its distance; treat a partial result as a miss
+    # rather than indexing into an empty list.
+    if not ids or not distances:
+        return None
+    # similarity = 1 - distance; assumes the collection reports cosine distance.
+    similarity = 1.0 - float(distances[0])
+    if similarity < threshold:
+        return None
+    # Metadata may be absent or None for an entry; without it the model cannot
+    # be verified, so do not serve the entry.
+    meta = metas[0] if metas else None
+    if not isinstance(meta, dict) or meta.get("model") != model:
+        return None
+    entry = store.get(NAMESPACE_EXACT, ids[0])
+    if entry is None:
+        return None
+    return entry.get("value")
+def set_semantic(
+    step_id: str,
+    model: str,
+    system: str | None,
+    user_message: str,
+    *,
+    text: str,
+    input_tokens: int,
+    output_tokens: int,
+    ttl_seconds: int = 3600,
+) -> None:
+    """Record a response so later near-identical prompts on this step can reuse it.
+    The response is written to the flat-file store first, then the prompt
+    embedding is upserted into the step's collection under the same ID. Does
+    nothing when the collection or the embedding is unavailable; an upsert
+    failure is printed and otherwise ignored.
+    """
+    coll = _get_semantic_collection(step_id)
+    if coll is None:
+        return
+    emb = _embed("\n\n".join((system or "", user_message)))
+    if emb is None:
+        return
+    # One hash serves as both the store key and the Chroma document ID. It is
+    # derived from (model, step_id, user_message) and stored in the exact-match
+    # namespace so get_semantic can resolve a Chroma hit with a single lookup.
+    doc_id = store.make_key("resp_semantic", model, step_id, user_message)
+    payload = {"text": text, "input_tokens": input_tokens, "output_tokens": output_tokens}
+    store.set(
+        NAMESPACE_EXACT,
+        doc_id,
+        payload,
+        ttl_seconds=ttl_seconds,
+        meta={"model": model, "step_id": step_id, "semantic": True},
+    )
+    snippet = (user_message or "")[:2000]
+    try:
+        coll.upsert(
+            ids=[doc_id],
+            embeddings=[emb],
+            documents=[snippet],
+            metadatas=[{"model": model, "step_id": step_id}],
+        )
+    except Exception as e:
+        print(f"DEBUG cache: semantic upsert failed ({e})")

package/backend/core/cache/store.py ADDED Viewed

@@ -0,0 +1,147 @@
+"""
+Shared disk-backed key/value store for the cache layer.
+Each cached value lives in its own JSON file under data/cache/<namespace>/<aa>/<full_hash>.json
+where <aa> is the first two hex chars of the hash (avoids cramming thousands of
+files into a single directory).
+Format on disk:
+{
+  "value": <jsonable>,
+  "created_at": <unix ts>,
+  "ttl_seconds": <int|None>,
+  "meta": {...}        // arbitrary caller metadata (tool_name, model, etc.)
+}
+The store is intentionally simple — no LRU, no compression, no Redis. The
+hot path is one open()+json.load() per lookup; for the dataset sizes we care
+about (tens of MB per namespace) this is well under a millisecond.
+"""
+import hashlib
+import json
+import os
+import threading
+import time
+from pathlib import Path
+from typing import Any, Optional
+from core.config import DATA_DIR
+CACHE_ROOT = Path(DATA_DIR) / "cache"
+_lock = threading.Lock()
+def _hash_key(key: str) -> str:
+    """Return the lowercase hex SHA-256 digest of `key` encoded as UTF-8."""
+    digest = hashlib.sha256()
+    digest.update(key.encode("utf-8"))
+    return digest.hexdigest()
+def _path_for(namespace: str, key_hash: str) -> Path:
+    """Map a hashed key to its file: CACHE_ROOT/<namespace>/<first 2 chars>/<hash>.json."""
+    shard = key_hash[:2]
+    filename = f"{key_hash}.json"
+    return CACHE_ROOT.joinpath(namespace, shard, filename)
+def make_key(*parts: Any) -> str:
+    """Hash arbitrary parts into one deterministic cache key.
+    Each part becomes a string — None as "\\x00", dicts and lists as compact
+    JSON with sorted keys (non-JSON values via str), anything else via str() —
+    and the strings are joined with "\\x1f" before hashing.
+    """
+    def _normalise(part: Any) -> str:
+        if part is None:
+            return "\x00"
+        if isinstance(part, (dict, list)):
+            # sort_keys keeps the hash stable regardless of dict insertion order.
+            return json.dumps(part, sort_keys=True, default=str, separators=(",", ":"))
+        return str(part)
+    return _hash_key("\x1f".join(_normalise(p) for p in parts))
+def get(namespace: str, key: str) -> Optional[dict]:
+    """Load the entry stored for `key` in `namespace`.
+    A 64-character lowercase-hex `key` is taken to be a hash already; any
+    other key is hashed first. Returns the whole entry dict (value, created_at,
+    ttl_seconds, meta), or None when the file is missing, unreadable, or
+    expired. An expired file is removed on the way out (best effort).
+    """
+    already_hashed = len(key) == 64 and all(c in "0123456789abcdef" for c in key)
+    key_hash = key if already_hashed else _hash_key(key)
+    target = _path_for(namespace, key_hash)
+    if not target.exists():
+        return None
+    try:
+        with open(target, "r", encoding="utf-8") as fh:
+            entry = json.load(fh)
+    except Exception:
+        return None
+    ttl = entry.get("ttl_seconds")
+    # A ttl of None or <= 0 means the entry never expires.
+    expired = (
+        ttl is not None
+        and ttl > 0
+        and time.time() - entry.get("created_at", 0) > ttl
+    )
+    if not expired:
+        return entry
+    try:
+        target.unlink()
+    except Exception:
+        pass
+    return None
+def set(namespace: str, key: str, value: Any, ttl_seconds: Optional[int] = None, meta: Optional[dict] = None) -> str:
+    """Write `value` for `key` in `namespace` and return the key hash.
+    A 64-character lowercase-hex `key` is used as the hash directly; any other
+    key is hashed first. The entry is written to a sibling ".tmp" file and
+    then moved over the final path with os.replace, all under the module lock.
+    """
+    already_hashed = len(key) == 64 and all(c in "0123456789abcdef" for c in key)
+    key_hash = key if already_hashed else _hash_key(key)
+    target = _path_for(namespace, key_hash)
+    record = {
+        "value": value,
+        "created_at": time.time(),
+        "ttl_seconds": ttl_seconds,
+        "meta": meta or {},
+    }
+    with _lock:
+        target.parent.mkdir(parents=True, exist_ok=True)
+        scratch = target.with_suffix(".tmp")
+        with open(scratch, "w", encoding="utf-8") as fh:
+            # default=str stringifies anything json cannot encode natively.
+            json.dump(record, fh, ensure_ascii=False, default=str)
+        os.replace(scratch, target)
+    return key_hash
+def delete(namespace: str, key: str) -> bool:
+    """Remove the entry for `key` in `namespace`.
+    Returns True only when a file existed and was removed; a missing file or a
+    failed unlink gives False.
+    """
+    already_hashed = len(key) == 64 and all(c in "0123456789abcdef" for c in key)
+    key_hash = key if already_hashed else _hash_key(key)
+    target = _path_for(namespace, key_hash)
+    if not target.exists():
+        return False
+    try:
+        target.unlink()
+    except Exception:
+        return False
+    return True
+def clear_namespace(namespace: str) -> int:
+    """Remove every "*.json" entry below CACHE_ROOT/<namespace>.
+    Returns how many files were actually deleted; files that cannot be removed
+    are skipped. A namespace directory that does not exist yields 0. The
+    sweep runs under the module lock.
+    """
+    root = CACHE_ROOT / namespace
+    if not root.exists():
+        return 0
+    deleted = 0
+    with _lock:
+        for entry_path in root.rglob("*.json"):
+            try:
+                entry_path.unlink()
+            except Exception:
+                continue
+            deleted += 1
+    return deleted
+def stats() -> dict:
+    """Summarise the cache on disk as {namespace: {"entries": n, "bytes": b}}.
+    Namespaces are the directories directly under CACHE_ROOT. Every "*.json"
+    file counts as an entry; a file whose size cannot be read still counts
+    but adds no bytes. Returns {} when CACHE_ROOT does not exist.
+    """
+    summary: dict[str, dict] = {}
+    if not CACHE_ROOT.exists():
+        return summary
+    for ns_dir in CACHE_ROOT.iterdir():
+        if not ns_dir.is_dir():
+            continue
+        files = list(ns_dir.rglob("*.json"))
+        total_bytes = 0
+        for entry_path in files:
+            try:
+                total_bytes += entry_path.stat().st_size
+            except Exception:
+                pass
+        summary[ns_dir.name] = {"entries": len(files), "bytes": total_bytes}
+    return summary

package/backend/core/cache/tool_cache.py ADDED Viewed

@@ -0,0 +1,71 @@
+"""
+Deterministic tool-result memoization.
+Only tools in DETERMINISTIC_TOOLS are eligible — anything that reads live state
+(bash, sql_agent, web_scraper, sandbox) is bypassed because cached results
+would silently mask reality.
+Scope rules:
+- "session": key includes the session_id (e.g. personal_details, user-bound configs)
+- "global":  key includes only tool_name + args (e.g. code_search, pdf_parser)
+"""
+from typing import Any, Optional
+from core.cache import store
+NAMESPACE = "tool_results"
+# Maps tool name → scope. Listed conservatively: only tools whose output is a
+# pure function of their args (and optionally the per-user session).
+# Scope decides how the cache key is built: "session" mixes the session_id
+# into the key, "global" keys on tool name + args only.
+# NOTE(review): "time" and "collect_data" do not obviously fit the "pure
+# function of args" rule — if "time" returns the current time, a cached value
+# would be served stale for the whole TTL. Confirm against the tool
+# implementations before relying on these entries.
+DETERMINISTIC_TOOLS: dict[str, str] = {
+    "code_search":      "global",
+    "pdf_parser":       "global",
+    "xlsx_parser":      "global",
+    "time":             "global",
+    "code_indexer":     "global",
+    "collect_data":     "global",
+    "personal_details": "session",
+}
+def is_cacheable(tool_name: str) -> bool:
+    """Tell whether `tool_name` is listed in DETERMINISTIC_TOOLS."""
+    listed = tool_name in DETERMINISTIC_TOOLS
+    return listed
+def _key(tool_name: str, tool_args: dict, session_id: Optional[str]) -> str:
+    """Build the store key for one tool call.
+    Session-scoped tools key on the session_id when one is given; global
+    tools — and session-scoped calls without a session_id — use the shared
+    "_global_" marker. Unlisted tools are treated as global.
+    """
+    scope = DETERMINISTIC_TOOLS.get(tool_name, "global")
+    if scope == "session" and session_id:
+        sid = session_id
+    else:
+        sid = "_global_"
+    return store.make_key("tool", tool_name, sid, tool_args or {})
+def get(tool_name: str, tool_args: dict, session_id: Optional[str] = None) -> Optional[Any]:
+    """Return the cached tool result, or None if there's no live entry.
+    Args:
+        tool_name: tool to look up; tools not in DETERMINISTIC_TOOLS always miss.
+        tool_args: arguments of the call; part of the cache key.
+        session_id: required for session-scoped tools.
+    Returns:
+        The stored result, or None on a miss.
+    """
+    if not is_cacheable(tool_name):
+        return None
+    # A session-scoped result must never be read from the shared bucket:
+    # without a session_id the key would fall back to "_global_" and one
+    # caller's data could be served to another. Treat that as a miss.
+    if DETERMINISTIC_TOOLS.get(tool_name) == "session" and not session_id:
+        return None
+    entry = store.get(NAMESPACE, _key(tool_name, tool_args, session_id))
+    if entry is None:
+        return None
+    return entry.get("value")
+def set(
+    tool_name: str,
+    tool_args: dict,
+    result: Any,
+    ttl_seconds: int = 3600,
+    session_id: Optional[str] = None,
+) -> None:
+    """Store a tool result so an identical later call can reuse it.
+    Does nothing for tools not in DETERMINISTIC_TOOLS, and nothing for a
+    session-scoped tool called without a session_id.
+    Args:
+        tool_name: tool that produced `result`.
+        tool_args: arguments of the call; part of the cache key.
+        result: value to store.
+        ttl_seconds: lifetime passed through to the store.
+        session_id: required for session-scoped tools.
+    """
+    if not is_cacheable(tool_name):
+        return
+    # A session-scoped result must never land in the shared bucket: without a
+    # session_id the key would fall back to "_global_" and the value would be
+    # visible to every other caller that also lacks one.
+    if DETERMINISTIC_TOOLS.get(tool_name) == "session" and not session_id:
+        return
+    store.set(
+        NAMESPACE,
+        _key(tool_name, tool_args, session_id),
+        result,
+        ttl_seconds=ttl_seconds,
+        meta={"tool_name": tool_name, "scope": DETERMINISTIC_TOOLS.get(tool_name)},
+    )
+def clear_tool(tool_name: str) -> int:
+    """Helper for manual invalidation (e.g. after the user re-indexes their codebase).
+    Deletes every entry in the tool-result namespace whose metadata names
+    `tool_name`, and returns how many files were removed.
+    Entries are sharded by key hash, not by tool, so there is no per-tool
+    directory to clear; each entry's stored `meta["tool_name"]` is inspected
+    instead. Unreadable or malformed files are left alone.
+    """
+    import json  # local import: this module does not import json at the top
+    base = store.CACHE_ROOT / NAMESPACE
+    if not base.exists():
+        return 0
+    removed = 0
+    for path in base.rglob("*.json"):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                entry = json.load(f)
+        except (OSError, ValueError):
+            continue
+        meta = entry.get("meta") if isinstance(entry, dict) else None
+        if not isinstance(meta, dict) or meta.get("tool_name") != tool_name:
+            continue
+        try:
+            path.unlink()
+        except OSError:
+            continue
+        removed += 1
+    return removed

package/backend/core/config.py CHANGED Viewed

@@ -50,8 +50,13 @@ def load_settings():
         "global_config": {},
         "vault_enabled": True,
         "vault_threshold": 100000,
-        "auto_compact_enabled": False,
-        "auto_compact_threshold": 100000,
+        "auto_compact_enabled": True,
+        "auto_compact_threshold": 80000,
+        # Prompt caching: decorate provider payloads with cache_control markers
+        # so subsequent ReAct turns reuse the cached system + tools prefix.
+        # ~50–80% cost reduction on multi-turn agents at the cost of a 25% write
+        # surcharge on the first turn. Disable only if a provider misbehaves.
+        "prompt_cache_enabled": True,
         "allow_db_write": False,
         "coding_agent_enabled": True,
         "report_agent_enabled": True,