PyPI - memorymaster - Versions diffs - 3.5.2__tar.gz → 3.6.0__tar.gz - Mend

memorymaster 3.5.2__tar.gz → 3.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (321) hide show

{memorymaster-3.5.2/memorymaster.egg-info → memorymaster-3.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: memorymaster
-Version: 3.5.2
+Version: 3.6.0
 Summary: Production-grade memory reliability system for AI coding agents. Lifecycle-managed claims with citations, conflict detection, steward governance, and MCP integration.
 Author: wolverin0
 License: MIT

{memorymaster-3.5.2 → memorymaster-3.6.0}/memorymaster/entity_extractor.py RENAMED Viewed

@@ -496,7 +496,7 @@ def extract_patterns(text: str) -> list[Entity]:
 # Version identifier baked into the prompt. Bump this string when the prompt
 # changes so that downstream idempotency / caching keys invalidate cleanly.
-LLM_PROMPT_VERSION = "entity-l2-v2-2026-04-25"
+LLM_PROMPT_VERSION = "entity-l2-v3-2026-04-27"
 # Permitted entity kinds for Layer-2. Any `kind` returned by the LLM that is
 # not in this set is dropped to keep the registry schema predictable.
@@ -513,22 +513,50 @@ LLM_KINDS: frozenset[str] = frozenset(
 _LLM_ENV_FLAG = "MEMORYMASTER_ENTITY_LLM"
 _LLM_MAX_TEXT_CHARS = 4000  # Truncate long claims before sending to LLM.
-_LLM_MAX_ENTITIES = 8       # Hard cap to keep cost bounded per claim.
+_LLM_MAX_ENTITIES = 5       # v3: tightened from 8 to 5 — overgeneration was the
+                            # dominant failure mode in v2 backfill batches.
-_LLM_PROMPT = f"""Extract entities from the snippet that regex cannot catch.
+_LLM_PROMPT = f"""Extract HIGH-VALUE entities from the snippet — only ones a future agent would search for by name. Be conservative.
 Prompt version: {LLM_PROMPT_VERSION}
-Allowed kinds: person_name, spanish_surname, time_expression, model_name, library_name, concept.
-Skip: file paths, env-vars, hostnames, ports, commit SHAs, tool names.
-Max {_LLM_MAX_ENTITIES} entities. Output STRICT JSON ARRAY only — no prose, no code fence.
+ALLOWED kinds (return ONE per entity): person_name, spanish_surname, time_expression, model_name, library_name, concept.
+WHEN IN DOUBT, SKIP. A future agent searching memory for this claim should be searching by the entity name itself, not by a generic word.
+ALWAYS SKIP:
+- File paths, directories, env vars, hostnames, IPs, ports
+- Commit SHAs, branch names, tool names like "git", "docker", "npm", "sqlite", "psql"
+- Generic English words: "system", "config", "service", "module", "function", "component", "data", "process", "task", "user"
+- Generic Spanish words: "sistema", "config", "servicio", "modulo", "funcion", "componente", "datos", "proceso", "tarea", "usuario", "cosa", "caso"
+- Standalone numbers, percentages, dates already in YYYY-MM-DD form
+- HTML/CSS class names, JSON keys, code identifiers in snake_case or camelCase
+Quality bar by kind:
+- person_name: full name (≥2 capitalized words) of a real person, NOT a role like "user" or "developer"
+- spanish_surname: bare surname when it stands alone WITHOUT a first name
+- time_expression: relative phrases like "next Thursday", "el lunes pasado", "Q3 2026" — NOT absolute YYYY-MM-DD dates
+- model_name: AI model identifier with a recognizable family prefix (gpt-, claude-, gemini-, llama-, mistral-) AND a version
+- library_name: a SPECIFIC named library/framework like "FastAPI", "React", "pyafipws" — NOT "the API" or "the framework"
+- concept: a named domain concept (3+ words usually) that appears as a noun-phrase a person would research, like "RRF fusion", "byzantine consensus", "writer-lock contention" — NOT generic ideas
+Output STRICT JSON ARRAY only — no prose, no code fence. Max {_LLM_MAX_ENTITIES} entities. If nothing in the snippet rises to the bar, return [].
 Schema (use EXACT field names):
   [{{"kind": "...", "surface_form": "exact substring from text", "aliases": []}}]
-Example input: "Ada Lovelace y Charles Babbage usaron FastAPI y gpt-4o-mini."
-Example output: [{{"kind":"person_name","surface_form":"Ada Lovelace","aliases":[]}},{{"kind":"person_name","surface_form":"Charles Babbage","aliases":[]}},{{"kind":"library_name","surface_form":"FastAPI","aliases":[]}},{{"kind":"model_name","surface_form":"gpt-4o-mini","aliases":[]}}]
+POSITIVE example
+Input: "Ada Lovelace y Charles Babbage usaron FastAPI y gpt-4o-mini el lunes pasado para implementar RRF fusion."
+Output: [{{"kind":"person_name","surface_form":"Ada Lovelace","aliases":[]}},{{"kind":"person_name","surface_form":"Charles Babbage","aliases":[]}},{{"kind":"library_name","surface_form":"FastAPI","aliases":[]}},{{"kind":"model_name","surface_form":"gpt-4o-mini","aliases":[]}},{{"kind":"concept","surface_form":"RRF fusion","aliases":[]}}]
+NEGATIVE example (bloat to AVOID)
+Input: "El sistema usa la base de datos para guardar config del usuario en el modulo principal."
+Output: []
+(All terms are generic — system, database, config, user, module — none worth indexing.)
-If nothing fits, return: []
+NEGATIVE example (path/SHA noise)
+Input: "Bug fixed in commit a133bc6 in src/auth/login.py — see logs at /var/log/app.log"
+Output: []
+(Commit SHA, file path, log path — all skip.)
 """.strip()

{memorymaster-3.5.2 → memorymaster-3.6.0}/memorymaster/llm_provider.py RENAMED Viewed

@@ -453,17 +453,60 @@ def call_llm(prompt: str, text: str) -> str:
 def parse_json_response(text: str) -> list[dict]:
-    """Parse LLM response as JSON array, handling markdown code fences."""
+    """Parse LLM response as JSON array, handling markdown code fences and prose preambles.
+    Resilient to four common LLM output shapes:
+      1. raw JSON array: ``[{...}, {...}]``
+      2. fenced JSON: ``\u0060\u0060\u0060json\\n[...]\\n\u0060\u0060\u0060``
+      3. prose preamble + fenced: ``Here is the answer:\\n\u0060\u0060\u0060json\\n[...]\u0060\u0060\u0060``
+      4. prose preamble + raw: ``The entities are: [...]``
+    Strategy: try direct parse, then try fenced-strip from start, then fall back
+    to greedy-extracting the largest ``[...]`` block in the text.
+    """
     text = text.strip()
+    # Shape 2 — strict fenced from the very start.
     if text.startswith("```"):
-        text = re.sub(r"^```(?:json)?\n?", "", text)
-        text = re.sub(r"\n?```$", "", text)
+        stripped = re.sub(r"^```(?:json)?\n?", "", text)
+        stripped = re.sub(r"\n?```$", "", stripped)
+        try:
+            result = json.loads(stripped)
+            return _coerce_to_list(result)
+        except (json.JSONDecodeError, ValueError):
+            pass
+    # Shape 1 — direct parse.
     try:
         result = json.loads(text)
-        if isinstance(result, list):
-            return result
-        if isinstance(result, dict):
-            return [result]
-        return []
+        return _coerce_to_list(result)
     except (json.JSONDecodeError, ValueError):
-        return []
+        pass
+    # Shapes 3 + 4 — find the first ``\u0060\u0060\u0060json``/``\u0060\u0060\u0060`` block; if absent, the largest ``[...]``.
+    fenced_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
+    if fenced_match:
+        try:
+            result = json.loads(fenced_match.group(1).strip())
+            return _coerce_to_list(result)
+        except (json.JSONDecodeError, ValueError):
+            pass
+    # Greedy: slice from the first ``[`` to the last ``]`` (no bracket matching); a bad slice just fails to parse and falls through to [].
+    first = text.find("[")
+    last = text.rfind("]")
+    if first != -1 and last > first:
+        try:
+            result = json.loads(text[first : last + 1])
+            return _coerce_to_list(result)
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return []
+def _coerce_to_list(result) -> list[dict]:
+    """Normalize a decoded JSON value: a list passes through unchanged, a dict is wrapped in a one-element list, anything else becomes []."""
+    # List elements are not inspected here; callers get back whatever the list held.
+    if isinstance(result, dict):
+        return [result]
+    return result if isinstance(result, list) else []

{memorymaster-3.5.2 → memorymaster-3.6.0/memorymaster.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: memorymaster
-Version: 3.5.2
+Version: 3.6.0
 Summary: Production-grade memory reliability system for AI coding agents. Lifecycle-managed claims with citations, conflict detection, steward governance, and MCP integration.
 Author: wolverin0
 License: MIT

{memorymaster-3.5.2 → memorymaster-3.6.0}/memorymaster.egg-info/SOURCES.txt RENAMED Viewed

@@ -193,10 +193,12 @@ scripts/grid_recall_weights.py
 scripts/index_claims_to_qdrant.py
 scripts/ingest_planning_docs.py
 scripts/jira_live_to_turns.py
+scripts/label_prompts_with_judge.py
 scripts/llm_benchmark.py
 scripts/merge_scope_variants.py
 scripts/messages_to_turns.py
 scripts/operator_metrics.py
+scripts/precompute_candidates.py
 scripts/recurring_incident_drill.py
 scripts/release_readiness.py
 scripts/run_codex_autologger.py

{memorymaster-3.5.2 → memorymaster-3.6.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "memorymaster"
-version = "3.5.2"
+version = "3.6.0"
 description = "Production-grade memory reliability system for AI coding agents. Lifecycle-managed claims with citations, conflict detection, steward governance, and MCP integration."
 license = {text = "MIT"}
 authors = [{name = "wolverin0"}]

memorymaster-3.6.0/scripts/label_prompts_with_judge.py ADDED Viewed

@@ -0,0 +1,211 @@
+"""LLM-judge: label which retrieved claims actually answer each synthetic prompt.
+For each prompt in the input JSONL:
+  1. Run the production recall hook to get the top-K (default 20) candidate claims.
+  2. Send (prompt + candidate snippets) to a haiku judge.
+  3. Judge returns the subset of claim IDs that genuinely answer the prompt.
+  4. Write {sha1_16(prompt): [claim_ids]} into the labels JSON.
+Usage:
+    python scripts/label_prompts_with_judge.py \\
+        --prompts artifacts/real-prompts-1000.jsonl \\
+        --db memorymaster.db \\
+        --labels-out artifacts/real-prompts-1000-labels.json \\
+        --top-k 20 \\
+        --max-prompts 1000
+The output is consumed by scripts/eval_recall_precision_at_5.py via the
+``<prompts>-labels.json`` convention.
+"""
+from __future__ import annotations
+import argparse
+import hashlib
+import json
+import os
+import sys
+import time
+from pathlib import Path
+def _sha1_16(text: str) -> str:
+    """Return the first 16 hex characters of the SHA-1 digest of *text* encoded as UTF-8."""
+    digest = hashlib.sha1(text.encode("utf-8"))
+    return digest.hexdigest()[:16]
+def _judge_prompt(prompt: str, candidates: list[dict]) -> str:
+    """Render the judge instruction text for one query and its candidate claims.
+    Each candidate contributes one ``[id] snippet`` line; the claim text is cut to 300 characters.
+    """
+    rendered: list[str] = []
+    for cand in candidates:
+        snippet = cand["text"][:300]
+        rendered.append(f"[{cand['id']}] {snippet}")
+    candidate_lines = "\n".join(rendered)
+    return f"""You are a relevance judge. Given a USER QUERY and a list of CANDIDATE memory claims, return the subset of claim IDs that genuinely answer the query.
+USER QUERY: {prompt}
+CANDIDATES (id and snippet):
+{candidate_lines}
+Rules:
+- Return ONLY claim IDs that DIRECTLY answer the query (not tangentially related).
+- An empty list is a valid answer if no candidate genuinely answers.
+- Return JSON ARRAY ONLY of integer IDs, no prose, no fence. Example: [123, 456]
+- Be strict — pick at most 5, prefer 0-3 high-quality matches over many weak ones."""
+def _get_candidates(db_path: str, prompt: str, top_k: int) -> list[dict]:
+    """Fetch the top-K recalled claims for *prompt* as ``{"id", "text"}`` dicts.
+    Claim IDs come from ``context_hook.recall(..., return_ids=True)``; the claim text is then
+    read from the ``claims`` table of the SQLite file at *db_path*. A recall that raises or
+    yields no IDs produces an empty list. IDs with no matching row are skipped.
+    """
+    from memorymaster import context_hook
+    ids: list = []
+    try:
+        outcome = context_hook.recall(prompt, db_path=db_path, return_ids=True)
+        # A tuple return is unpacked as (_, ids); any other return shape means "no IDs".
+        if isinstance(outcome, tuple):
+            _, ids = outcome
+    except Exception as exc:
+        print(f"[label] recall() raised: {exc}", flush=True)
+        ids = []
+    if not ids:
+        return []
+    import sqlite3
+    conn = sqlite3.connect(db_path)
+    try:
+        # One lookup per recalled ID, in recall order, so the ranking is preserved.
+        fetched = (
+            conn.execute("SELECT id, text FROM claims WHERE id = ?", (cid,)).fetchone()
+            for cid in ids[:top_k]
+        )
+        return [{"id": row[0], "text": row[1] or ""} for row in fetched if row]
+    finally:
+        conn.close()
+def _call_judge(prompt: str, candidates: list[dict]) -> list[int]:
+    """Ask the judge LLM which candidates answer *prompt*; return the claim IDs it names.
+    The reply is parsed with ``parse_json_response``. Accepted item shapes are bare ints,
+    ``{"id": n}`` / ``{"claim_id": n}`` dicts, and integer-looking strings. If the parsed
+    reply yields no IDs, 2-8 digit numbers are regex-extracted from the raw text as a last
+    resort. An empty reply yields []. The IDs are NOT checked against *candidates* here.
+    """
+    from memorymaster.llm_provider import call_llm, parse_json_response
+    raw = call_llm(_judge_prompt(prompt, candidates), "")
+    if not raw:
+        return []
+    ids: list[int] = []
+    for item in parse_json_response(raw):
+        # bool is a subclass of int; JSON true/false must not be read as IDs 1/0.
+        if isinstance(item, bool):
+            continue
+        if isinstance(item, int):
+            ids.append(item)
+        elif isinstance(item, dict):
+            v = item.get("id") or item.get("claim_id")
+            if isinstance(v, int) and not isinstance(v, bool):
+                ids.append(v)
+        elif isinstance(item, str) and item.strip().lstrip("-").isdigit():
+            # The guard above still admits strings int() rejects (e.g. "--5", "²").
+            # Skip those instead of raising, so one bad item cannot discard the whole reply.
+            try:
+                ids.append(int(item.strip()))
+            except ValueError:
+                continue
+    # Fallback: regex-extract integers from raw if parser missed it
+    if not ids:
+        import re
+        ids = [int(m) for m in re.findall(r"\b\d{2,8}\b", raw)]
+    return ids
+def _write_labels(path: Path, labels: dict[str, list[int]]) -> None:
+    """Atomically persist *labels* as ``{"labels": ...}`` JSON at *path*."""
+    # Write a sibling temp file and swap it in, so an interrupted run never leaves a
+    # truncated checkpoint that would break the next resume.
+    tmp = path.with_name(path.name + ".tmp")
+    tmp.write_text(json.dumps({"labels": labels}, indent=2), encoding="utf-8")
+    os.replace(tmp, path)
+def main() -> int:
+    """CLI entry point: judge-label every prompt and write the labels JSON. Returns 0.
+    Prompts whose hash is already in an existing labels file are skipped (resume).
+    Labels are checkpointed every ``--checkpoint-every`` prompts and once at the end;
+    a value of 0 or less disables the periodic checkpoints.
+    """
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--prompts", type=Path, required=True)
+    p.add_argument("--db", type=str, required=True)
+    p.add_argument("--labels-out", type=Path, required=True)
+    p.add_argument("--top-k", type=int, default=20)
+    p.add_argument("--max-prompts", type=int, default=1000)
+    p.add_argument(
+        "--checkpoint-every",
+        type=int,
+        default=25,
+        help="Flush labels JSON every N prompts (resume-safe).",
+    )
+    args = p.parse_args()
+    # Force claude_cli for the judge — Gemini API is rate-limited and slow.
+    # Direct assignment (NOT setdefault) — avoid the v3.5.0 hook bug where
+    # an inherited shell env left the provider stale.
+    os.environ["MEMORYMASTER_LLM_PROVIDER"] = "claude_cli"
+    os.environ["MEMORYMASTER_LLM_MODEL"] = "claude-haiku-4-5-20251001"
+    prompts: list[dict] = []
+    with args.prompts.open(encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            prompts.append(json.loads(line))
+    prompts = prompts[: args.max_prompts]
+    # Resume from existing labels file if present
+    labels: dict[str, list[int]] = {}
+    if args.labels_out.exists():
+        labels = json.loads(args.labels_out.read_text(encoding="utf-8")).get(
+            "labels", {}
+        )
+        print(f"[label] resuming from {len(labels)} existing labels", flush=True)
+    t_start = time.monotonic()
+    processed = 0  # prompts labeled in THIS run; resumed ones are excluded so avg/eta stay honest
+    for i, p_obj in enumerate(prompts, 1):
+        text = p_obj["text"]
+        sha = _sha1_16(text)
+        if sha in labels:
+            continue
+        try:
+            cands = _get_candidates(args.db, text, args.top_k)
+            if not cands:
+                labels[sha] = []
+            else:
+                ids = _call_judge(text, cands)
+                # Filter to only IDs that were actually in the candidate set, dropping
+                # repeats (order kept) so a label list never names the same claim twice.
+                cand_ids = {c["id"] for c in cands}
+                kept = dict.fromkeys(cid for cid in ids if cid in cand_ids)
+                labels[sha] = list(kept)[:5]
+        except Exception as exc:
+            # NOTE(review): a failed prompt is stored as [] and is therefore skipped on
+            # resume — confirm that is intended rather than retrying it next run.
+            print(f"[label] {i}: ERROR {exc}", flush=True)
+            labels[sha] = []
+        processed += 1
+        if i % 5 == 0:
+            elapsed = time.monotonic() - t_start
+            avg = elapsed / processed
+            eta = avg * (len(prompts) - i)
+            print(
+                f"[label] {i}/{len(prompts)}  avg={avg:.1f}s  eta={eta/60:.1f}min  "
+                f"last={labels[sha]}",
+                flush=True,
+            )
+        # Guard the modulo: --checkpoint-every 0 would otherwise raise ZeroDivisionError.
+        if args.checkpoint_every > 0 and i % args.checkpoint_every == 0:
+            _write_labels(args.labels_out, labels)
+    _write_labels(args.labels_out, labels)
+    n_labeled = sum(1 for v in labels.values() if v)
+    print(
+        f"[label] DONE wrote {len(labels)} labels "
+        f"({n_labeled} non-empty) to {args.labels_out}",
+        flush=True,
+    )
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

memorymaster-3.6.0/scripts/precompute_candidates.py ADDED Viewed

@@ -0,0 +1,94 @@
+"""Pre-compute top-K recall candidates for each prompt + write a chunked batch file
+suitable for parallel labeling by subagents.
+For each prompt in the input JSONL, run production recall once and capture the
+top-K candidate (id, text snippet) pairs. Output is a JSON file with one entry
+per prompt, organized as one chunk per output file so multiple labeling
+subagents can work in parallel.
+Usage:
+    python scripts/precompute_candidates.py \\
+        --prompts artifacts/real-prompts-1000.jsonl \\
+        --db memorymaster.db \\
+        --out-dir artifacts/label-batches \\
+        --chunk-size 100 --top-k 15
+"""
+from __future__ import annotations
+import argparse
+import hashlib
+import json
+import sqlite3
+import sys
+import time
+from pathlib import Path
+def _sha1_16(text: str) -> str:
+    """Short content key: the leading 16 hex digits of SHA-1 over the UTF-8 bytes of *text*."""
+    encoded = text.encode("utf-8")
+    return hashlib.sha1(encoded).hexdigest()[:16]
+def _get_candidates(conn: sqlite3.Connection, prompt: str, top_k: int) -> list[dict]:
+    """Return up to *top_k* recalled claims for *prompt* as ``{"id", "text"}`` dicts.
+    Claim text is cut to 300 characters. Recall runs against the path reported by the first
+    ``PRAGMA database_list`` row of *conn*; any error in that step is printed and produces
+    an empty result. IDs with no matching row in ``claims`` are skipped.
+    """
+    from memorymaster import context_hook
+    ids = []
+    try:
+        # Index 2 of a database_list row is the database file path.
+        db_file = conn.execute("PRAGMA database_list").fetchone()[2]
+        outcome = context_hook.recall(prompt, db_path=db_file, return_ids=True)
+        # A tuple return is unpacked as (_, ids); any other return shape means "no IDs".
+        if isinstance(outcome, tuple):
+            _, ids = outcome
+    except Exception as exc:
+        print(f"[recall] error: {exc}", flush=True)
+        ids = []
+    candidates = []
+    for cid in ids[:top_k]:
+        row = conn.execute("SELECT id, text FROM claims WHERE id = ?", (cid,)).fetchone()
+        if not row:
+            continue
+        claim_id, claim_text = row
+        candidates.append({"id": claim_id, "text": (claim_text or "")[:300]})
+    return candidates
+def main() -> int:
+    """CLI entry point: recall candidates for every prompt and write them out in chunk files.
+    Each output file ``in-chunkNN.json`` holds up to ``--chunk-size`` entries of the form
+    ``{"sha", "prompt", "candidates"}``. Returns 0.
+    """
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--prompts", type=Path, required=True)
+    p.add_argument("--db", type=str, required=True)
+    p.add_argument("--out-dir", type=Path, required=True)
+    p.add_argument("--chunk-size", type=int, default=100)
+    p.add_argument("--top-k", type=int, default=15)
+    args = p.parse_args()
+    # Reject sizes below 1 up front: 0 would divide by zero when counting chunks, and a
+    # negative size would silently write no files.
+    if args.chunk_size < 1:
+        p.error("--chunk-size must be >= 1")
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+    prompts: list[dict] = []
+    with args.prompts.open(encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            prompts.append(json.loads(line))
+    conn = sqlite3.connect(args.db)
+    t0 = time.monotonic()
+    items: list[dict] = []
+    try:
+        for i, p_obj in enumerate(prompts, 1):
+            text = p_obj["text"]
+            sha = _sha1_16(text)
+            cands = _get_candidates(conn, text, args.top_k)
+            items.append({"sha": sha, "prompt": text, "candidates": cands})
+            if i % 50 == 0:
+                print(f"[precompute] {i}/{len(prompts)}  wall={time.monotonic()-t0:.1f}s", flush=True)
+    finally:
+        # Close the connection even if a prompt is malformed or a query raises.
+        conn.close()
+    # Write chunks
+    n_chunks = (len(items) + args.chunk_size - 1) // args.chunk_size
+    for ci in range(n_chunks):
+        chunk = items[ci * args.chunk_size : (ci + 1) * args.chunk_size]
+        out = args.out_dir / f"in-chunk{ci+1:02d}.json"
+        out.write_text(json.dumps(chunk, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(f"[precompute] DONE wrote {n_chunks} chunks ({len(items)} prompts) to {args.out_dir}", flush=True)
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())