PyPI - memorymaster - Versions diffs - 3.5.1__tar.gz → 3.6.0__tar.gz - Mend

memorymaster 3.5.1.tar.gz → 3.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (321)

{memorymaster-3.5.1/memorymaster.egg-info → memorymaster-3.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: memorymaster
-Version: 3.5.1
+Version: 3.6.0
 Summary: Production-grade memory reliability system for AI coding agents. Lifecycle-managed claims with citations, conflict detection, steward governance, and MCP integration.
 Author: wolverin0
 License: MIT

{memorymaster-3.5.1 → memorymaster-3.6.0}/memorymaster/config_templates/hooks/memorymaster-steward-cycle.py RENAMED Viewed

@@ -6,6 +6,19 @@ DB_PATH = os.path.join(PROJECT_ROOT, "memorymaster.db")
 sys.path.insert(0, PROJECT_ROOT)
 os.environ["MEMORYMASTER_DEFAULT_DB"] = DB_PATH
+# LLM stack: claude_cli (Claude Code OAuth via local `claude --print`) is the
+# primary, with Ollama gemma4:e4b as a defensive fallback. Direct assignment
+# (NOT setdefault) — the hook MUST own these vars so an inherited shell env
+# can't silently route LLM calls to a stale provider. Bug observed 2026-04-25:
+# setdefault was a no-op when the inherited env already had MEMORYMASTER_LLM_PROVIDER
+# set, so the new model name routed to the OLD provider → 50× HTTP 404 per cycle
+# before the fallback chain saved it. Captured as v3.5.0 release notes.
+os.environ["MEMORYMASTER_LLM_PROVIDER"] = "claude_cli"
+os.environ["MEMORYMASTER_LLM_MODEL"] = "claude-haiku-4-5-20251001"
+os.environ["MEMORYMASTER_LLM_FALLBACK_PROVIDER"] = "ollama"
+os.environ["MEMORYMASTER_LLM_FALLBACK_MODEL"] = "gemma4:e4b"
 os.chdir(PROJECT_ROOT)
 try:
@@ -36,11 +49,9 @@ try:
 except Exception as e:
     print(f"[MemoryMaster] auto-archive error: {e}", file=sys.stderr)
-# Wiki absorb (compiled truth + timeline articles)
+# Wiki absorb (compiled truth + timeline articles). Inherits the LLM provider
+# block above — uses the same OAuth-backed haiku stack as the steward.
 try:
-    # Keys come from the rotator file (~/.memorymaster/gemini-keys.env) or a
-    # singular GEMINI_API_KEY env var. Hook must never hardcode credentials.
-    os.environ.setdefault("MEMORYMASTER_LLM_PROVIDER", "google")
     from memorymaster.wiki_engine import absorb
     wiki_path = os.path.join(PROJECT_ROOT, "obsidian-vault", "wiki")
     stats = absorb(DB_PATH, wiki_path)

{memorymaster-3.5.1 → memorymaster-3.6.0}/memorymaster/entity_extractor.py RENAMED Viewed

@@ -496,7 +496,7 @@ def extract_patterns(text: str) -> list[Entity]:
 # Version identifier baked into the prompt. Bump this string when the prompt
 # changes so that downstream idempotency / caching keys invalidate cleanly.
-LLM_PROMPT_VERSION = "entity-l2-v2-2026-04-25"
+LLM_PROMPT_VERSION = "entity-l2-v3-2026-04-27"
 # Permitted entity kinds for Layer-2. Any `kind` returned by the LLM that is
 # not in this set is dropped to keep the registry schema predictable.
@@ -513,22 +513,50 @@ LLM_KINDS: frozenset[str] = frozenset(
 _LLM_ENV_FLAG = "MEMORYMASTER_ENTITY_LLM"
 _LLM_MAX_TEXT_CHARS = 4000  # Truncate long claims before sending to LLM.
-_LLM_MAX_ENTITIES = 8       # Hard cap to keep cost bounded per claim.
+_LLM_MAX_ENTITIES = 5       # v3: tightened from 8 to 5 — overgeneration was the
+                            # dominant failure mode in v2 backfill batches.
-_LLM_PROMPT = f"""Extract entities from the snippet that regex cannot catch.
+_LLM_PROMPT = f"""Extract HIGH-VALUE entities from the snippet — only ones a future agent would search for by name. Be conservative.
 Prompt version: {LLM_PROMPT_VERSION}
-Allowed kinds: person_name, spanish_surname, time_expression, model_name, library_name, concept.
-Skip: file paths, env-vars, hostnames, ports, commit SHAs, tool names.
-Max {_LLM_MAX_ENTITIES} entities. Output STRICT JSON ARRAY only — no prose, no code fence.
+ALLOWED kinds (return ONE per entity): person_name, spanish_surname, time_expression, model_name, library_name, concept.
+WHEN IN DOUBT, SKIP. A future agent searching memory for this claim should be searching by the entity name itself, not by a generic word.
+ALWAYS SKIP:
+- File paths, directories, env vars, hostnames, IPs, ports
+- Commit SHAs, branch names, tool names like "git", "docker", "npm", "sqlite", "psql"
+- Generic English words: "system", "config", "service", "module", "function", "component", "data", "process", "task", "user"
+- Generic Spanish words: "sistema", "config", "servicio", "modulo", "funcion", "componente", "datos", "proceso", "tarea", "usuario", "cosa", "caso"
+- Standalone numbers, percentages, dates already in YYYY-MM-DD form
+- HTML/CSS class names, JSON keys, code identifiers in snake_case or camelCase
+Quality bar by kind:
+- person_name: full name (≥2 capitalized words) of a real person, NOT a role like "user" or "developer"
+- spanish_surname: bare surname when it stands alone WITHOUT a first name
+- time_expression: relative phrases like "next Thursday", "el lunes pasado", "Q3 2026" — NOT absolute YYYY-MM-DD dates
+- model_name: AI model identifier with a recognizable family prefix (gpt-, claude-, gemini-, llama-, mistral-) AND a version
+- library_name: a SPECIFIC named library/framework like "FastAPI", "React", "pyafipws" — NOT "the API" or "the framework"
+- concept: a named domain concept (3+ words usually) that appears as a noun-phrase a person would research, like "RRF fusion", "byzantine consensus", "writer-lock contention" — NOT generic ideas
+Output STRICT JSON ARRAY only — no prose, no code fence. Max {_LLM_MAX_ENTITIES} entities. If nothing in the snippet rises to the bar, return [].
 Schema (use EXACT field names):
   [{{"kind": "...", "surface_form": "exact substring from text", "aliases": []}}]
-Example input: "Ada Lovelace y Charles Babbage usaron FastAPI y gpt-4o-mini."
-Example output: [{{"kind":"person_name","surface_form":"Ada Lovelace","aliases":[]}},{{"kind":"person_name","surface_form":"Charles Babbage","aliases":[]}},{{"kind":"library_name","surface_form":"FastAPI","aliases":[]}},{{"kind":"model_name","surface_form":"gpt-4o-mini","aliases":[]}}]
+POSITIVE example
+Input: "Ada Lovelace y Charles Babbage usaron FastAPI y gpt-4o-mini el lunes pasado para implementar RRF fusion."
+Output: [{{"kind":"person_name","surface_form":"Ada Lovelace","aliases":[]}},{{"kind":"person_name","surface_form":"Charles Babbage","aliases":[]}},{{"kind":"library_name","surface_form":"FastAPI","aliases":[]}},{{"kind":"model_name","surface_form":"gpt-4o-mini","aliases":[]}},{{"kind":"concept","surface_form":"RRF fusion","aliases":[]}}]
+NEGATIVE example (bloat to AVOID)
+Input: "El sistema usa la base de datos para guardar config del usuario en el modulo principal."
+Output: []
+(All terms are generic — system, database, config, user, module — none worth indexing.)
-If nothing fits, return: []
+NEGATIVE example (path/SHA noise)
+Input: "Bug fixed in commit a133bc6 in src/auth/login.py — see logs at /var/log/app.log"
+Output: []
+(Commit SHA, file path, log path — all skip.)
 """.strip()

{memorymaster-3.5.1 → memorymaster-3.6.0}/memorymaster/llm_provider.py RENAMED Viewed

@@ -453,17 +453,60 @@ def call_llm(prompt: str, text: str) -> str:
 def parse_json_response(text: str) -> list[dict]:
-    """Parse LLM response as JSON array, handling markdown code fences."""
+    """Parse LLM response as JSON array, handling markdown code fences and prose preambles.
+    Resilient to four common LLM output shapes:
+      1. raw JSON array: ``[{...}, {...}]``
+      2. fenced JSON: ``\u0060\u0060\u0060json\\n[...]\\n\u0060\u0060\u0060``
+      3. prose preamble + fenced: ``Here is the answer:\\n\u0060\u0060\u0060json\\n[...]\u0060\u0060\u0060``
+      4. prose preamble + raw: ``The entities are: [...]``
+    Strategy: try direct parse, then try fenced-strip from start, then fall back
+    to greedy-extracting the largest ``[...]`` block in the text.
+    """
     text = text.strip()
+    # Shape 2 — strict fenced from the very start.
     if text.startswith("```"):
-        text = re.sub(r"^```(?:json)?\n?", "", text)
-        text = re.sub(r"\n?```$", "", text)
+        stripped = re.sub(r"^```(?:json)?\n?", "", text)
+        stripped = re.sub(r"\n?```$", "", stripped)
+        try:
+            result = json.loads(stripped)
+            return _coerce_to_list(result)
+        except (json.JSONDecodeError, ValueError):
+            pass
+    # Shape 1 — direct parse.
     try:
         result = json.loads(text)
-        if isinstance(result, list):
-            return result
-        if isinstance(result, dict):
-            return [result]
-        return []
+        return _coerce_to_list(result)
     except (json.JSONDecodeError, ValueError):
-        return []
+        pass
+    # Shapes 3 + 4 — find the first ``\u0060\u0060\u0060json``/``\u0060\u0060\u0060`` block; if absent, the largest ``[...]``.
+    fenced_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
+    if fenced_match:
+        try:
+            result = json.loads(fenced_match.group(1).strip())
+            return _coerce_to_list(result)
+        except (json.JSONDecodeError, ValueError):
+            pass
+    # Greedy: first ``[`` to last matching ``]``. Defensive against prose with stray brackets.
+    first = text.find("[")
+    last = text.rfind("]")
+    if first != -1 and last > first:
+        try:
+            result = json.loads(text[first : last + 1])
+            return _coerce_to_list(result)
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return []
+def _coerce_to_list(result) -> list[dict]:
+    if isinstance(result, list):
+        return result
+    if isinstance(result, dict):
+        return [result]
+    return []

{memorymaster-3.5.1 → memorymaster-3.6.0/memorymaster.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: memorymaster
-Version: 3.5.1
+Version: 3.6.0
 Summary: Production-grade memory reliability system for AI coding agents. Lifecycle-managed claims with citations, conflict detection, steward governance, and MCP integration.
 Author: wolverin0
 License: MIT

{memorymaster-3.5.1 → memorymaster-3.6.0}/memorymaster.egg-info/SOURCES.txt RENAMED Viewed

@@ -189,13 +189,16 @@ scripts/generate_drill_signoff.py
 scripts/git_to_turns.py
 scripts/github_live_to_turns.py
 scripts/gitnexus_to_claims.py
+scripts/grid_recall_weights.py
 scripts/index_claims_to_qdrant.py
 scripts/ingest_planning_docs.py
 scripts/jira_live_to_turns.py
+scripts/label_prompts_with_judge.py
 scripts/llm_benchmark.py
 scripts/merge_scope_variants.py
 scripts/messages_to_turns.py
 scripts/operator_metrics.py
+scripts/precompute_candidates.py
 scripts/recurring_incident_drill.py
 scripts/release_readiness.py
 scripts/run_codex_autologger.py
@@ -251,12 +254,14 @@ tests/test_fts5_search.py
 tests/test_graph_distance.py
 tests/test_graph_store.py
 tests/test_handler_regressions.py
+tests/test_hook_env_isolation.py
 tests/test_human_id.py
 tests/test_incident_drill_runner.py
 tests/test_integration_workflows.py
 tests/test_key_rotator.py
 tests/test_lifecycle.py
 tests/test_llm_fallback.py
+tests/test_llm_provider_claude_cli.py
 tests/test_llm_steward_coverage.py
 tests/test_llm_steward_key_rotation.py
 tests/test_mcp_helpers.py

{memorymaster-3.5.1 → memorymaster-3.6.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "memorymaster"
-version = "3.5.1"
+version = "3.6.0"
 description = "Production-grade memory reliability system for AI coding agents. Lifecycle-managed claims with citations, conflict detection, steward governance, and MCP integration."
 license = {text = "MIT"}
 authors = [{name = "wolverin0"}]

memorymaster-3.6.0/scripts/grid_recall_weights.py ADDED Viewed

@@ -0,0 +1,191 @@
+"""Grid-search the recall weight knobs against precision@5.
+Sweeps W_LEXICAL × W_FRESHNESS × W_GRAPH against the existing 100-prompt
+evaluation harness (`scripts/eval_recall_precision_at_5.py`) and writes a
+sorted markdown table + raw JSONL log so a future tweak is reproducible.
+W_VECTOR is skipped because the local DB has no Qdrant; the stream is a
+no-op without `MEMORYMASTER_USE_QDRANT=1` and a populated index.
+Usage:
+    python scripts/grid_recall_weights.py \
+        --prompts artifacts/real-prompts-100.jsonl \
+        --db memorymaster.db \
+        --output artifacts/recall-weight-tuning-2026-04-26.md
+"""
+from __future__ import annotations
+import argparse
+import itertools
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+from pathlib import Path
+# Modest 3 × 3 × 4 = 36 grid. Bounded by ~10s/combo wall via subprocess startup.
+W_LEXICAL_GRID = (0.2, 0.3, 0.4)
+W_FRESHNESS_GRID = (0.0, 0.05, 0.1)
+W_GRAPH_GRID = (0.0, 0.05, 0.1, 0.2)
+METRIC_RE = {
+    "precision@5": re.compile(r"precision@5\s*=\s*([\d.]+)"),
+    "MAP@5": re.compile(r"MAP@5\s*=\s*([\d.]+)"),
+    "hit@5": re.compile(r"hit@5\s*=\s*([\d.]+)"),
+    "p95_ms": re.compile(r"p95\s*=\s*([\d.]+)\s*ms"),
+}
+def _run_eval(
+    eval_script: Path,
+    prompts: Path,
+    db: Path,
+    weights: dict,
+    json_out: Path,
+    label: str,
+) -> dict | None:
+    env = os.environ.copy()
+    for k, v in weights.items():
+        env[f"MEMORYMASTER_RECALL_{k}"] = str(v)
+    # The GRAPH stream is opt-in: W_GRAPH alone is a no-op unless the stream
+    # itself is enabled. Turn it on only when the weight is non-zero — keeps
+    # the latency-cost cells out of the grid when they can't possibly help.
+    if weights.get("W_GRAPH", 0) > 0:
+        env["MEMORYMASTER_RECALL_GRAPH"] = "1"
+    # Same for the freshness stream.
+    if weights.get("W_FRESHNESS", 0) > 0:
+        env["MEMORYMASTER_RECALL_FRESHNESS"] = "1"
+    proc = subprocess.run(
+        [
+            sys.executable,
+            str(eval_script),
+            "--prompts",
+            str(prompts),
+            "--db",
+            str(db),
+            "--json-out",
+            str(json_out),
+            "--label",
+            label,
+        ],
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=600,
+    )
+    if proc.returncode != 0:
+        return {"error": proc.stderr.strip()[:200] or "non-zero exit"}
+    out = proc.stdout
+    parsed = {"label": label, **{k: v for k, v in weights.items()}}
+    for metric, rgx in METRIC_RE.items():
+        m = rgx.search(out)
+        parsed[metric] = float(m.group(1)) if m else None
+    return parsed
+def main() -> int:
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--prompts", type=Path, required=True)
+    p.add_argument("--db", type=Path, required=True)
+    p.add_argument("--output", type=Path, required=True)
+    p.add_argument(
+        "--eval-script",
+        type=Path,
+        default=Path("scripts/eval_recall_precision_at_5.py"),
+    )
+    p.add_argument(
+        "--per-run-json-dir",
+        type=Path,
+        default=Path("artifacts/grid-runs"),
+        help="Per-cell raw eval JSONL dump directory.",
+    )
+    args = p.parse_args()
+    args.per_run_json_dir.mkdir(parents=True, exist_ok=True)
+    combos = list(
+        itertools.product(W_LEXICAL_GRID, W_FRESHNESS_GRID, W_GRAPH_GRID)
+    )
+    print(f"[grid] running {len(combos)} cells over W_LEXICAL × W_FRESHNESS × W_GRAPH")
+    rows: list[dict] = []
+    t_total = time.monotonic()
+    for i, (w_lex, w_fresh, w_graph) in enumerate(combos, 1):
+        weights = {"W_LEXICAL": w_lex, "W_FRESHNESS": w_fresh, "W_GRAPH": w_graph}
+        label = f"L{w_lex}_F{w_fresh}_G{w_graph}"
+        json_out = args.per_run_json_dir / f"{label}.jsonl"
+        t0 = time.monotonic()
+        row = _run_eval(args.eval_script, args.prompts, args.db, weights, json_out, label)
+        wall = time.monotonic() - t0
+        if row is None:
+            row = {"error": "no output"}
+        row["wall_s"] = round(wall, 1)
+        rows.append(row)
+        prec = row.get("precision@5")
+        prec_str = f"{prec:.3f}" if isinstance(prec, float) else "ERR"
+        print(f"[grid] {i}/{len(combos)} {label} wall={wall:.1f}s p@5={prec_str}")
+    # Pick best by precision@5 (tie-break MAP@5 desc, then p95 asc)
+    valid = [r for r in rows if isinstance(r.get("precision@5"), float)]
+    valid.sort(
+        key=lambda r: (
+            -r["precision@5"],
+            -(r.get("MAP@5") or 0.0),
+            r.get("p95_ms") or 1e9,
+        )
+    )
+    # Write markdown report
+    lines = [
+        "# Recall weight grid — precision@5 tuning",
+        "",
+        f"- Eval prompts: `{args.prompts}` (100, 70 labeled)",
+        f"- DB: `{args.db}` (post-L2-backfill snapshot)",
+        f"- Grid: W_LEXICAL × W_FRESHNESS × W_GRAPH = "
+        f"{len(W_LEXICAL_GRID)} × {len(W_FRESHNESS_GRID)} × {len(W_GRAPH_GRID)} = {len(combos)} cells",
+        f"- Total wall: {round(time.monotonic()-t_total, 1)}s",
+        "",
+        "## Top 10 by precision@5",
+        "",
+        "| W_LEXICAL | W_FRESHNESS | W_GRAPH | precision@5 | MAP@5 | hit@5 | p95 ms | wall s |",
+        "|---|---|---|---|---|---|---|---|",
+    ]
+    for r in valid[:10]:
+        lines.append(
+            f"| {r['W_LEXICAL']} | {r['W_FRESHNESS']} | {r['W_GRAPH']} "
+            f"| {r['precision@5']:.3f} | {r.get('MAP@5'):.3f} | {r.get('hit@5'):.3f} "
+            f"| {r.get('p95_ms')} | {r['wall_s']} |"
+        )
+    if not valid:
+        lines.append("| — | — | — | NO VALID RUNS | | | | |")
+    else:
+        winner = valid[0]
+        lines += [
+            "",
+            "## Winner",
+            "",
+            f"`MEMORYMASTER_RECALL_W_LEXICAL={winner['W_LEXICAL']}` "
+            f"`MEMORYMASTER_RECALL_W_FRESHNESS={winner['W_FRESHNESS']}` "
+            f"`MEMORYMASTER_RECALL_W_GRAPH={winner['W_GRAPH']}`",
+            "",
+            f"precision@5 = **{winner['precision@5']:.3f}** "
+            f"(baseline 0.152, delta = {(winner['precision@5'] - 0.152):+.3f})",
+        ]
+    args.output.write_text("\n".join(lines) + "\n", encoding="utf-8")
+    # Also dump rows as JSON for downstream automation
+    json_path = args.output.with_suffix(".json")
+    json_path.write_text(json.dumps(rows, indent=2), encoding="utf-8")
+    print(f"[grid] wrote {args.output}")
+    print(f"[grid] wrote {json_path}")
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

memorymaster-3.6.0/scripts/label_prompts_with_judge.py ADDED Viewed

@@ -0,0 +1,211 @@
+"""LLM-judge: label which retrieved claims actually answer each synthetic prompt.
+For each prompt in the input JSONL:
+  1. Run the production recall hook to get the top-K (default 20) candidate claims.
+  2. Send (prompt + candidate snippets) to a haiku judge.
+  3. Judge returns the subset of claim IDs that genuinely answer the prompt.
+  4. Write {sha1_16(prompt): [claim_ids]} into the labels JSON.
+Usage:
+    python scripts/label_prompts_with_judge.py \
+        --prompts artifacts/real-prompts-1000.jsonl \
+        --db memorymaster.db \
+        --labels-out artifacts/real-prompts-1000-labels.json \
+        --top-k 20 \
+        --max-prompts 1000
+The output is consumed by scripts/eval_recall_precision_at_5.py via the
+``<prompts>-labels.json`` convention.
+"""
+from __future__ import annotations
+import argparse
+import hashlib
+import json
+import os
+import sys
+import time
+from pathlib import Path
+def _sha1_16(text: str) -> str:
+    return hashlib.sha1(text.encode("utf-8")).hexdigest()[:16]
+def _judge_prompt(prompt: str, candidates: list[dict]) -> str:
+    candidate_lines = "\n".join(
+        f"[{c['id']}] {c['text'][:300]}" for c in candidates
+    )
+    return f"""You are a relevance judge. Given a USER QUERY and a list of CANDIDATE memory claims, return the subset of claim IDs that genuinely answer the query.
+USER QUERY: {prompt}
+CANDIDATES (id and snippet):
+{candidate_lines}
+Rules:
+- Return ONLY claim IDs that DIRECTLY answer the query (not tangentially related).
+- An empty list is a valid answer if no candidate genuinely answers.
+- Return JSON ARRAY ONLY of integer IDs, no prose, no fence. Example: [123, 456]
+- Be strict — pick at most 5, prefer 0-3 high-quality matches over many weak ones."""
+def _get_candidates(db_path: str, prompt: str, top_k: int) -> list[dict]:
+    """Run production recall via context_hook and return top-K candidates."""
+    # Use the same return_ids=True path as the eval harness.
+    from memorymaster import context_hook
+    # Recall returns rendered bullet text; we need ids + raw claim text.
+    # Easiest: get the IDs from recall, then fetch claim text from DB.
+    try:
+        # context_hook.recall signature:
+        #   recall(query, *, db_path='', budget=2000, format='text', skip_qdrant=False, return_ids=False)
+        result = context_hook.recall(
+            prompt,
+            db_path=db_path,
+            return_ids=True,
+        )
+        if isinstance(result, tuple):
+            _, ids = result
+        else:
+            ids = []
+    except Exception as exc:
+        print(f"[label] recall() raised: {exc}", flush=True)
+        ids = []
+    if not ids:
+        return []
+    import sqlite3
+    conn = sqlite3.connect(db_path)
+    try:
+        rows = []
+        for cid in ids[:top_k]:
+            row = conn.execute(
+                "SELECT id, text FROM claims WHERE id = ?", (cid,)
+            ).fetchone()
+            if row:
+                rows.append({"id": row[0], "text": row[1] or ""})
+        return rows
+    finally:
+        conn.close()
+def _call_judge(prompt: str, candidates: list[dict]) -> list[int]:
+    """Single LLM call to the judge. Returns list of claim IDs."""
+    from memorymaster.llm_provider import call_llm, parse_json_response
+    judge_text = _judge_prompt(prompt, candidates)
+    raw = call_llm(judge_text, "")
+    if not raw:
+        return []
+    parsed = parse_json_response(raw)
+    # parse_json_response returns list of dicts; we want bare ints.
+    # If it returns [{"id": 123}, ...] coerce; otherwise try raw int parsing.
+    ids: list[int] = []
+    for item in parsed:
+        if isinstance(item, int):
+            ids.append(item)
+        elif isinstance(item, dict):
+            v = item.get("id") or item.get("claim_id")
+            if isinstance(v, int):
+                ids.append(v)
+        elif isinstance(item, str) and item.strip().lstrip("-").isdigit():
+            ids.append(int(item.strip()))
+    # Fallback: regex-extract integers from raw if parser missed it
+    if not ids:
+        import re
+        ids = [int(m) for m in re.findall(r"\b\d{2,8}\b", raw)]
+    return ids
+def main() -> int:
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--prompts", type=Path, required=True)
+    p.add_argument("--db", type=str, required=True)
+    p.add_argument("--labels-out", type=Path, required=True)
+    p.add_argument("--top-k", type=int, default=20)
+    p.add_argument("--max-prompts", type=int, default=1000)
+    p.add_argument(
+        "--checkpoint-every",
+        type=int,
+        default=25,
+        help="Flush labels JSON every N prompts (resume-safe).",
+    )
+    args = p.parse_args()
+    # Force claude_cli for the judge — Gemini API is rate-limited and slow.
+    # Direct assignment (NOT setdefault) — avoid the v3.5.0 hook bug where
+    # an inherited shell env left the provider stale.
+    os.environ["MEMORYMASTER_LLM_PROVIDER"] = "claude_cli"
+    os.environ["MEMORYMASTER_LLM_MODEL"] = "claude-haiku-4-5-20251001"
+    prompts: list[dict] = []
+    with args.prompts.open(encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            prompts.append(json.loads(line))
+    prompts = prompts[: args.max_prompts]
+    # Resume from existing labels file if present
+    labels: dict[str, list[int]] = {}
+    if args.labels_out.exists():
+        labels = json.loads(args.labels_out.read_text(encoding="utf-8")).get(
+            "labels", {}
+        )
+        print(f"[label] resuming from {len(labels)} existing labels", flush=True)
+    t_start = time.monotonic()
+    for i, p_obj in enumerate(prompts, 1):
+        text = p_obj["text"]
+        sha = _sha1_16(text)
+        if sha in labels:
+            continue
+        try:
+            cands = _get_candidates(args.db, text, args.top_k)
+            if not cands:
+                labels[sha] = []
+            else:
+                ids = _call_judge(text, cands)
+                # Filter to only IDs that were actually in the candidate set
+                cand_ids = {c["id"] for c in cands}
+                labels[sha] = [i for i in ids if i in cand_ids][:5]
+        except Exception as exc:
+            print(f"[label] {i}: ERROR {exc}", flush=True)
+            labels[sha] = []
+        if i % 5 == 0:
+            elapsed = time.monotonic() - t_start
+            avg = elapsed / i
+            eta = avg * (len(prompts) - i)
+            print(
+                f"[label] {i}/{len(prompts)}  avg={avg:.1f}s  eta={eta/60:.1f}min  "
+                f"last={labels[sha]}",
+                flush=True,
+            )
+        if i % args.checkpoint_every == 0:
+            args.labels_out.write_text(
+                json.dumps({"labels": labels}, indent=2), encoding="utf-8"
+            )
+    args.labels_out.write_text(
+        json.dumps({"labels": labels}, indent=2), encoding="utf-8"
+    )
+    n_labeled = sum(1 for v in labels.values() if v)
+    print(
+        f"[label] DONE wrote {len(labels)} labels "
+        f"({n_labeled} non-empty) to {args.labels_out}",
+        flush=True,
+    )
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

memorymaster 3.5.1__tar.gz → 3.6.0__tar.gz

memorymaster 3.5.1.tar.gz → 3.6.0.tar.gz