PyPI - anvil-serving - Versions diffs - 0.4.0__py3-none-any.whl - Mend

anvil-serving 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

anvil_serving/__init__.py +2 -0
anvil_serving/_aggregate_usage.py +193 -0
anvil_serving/_role_split.py +91 -0
anvil_serving/_sync.py +359 -0
anvil_serving/benchmark.py +224 -0
anvil_serving/cache_prune.py +608 -0
anvil_serving/cli.py +31 -0
anvil_serving/config.py +40 -0
anvil_serving/deploy.py +51 -0
anvil_serving/eval.py +167 -0
anvil_serving/models.py +22 -0
anvil_serving/multiplexer.py +773 -0
anvil_serving/preflight.py +144 -0
anvil_serving/profile.py +22 -0
anvil_serving/py.typed +0 -0
anvil_serving/router/__init__.py +102 -0
anvil_serving/router/__main__.py +45 -0
anvil_serving/router/backends/__init__.py +32 -0
anvil_serving/router/backends/cloud.py +470 -0
anvil_serving/router/backends/local.py +62 -0
anvil_serving/router/backends/relay.py +72 -0
anvil_serving/router/calibrate.py +402 -0
anvil_serving/router/classify.py +225 -0
anvil_serving/router/commit_window.py +330 -0
anvil_serving/router/config.py +338 -0
anvil_serving/router/decision_log.py +236 -0
anvil_serving/router/dialects/__init__.py +52 -0
anvil_serving/router/dialects/anthropic.py +264 -0
anvil_serving/router/dialects/openai.py +216 -0
anvil_serving/router/discovery.py +60 -0
anvil_serving/router/fallback.py +689 -0
anvil_serving/router/fingerprint.py +133 -0
anvil_serving/router/front_door.py +683 -0
anvil_serving/router/intent.py +263 -0
anvil_serving/router/internal.py +185 -0
anvil_serving/router/metrics.py +428 -0
anvil_serving/router/policy.py +308 -0
anvil_serving/router/prices.py +92 -0
anvil_serving/router/profile_bootstrap.py +528 -0
anvil_serving/router/profile_store.py +374 -0
anvil_serving/router/registry.py +356 -0
anvil_serving/router/seams.py +267 -0
anvil_serving/router/secrets.py +233 -0
anvil_serving/router/serve.py +592 -0
anvil_serving/router/tier0_keywords.json +7 -0
anvil_serving/router/verify.py +733 -0
anvil_serving/score.py +621 -0
anvil_serving/serves.py +240 -0
anvil_serving-0.4.0.dist-info/METADATA +474 -0
anvil_serving-0.4.0.dist-info/RECORD +54 -0
anvil_serving-0.4.0.dist-info/WHEEL +5 -0
anvil_serving-0.4.0.dist-info/entry_points.txt +2 -0
anvil_serving-0.4.0.dist-info/licenses/LICENSE +21 -0
anvil_serving-0.4.0.dist-info/top_level.txt +1 -0

anvil_serving/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ """anvil-serving — right-size and run a local LLM serving tier from your coding-agent usage."""
2	+ __version__ = "0.4.0"

anvil_serving/_aggregate_usage.py ADDED Viewed

@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+"""aggregate_usage.py - roll up ALL Claude Code session logs into inference-sizing metrics.
+Mirrors session-retro/session_stats.py field definitions (output/input/cache tokens,
+tool_use, workflow <usage> fan-out) and adds the distributions that size a local
+inference server: per-call context size, generation length, concurrency, throughput.
+Stdlib only. Reads ~/.claude/projects/**/*.jsonl. Writes JSON to stdout (or --out).
+"""
+import json
+import os
+import re
+import sys
+import glob
+from collections import Counter, defaultdict
+from datetime import datetime
+PROJECTS = os.environ.get("ANVIL_CLAUDE_LOGS") or os.path.expanduser("~/.claude/projects")
def pct(sorted_vals, q):
    """Return the q-th percentile of an ascending list, linearly interpolated.

    The list must already be sorted. An empty list yields 0; q <= 0 and
    q >= 100 return the first / last element unchanged. Interpolated results
    are rounded and returned as int.
    """
    if not sorted_vals:
        return 0
    if q <= 0:
        return sorted_vals[0]
    if q >= 100:
        return sorted_vals[-1]
    last = len(sorted_vals) - 1
    pos = last * q / 100.0          # fractional index of the percentile
    lower = int(pos)
    upper = min(lower + 1, last)
    weight = pos - lower            # share taken from the upper neighbour
    blended = sorted_vals[lower] * (1 - weight) + sorted_vals[upper] * weight
    return int(round(blended))
def fiso(x):
    """Parse an ISO-8601 timestamp string, returning None when it cannot be parsed.

    A trailing "Z" is rewritten to "+00:00" before parsing. Any failure
    (including a non-string argument) yields None instead of raising.
    """
    try:
        normalized = x.replace("Z", "+00:00")
        return datetime.fromisoformat(normalized)
    except Exception:
        return None
def main():
    """Roll up every session log under PROJECTS and print sizing metrics.

    Reads all ``*.jsonl`` files below PROJECTS (recursively), accumulates token
    totals, per-call context/generation sizes, per-minute/hour/day activity,
    tool / skill / agent usage and workflow fan-out, then prints a compact
    summary. When ``--out PATH`` is given the full result is also written to
    PATH as JSON.

    Unreadable files, blank lines, lines that are not valid JSON and JSON
    values that are not objects are skipped.

    Raises:
        SystemExit: if ``--out`` is given without a following path.
    """
    out_path = None
    if "--out" in sys.argv:
        value_pos = sys.argv.index("--out") + 1
        if value_pos >= len(sys.argv):
            raise SystemExit("--out requires a file path")
        out_path = sys.argv[value_pos]
    files = glob.glob(os.path.join(PROJECTS, "**", "*.jsonl"), recursive=True)

    # global accumulators
    tot = Counter()                      # out/inp/cc/cr/asst
    by_model_calls = Counter()
    by_model_out = Counter()
    tools = Counter()
    skills = Counter()
    agent_types = Counter()
    ctx_sizes = []                       # input+cc+cr per assistant call
    out_sizes = []                       # output_tokens per assistant call
    per_min_calls = Counter()            # ts[:16] -> calls
    per_min_sessions = defaultdict(set)  # ts[:16] -> {sessionId}
    per_hour_out = Counter()             # ts[:13] -> output tokens
    per_day = defaultdict(lambda: [0, 0])   # day -> [out_tokens, calls]
    per_day_sessions = defaultdict(set)
    wf_count = 0
    wf_agent_counts = []                 # agent_count per workflow dispatch
    wf_subagent_tokens = 0
    sess_summ = []                       # (duration_h, asst_turns, out_tokens)

    def g(pat, txt):
        m = re.search(pat, txt, re.S)
        return m.group(1) if m else None

    for f in files:
        s_first = s_last = None
        s_asst = 0
        s_out = 0
        sid = None
        try:
            fh = open(f, encoding="utf-8", errors="replace")
        except OSError:
            continue
        with fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    d = json.loads(line)
                except (ValueError, RecursionError):
                    continue
                if not isinstance(d, dict):
                    # Valid JSON but not a record (list, number, ...): nothing to read.
                    continue
                if sid is None:
                    sid = d.get("sessionId")
                ts = d.get("timestamp")
                if not isinstance(ts, str):
                    ts = None            # the bucketing below slices the string
                if ts:
                    s_first = s_first or ts
                    s_last = ts
                m = d.get("message")
                if not isinstance(m, dict):
                    continue
                t = d.get("type")
                if t == "assistant":
                    u = m.get("usage")
                    if not isinstance(u, dict):
                        u = {}
                    it = u.get("input_tokens", 0) or 0
                    ot = u.get("output_tokens", 0) or 0
                    cc = u.get("cache_creation_input_tokens", 0) or 0
                    cr = u.get("cache_read_input_tokens", 0) or 0
                    tot["out"] += ot
                    tot["inp"] += it
                    tot["cc"] += cc
                    tot["cr"] += cr
                    tot["asst"] += 1
                    s_asst += 1
                    s_out += ot
                    model = m.get("model") or "unknown"
                    by_model_calls[model] += 1
                    by_model_out[model] += ot
                    ctx_sizes.append(it + cc + cr)
                    out_sizes.append(ot)
                    if ts:
                        mn, hr, day = ts[:16], ts[:13], ts[:10]
                        per_min_calls[mn] += 1
                        per_hour_out[hr] += ot
                        per_day[day][0] += ot
                        per_day[day][1] += 1
                        if sid:
                            per_min_sessions[mn].add(sid)
                            per_day_sessions[day].add(sid)
                    content = m.get("content")
                    for c in (content if isinstance(content, list) else ()):
                        if not (isinstance(c, dict) and c.get("type") == "tool_use"):
                            continue
                        nm = c.get("name", "?")
                        tools[nm] += 1
                        inp = c.get("input")
                        if not isinstance(inp, dict):
                            inp = {}
                        if nm == "Skill":
                            skills[inp.get("skill", "?")] += 1
                        elif nm == "Agent":
                            agent_types[inp.get("subagent_type") or "general-purpose"] += 1
                elif t == "user":
                    c = m.get("content")
                    if isinstance(c, str):
                        txt = c
                    elif isinstance(c, list):
                        txt = " ".join(str(x.get("text") or "") for x in c
                                       if isinstance(x, dict) and x.get("type") == "text")
                    else:
                        txt = ""
                    if "<usage>" in txt or "<task-notification>" in txt:
                        wf_count += 1
                        ac = g(r"<agent_count>(\d+)", txt)
                        st = g(r"<subagent_tokens>(\d+)", txt)
                        if ac:
                            wf_agent_counts.append(int(ac))
                        if st:
                            wf_subagent_tokens += int(st)
        if s_asst:
            dur = 0.0
            if s_first and s_last:
                a, b = fiso(s_first), fiso(s_last)
                if a and b:
                    try:
                        dur = round((b - a).total_seconds() / 3600, 3)
                    except TypeError:
                        # One timestamp carries a UTC offset and the other does not.
                        dur = 0.0
            sess_summ.append((dur, s_asst, s_out))

    ctx_sizes.sort()
    out_sizes.sort()
    durs = sorted(x[0] for x in sess_summ)
    turns = sorted(x[1] for x in sess_summ)
    # pct() returns an int, so percentiles of fractional hours are taken in
    # thousandths of an hour and scaled back; otherwise a 0.4 h median reads 0.
    dur_milli = [round(x * 1000) for x in durs]

    # concurrency / throughput
    peak_calls_min = per_min_calls.most_common(1)[0] if per_min_calls else ("", 0)
    min_call_counts = sorted(per_min_calls.values())
    peak_sessions_min = max(((mn, len(s)) for mn, s in per_min_sessions.items()),
                            key=lambda x: x[1], default=("", 0))
    sess_per_min = sorted(len(s) for s in per_min_sessions.values())
    peak_hour = per_hour_out.most_common(1)[0] if per_hour_out else ("", 0)
    days = sorted(per_day.keys())
    daily = [{"day": d, "out": per_day[d][0], "calls": per_day[d][1],
              "sessions": len(per_day_sessions[d])} for d in days]
    busiest = max(daily, key=lambda x: x["out"], default=None)
    wf_agent_counts.sort()

    res = dict(
        window=dict(files=len(files), sessions_with_calls=len(sess_summ),
                    first_day=days[0] if days else None, last_day=days[-1] if days else None,
                    active_days=len(days)),
        totals=dict(assistant_calls=tot["asst"], output_tokens=tot["out"], fresh_input_tokens=tot["inp"],
                    cache_creation_tokens=tot["cc"], cache_read_tokens=tot["cr"],
                    total_processed=tot["out"] + tot["inp"] + tot["cc"] + tot["cr"]),
        model_mix=dict(calls=dict(by_model_calls.most_common()),
                       output_tokens=dict(by_model_out.most_common())),
        context_size_per_call=dict(n=len(ctx_sizes), p50=pct(ctx_sizes, 50), p90=pct(ctx_sizes, 90),
                                   p95=pct(ctx_sizes, 95), p99=pct(ctx_sizes, 99),
                                   max=ctx_sizes[-1] if ctx_sizes else 0,
                                   mean=int(sum(ctx_sizes) / len(ctx_sizes)) if ctx_sizes else 0),
        generation_per_call=dict(p50=pct(out_sizes, 50), p90=pct(out_sizes, 90), p95=pct(out_sizes, 95),
                                 p99=pct(out_sizes, 99), max=out_sizes[-1] if out_sizes else 0,
                                 mean=int(sum(out_sizes) / len(out_sizes)) if out_sizes else 0),
        concurrency=dict(
            peak_calls_per_min=dict(minute=peak_calls_min[0], calls=peak_calls_min[1]),
            p99_calls_per_min=pct(min_call_counts, 99), p95_calls_per_min=pct(min_call_counts, 95),
            peak_parallel_sessions=dict(minute=peak_sessions_min[0], sessions=peak_sessions_min[1]),
            p99_parallel_sessions=pct(sess_per_min, 99), p95_parallel_sessions=pct(sess_per_min, 95),
        ),
        throughput=dict(peak_output_tokens_per_hour=dict(hour=peak_hour[0], tokens=peak_hour[1]),
                        peak_sustained_tok_per_s=round(peak_hour[1] / 3600, 1)),
        workflows=dict(dispatches=wf_count, subagent_tokens=wf_subagent_tokens,
                       agent_count_p50=pct(wf_agent_counts, 50), agent_count_p95=pct(wf_agent_counts, 95),
                       agent_count_max=wf_agent_counts[-1] if wf_agent_counts else 0,
                       total_subagents=sum(wf_agent_counts)),
        sessions=dict(count=len(sess_summ),
                      median_duration_h=pct(dur_milli, 50) / 1000.0 if durs else 0,
                      p95_duration_h=pct(dur_milli, 95) / 1000.0 if durs else 0,
                      max_duration_h=durs[-1] if durs else 0,
                      median_asst_turns=pct(turns, 50), max_asst_turns=turns[-1] if turns else 0,
                      sessions_per_active_day=round(len(sess_summ) / max(len(days), 1), 1)),
        tools=dict(tools.most_common(25)),
        skills=dict(skills.most_common(20)),
        agent_types=dict(agent_types.most_common(20)),
        busiest_day=busiest,
        daily=daily,
    )
    txt = json.dumps(res, indent=1)
    if out_path:
        with open(out_path, "w", encoding="utf-8") as out_fh:
            out_fh.write(txt)
        print("WROTE", out_path, len(txt), "bytes")

    # always print a compact human summary
    T = res["totals"]
    C = res["context_size_per_call"]
    G = res["generation_per_call"]
    X = res["concurrency"]
    W = res["workflows"]
    print("=== SUMMARY ===")
    print("window:", res["window"])
    print("assistant_calls:", T["assistant_calls"], "| output:", T["output_tokens"], "| total_processed:", T["total_processed"])
    print("model_mix calls:", res["model_mix"]["calls"])
    print("ctx/call p50/p90/p95/p99/max:", C["p50"], C["p90"], C["p95"], C["p99"], C["max"])
    print("gen/call p50/p95/max:", G["p50"], G["p95"], G["max"])
    print("peak calls/min:", X["peak_calls_per_min"], "| peak parallel sessions:", X["peak_parallel_sessions"])
    print("throughput:", res["throughput"])
    print("workflows:", W)
    print("sessions:", res["sessions"])
    print("busiest_day:", res["busiest_day"])


if __name__ == "__main__":
    main()

anvil_serving/_role_split.py ADDED Viewed

@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""role_split.py - split Claude Code per-call context/generation by ROLE.
+Splits assistant calls by isSidechain (subagent vs main orchestrator) and by model,
+so we can size the LOCAL specialist context ceiling from the subagent distribution
+rather than the orchestrator's long-context tail. Stdlib only.
+"""
+import json
+import os
+import glob
+import sys
+from collections import defaultdict
+PROJECTS = os.environ.get("ANVIL_CLAUDE_LOGS") or os.path.expanduser("~/.claude/projects")
def pct(v, q):
    """Linearly interpolated q-th percentile of the ascending list ``v``, as int.

    Returns 0 for an empty list; q <= 0 and q >= 100 return the first / last
    element as-is.
    """
    if not v:
        return 0
    if q <= 0:
        return v[0]
    if q >= 100:
        return v[-1]
    top = len(v) - 1
    rank = top * q / 100.0
    below = int(rank)
    above = min(below + 1, top)
    share = rank - below            # how far rank sits between the two neighbours
    return int(round(v[below] * (1 - share) + v[above] * share))
def summarize(ctx, out):
    """Summarize one group of calls: count, context percentiles, generation percentiles.

    ``ctx`` and ``out`` are the per-call context sizes and output-token counts;
    both are sorted here before the percentiles are taken. Empty inputs produce
    zeros throughout.
    """
    ctx_sorted = sorted(ctx)
    gen_sorted = sorted(out)
    ctx_stats = {f"p{q}": pct(ctx_sorted, q) for q in (50, 90, 95, 99)}
    ctx_stats["max"] = ctx_sorted[-1] if ctx_sorted else 0
    ctx_stats["mean"] = int(sum(ctx_sorted) / len(ctx_sorted)) if ctx_sorted else 0
    gen_stats = {
        "p50": pct(gen_sorted, 50),
        "p95": pct(gen_sorted, 95),
        "max": gen_sorted[-1] if gen_sorted else 0,
    }
    return {"n": len(ctx_sorted), "ctx": ctx_stats, "gen": gen_stats}
def main():
    """Split assistant calls by role (subagent vs main) and by model, then report.

    Reads every ``*.jsonl`` file below PROJECTS (recursively). A record counts
    as a subagent call when its ``isSidechain`` field is truthy. Prints the
    per-role and per-model context / generation summaries and the share of
    subagent calls that fit under each context ceiling. With ``--out PATH``
    the full result is also written to PATH as JSON.

    Unreadable files, blank lines, invalid JSON and JSON values that are not
    objects are skipped.

    Raises:
        SystemExit: if ``--out`` is given without a following path.
    """
    out_path = None
    if "--out" in sys.argv:
        value_pos = sys.argv.index("--out") + 1
        if value_pos >= len(sys.argv):
            raise SystemExit("--out requires a file path")
        out_path = sys.argv[value_pos]
    files = glob.glob(os.path.join(PROJECTS, "**", "*.jsonl"), recursive=True)
    groups = defaultdict(lambda: ([], []))   # key -> (ctx_list, out_list)
    by_model = defaultdict(lambda: ([], []))
    seen_sidechain_field = 0
    total = 0
    # buckets to show what share of SUBAGENT calls fit a given local context ceiling
    ceilings = [16384, 32768, 65536, 131072, 262144]
    sub_ctx_all = []
    for f in files:
        try:
            fh = open(f, encoding="utf-8", errors="replace")
        except OSError:
            continue
        with fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    d = json.loads(line)
                except (ValueError, RecursionError):
                    continue
                if not isinstance(d, dict):
                    # Valid JSON but not a record: nothing to classify.
                    continue
                if d.get("type") != "assistant":
                    continue
                m = d.get("message")
                if not isinstance(m, dict):
                    continue
                u = m.get("usage")
                if not isinstance(u, dict):
                    u = {}
                ctx = ((u.get("input_tokens", 0) or 0)
                       + (u.get("cache_creation_input_tokens", 0) or 0)
                       + (u.get("cache_read_input_tokens", 0) or 0))
                ot = u.get("output_tokens", 0) or 0
                total += 1
                sc = d.get("isSidechain")
                if sc is not None:
                    seen_sidechain_field += 1
                role = "subagent" if sc else "main"
                groups[role][0].append(ctx)
                groups[role][1].append(ot)
                model = m.get("model") or "unknown"
                by_model[model][0].append(ctx)
                by_model[model][1].append(ot)
                if role == "subagent":
                    sub_ctx_all.append(ctx)
    res = dict(
        total_assistant_calls=total,
        records_with_isSidechain_field=seen_sidechain_field,
        by_role={k: summarize(*v) for k, v in groups.items()},
        by_model={k: summarize(*v)
                  for k, v in sorted(by_model.items(), key=lambda x: -len(x[1][0]))},
    )
    # what context ceiling covers what fraction of SUBAGENT calls
    n = len(sub_ctx_all)
    cov = {}
    for c in ceilings:
        cov[c] = round(100.0 * sum(1 for x in sub_ctx_all if x <= c) / n, 1) if n else 0
    res["subagent_context_coverage_pct"] = cov
    txt = json.dumps(res, indent=1)
    if out_path:
        with open(out_path, "w", encoding="utf-8") as out_fh:
            out_fh.write(txt)
        print("WROTE", out_path)
    print("=== ROLE SPLIT ===")
    print("total assistant calls:", total, "| with isSidechain field:", seen_sidechain_field)
    for role, v in res["by_role"].items():
        print(f"\n[{role}] n={v['n']:,}")
        print("  ctx p50/p90/p95/p99/max:", v["ctx"]["p50"], v["ctx"]["p90"], v["ctx"]["p95"], v["ctx"]["p99"], v["ctx"]["max"])
        print("  gen p50/p95/max:", v["gen"]["p50"], v["gen"]["p95"], v["gen"]["max"])
    print("\n[by model] (ctx p50/p95/max | gen p50)")
    for mdl, v in list(res["by_model"].items())[:8]:
        print(f"  {mdl:28} n={v['n']:>7,}  ctx {v['ctx']['p50']:>7,}/{v['ctx']['p95']:>7,}/{v['ctx']['max']:>7,}  gen p50 {v['gen']['p50']}")
    print("\nSUBAGENT context coverage (share of subagent calls <= ceiling):")
    for c, p in res["subagent_context_coverage_pct"].items():
        print(f"  <= {c:>7,} tok : {p}%")


if __name__ == "__main__":
    main()

anvil_serving/_sync.py ADDED Viewed

@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+"""sync_model_cards.py - catalog local models: pull HF cards + local config, extract serving facts.
+Scans HuggingFace caches and plain model dirs, downloads each model's README card,
+reads local config.json / generation_config.json, extracts the fields that matter for
+SERVING (format GGUF-vs-safetensors, quant, context, license, thinking-default,
+recommended sampling, benchmarks, serving notes), and writes:
+  cards/<owner>__<repo>.md        (raw card)
+  cards/<owner>__<repo>.json      (extracted summary)
+  INDEX.md                        (master table)
+Runs on the host (WSL python): sees /home/<user>/.cache + /mnt/c/... + the internet.
+Stdlib only. Public models need no token; set HF_TOKEN env for gated ones.
+"""
+import os
+import re
+import json
+import glob
+import time
+import urllib.request
+import urllib.error
# Output root: $ANVIL_MODELS_OUT when set, otherwise ./model-library under the
# current working directory.
HERE = os.getenv("ANVIL_MODELS_OUT") or os.path.join(os.getcwd(), "model-library")
# Card directory and the state file used for new-model detection.
CARDS, STATE = (os.path.join(HERE, leaf) for leaf in ("cards", "_seen.json"))
# NOTE: runs at import time, so importing this module creates the directory.
os.makedirs(CARDS, exist_ok=True)
# Scan roots: (path, kind). HF caches use models--owner--repo; "dir" = plain model folders.
def _auto_roots():
    """Collect the existing directories to scan, as ``(path, kind)`` pairs.

    Sources, in order: ``ANVIL_HF_ROOTS`` entries ("hf"), the user's
    ``~/.cache/huggingface/hub``, the same cache under ``%USERPROFILE%``, any
    ``/mnt/c/Users/*/.cache/huggingface/hub`` ("hf"); then ``ANVIL_MODEL_DIRS``
    entries and ``/mnt/c/Users/*/models`` ("dir"). Both environment variables
    are split on ``os.pathsep``.

    Paths are normalised; non-directories and duplicates are dropped. Empty
    entries (e.g. from a trailing separator) are ignored.
    """
    roots, seen = [], set()

    def add(p, kind):
        p = (p or "").strip()
        if not p:
            # normpath("") is ".", which would register the CWD as a scan root.
            return
        p = os.path.normpath(p)
        if p not in seen and os.path.isdir(p):
            seen.add(p)
            roots.append((p, kind))

    for p in (os.environ.get("ANVIL_HF_ROOTS") or "").split(os.pathsep):
        add(p, "hf")
    add(os.path.expanduser("~/.cache/huggingface/hub"), "hf")
    up = os.environ.get("USERPROFILE")
    if up:
        add(os.path.join(up, ".cache", "huggingface", "hub"), "hf")
    for u in glob.glob("/mnt/c/Users/*/.cache/huggingface/hub"):
        add(u, "hf")
    for d in (os.environ.get("ANVIL_MODEL_DIRS") or "").split(os.pathsep):
        add(d, "dir")
    for u in glob.glob("/mnt/c/Users/*/models"):
        add(u, "dir")
    return roots


ROOTS = _auto_roots()
# --- sm_120 (Blackwell) SGLang loadability hazards -------------------------
# Safetensors normally means "SGLang can load it", but a few quant+arch
# combinations load without error and then hang or emit silent zeros on sm_120.
# That judgment lives in this small table: append a row when a new case shows
# up rather than abstracting it.
# Refs: FP8-MoE hang -> sglang#16816 ; NVFP4 GEMM silent zeros ->
#       flashinfer#2577, vllm#24921.
# Row layout: (quant substring, requires_moe, caveat)
SM120_HAZARDS = [
    (
        "fp8",
        True,
        "FP8-MoE hangs post-load on sm_120 (sglang#16816)",
    ),
    (
        "nvfp4",
        False,
        "NVFP4 GEMM broken on sm_120: silent zeros (flashinfer#2577, vllm#24921)",
    ),
]
# Config keys that mark a mixture-of-experts model, by family:
#   num_experts        - generic
#   num_local_experts  - Mixtral / gpt-oss
#   n_routed_experts   - DeepSeek-V3
_MOE_KEYS = (
    "num_experts",
    "num_local_experts",
    "n_routed_experts",
)
# Fallback: substrings looked for in model_type / architectures.
_MOE_SUBSTRINGS = (
    "moe",
    "mixtral",
    "deepseek",
    "qwen3_moe",
    "gpt_oss",
)
def _is_moe(cfg, txt):
    """Return True when the config looks like a mixture-of-experts model.

    First checks ``cfg`` and ``txt`` (the nested text config) for any of the
    _MOE_KEYS; failing that, looks for a _MOE_SUBSTRINGS marker in the
    lower-cased ``model_type`` and ``architectures`` of ``cfg``.
    """
    for section in (cfg, txt):
        if isinstance(section, dict) and any(key in section for key in _MOE_KEYS):
            return True
    model_type = str(cfg.get("model_type") or "")
    arch_names = " ".join(str(arch) for arch in (cfg.get("architectures") or []))
    haystack = f"{model_type} {arch_names}".lower()
    return any(marker in haystack for marker in _MOE_SUBSTRINGS)
def _parse_quant(cfg):
    """Derive (qmethod, qbits, qsig) from cfg's quantization_config.

    qsig folds BOTH the raw quant strings (catches native-fp8 repos whose method
    is literally 'fp8') AND tokens derived from the quantized WEIGHT spec.
    compressed-tensors reports quant_method='compressed-tensors',
    format='float-quantized' and hides the real precision in
    config_groups[*].weights {num_bits, type}; surface it as fp8/nvfp4.
    MXFP4 (gpt-oss) is servable via triton, so it is never folded to nvfp4.

    Malformed shapes are tolerated rather than raised on: a quantization_config,
    config_groups, group or weights entry that is not a JSON object is treated
    as absent. When several groups carry num_bits, the last one wins.

    Returns:
        (qmethod, qbits, qsig): quant_method (or format) as found, the weight
        bit width or None, and the lower-cased signature string.
    """
    quant = cfg.get("quantization_config")
    if not isinstance(quant, dict):          # JSON null, string, list -> no quant info
        quant = {}
    qmethod = quant.get("quant_method") or quant.get("format")
    qbits, qtype = None, ""
    groups = quant.get("config_groups")
    for group in (groups.values() if isinstance(groups, dict) else ()):
        weights = group.get("weights") if isinstance(group, dict) else None
        if not isinstance(weights, dict) or not weights.get("num_bits"):
            continue
        qbits = weights["num_bits"]
        qtype = str(weights.get("type") or "").lower()
    qsig = (str(qmethod or "") + " " + str(quant.get("format") or "")).lower()
    if qbits == 8 and qtype == "float":
        qsig += " fp8"
    elif qbits == 4 and "mxfp4" not in qsig and ("float" in qtype or "float" in qsig):
        qsig += " nvfp4"
    return qmethod, qbits, qsig
def sm120_hazard(fmt, qsig, cfg, txt):
    """Return the sm_120 caveat for a hazardous safetensors quant+arch, else None.

    Non-safetensors formats are never flagged here. Otherwise the first
    SM120_HAZARDS row whose substring occurs in the lower-cased ``qsig`` wins,
    provided the row either does not require MoE or the config is MoE.
    """
    if fmt != "safetensors":
        return None
    signature = (qsig or "").lower()
    is_moe = _is_moe(cfg, txt)
    matches = (
        caveat
        for needle, requires_moe, caveat in SM120_HAZARDS
        if needle in signature and (is_moe or not requires_moe)
    )
    return next(matches, None)
def dir_size_bytes(p):
    """Total on-disk bytes for a model dir, counting each resolved file once.

    When ``p`` has a ``blobs`` subdirectory only that subtree is walked;
    otherwise the whole of ``p`` is. Every file is resolved with ``realpath``
    first, so several symlinks to one target are counted a single time. Files
    that cannot be stat'ed are skipped.
    """
    blob_dir = os.path.join(p, "blobs")
    walk_root = blob_dir if os.path.isdir(blob_dir) else p
    counted = set()
    total = 0
    for folder, _subdirs, names in os.walk(walk_root):
        for name in names:
            try:
                resolved = os.path.realpath(os.path.join(folder, name))
                if resolved not in counted:
                    counted.add(resolved)
                    total += os.path.getsize(resolved)
            except OSError:
                continue
    return total
def dir_size_gb(p):
    """Size of a model dir in GB (1e9 bytes), rounded to one decimal for display.

    Call dir_size_bytes when an exact total is needed.
    """
    total_bytes = dir_size_bytes(p)
    return round(total_bytes / 1e9, 1)
def newest_snapshot(model_dir):
    """Pick the best directory under ``<model_dir>/snapshots``, or None if there is none.

    Candidates are ranked by: has a config.json, then number of entries, then
    modification time - so a populated snapshot beats a newer but empty one.
    """
    pattern = os.path.join(model_dir, "snapshots", "*")
    candidates = [path for path in glob.glob(pattern) if os.path.isdir(path)]
    if not candidates:
        return None

    def rank(path):
        has_config = os.path.exists(os.path.join(path, "config.json"))
        return (has_config, len(os.listdir(path)), os.path.getmtime(path))

    return max(candidates, key=rank)
def load_json(p):
    """Load a JSON object from ``p``; return {} on any failure or non-object JSON.

    A missing or unreadable file, invalid JSON, or a top-level value that is
    not a dict all yield an empty dict, so callers can use ``.get`` safely.
    """
    try:
        with open(p, encoding="utf-8") as handle:
            parsed = json.load(handle)
    except Exception:
        return {}
    if isinstance(parsed, dict):
        return parsed
    return {}
def detect_format(d):
    """Classify a model dir by weight files found anywhere beneath it.

    Returns "GGUF" if any ``*.gguf`` file exists, else "safetensors" if any
    ``*.safetensors`` file exists, else "?". GGUF takes precedence when both
    are present.
    """
    for suffix, label in ((".gguf", "GGUF"), (".safetensors", "safetensors")):
        if glob.glob(os.path.join(d, "**", "*" + suffix), recursive=True):
            return label
    return "?"
def fetch_card(owner, repo):
    """Download ``README.md`` for ``owner/repo`` from huggingface.co; None on failure.

    Sends a bearer token when the HF_TOKEN environment variable is set. The
    request times out after 30 seconds. The body is decoded as UTF-8 with
    undecodable bytes replaced. Any error while fetching returns None.
    """
    headers = {"User-Agent": "model-card-sync"}
    token = os.environ.get("HF_TOKEN")
    if token:
        headers["Authorization"] = "Bearer " + token
    request = urllib.request.Request(
        f"https://huggingface.co/{owner}/{repo}/raw/main/README.md",
        headers=headers,
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            body = response.read()
        return body.decode("utf-8", "replace")
    except Exception:
        return None
def parse_frontmatter(card):
    """Extract simple ``key: value`` pairs from a card's leading ``---`` block.

    This is a line-based reader, not a YAML parser. It understands:
      * ``key: value`` scalars (surrounding quotes are removed);
      * ``key:`` followed by ``- item`` lines, stored as the items joined
        with ", " (so ``base_model:`` given as a one-element list still
        resolves to that element).
    Keys are lower-cased. Nested mappings are not interpreted.

    Returns:
        dict of str -> str; empty when ``card`` is falsy, does not start with
        ``---``, or the block is not terminated by a line starting with ``---``.
    """
    fm = {}
    if not card or not card.startswith("---"):
        return fm
    end = card.find("\n---", 3)
    if end <= 0:
        return fm

    def unquote(value):
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
            return value[1:-1]
        return value

    open_key = None     # key seen with no inline value, waiting for "- item" lines
    listed = set()      # keys whose value is being built from list items
    for line in card[3:end].splitlines():
        kv = re.match(r"\s*([A-Za-z_]+):\s*(.*?)\s*$", line)
        if kv:
            key, value = kv.group(1).lower(), kv.group(2).strip()
            if value:
                fm[key] = unquote(value)
                listed.discard(key)
                open_key = None
            else:
                open_key = key
            continue
        item = re.match(r"\s*-\s+(.+?)\s*$", line)
        if item and open_key and not re.match(r"[A-Za-z_]+:(\s|$)", item.group(1)):
            value = unquote(item.group(1))
            if open_key in listed:
                fm[open_key] = fm[open_key] + ", " + value
            else:
                fm[open_key] = value
                listed.add(open_key)
            continue
        if line.strip():
            # Anything else (nested mapping, comment, ...) ends the open list.
            open_key = None
    return fm
def extract_from_card(card):
    """Pull serving-relevant hints out of a model card's text with heuristics.

    Keys that may appear in the result:
      thinking_default  - True when the card mentions "thinking mode" together
                          with "default", or mentions "enable_thinking".
      sampling_hint     - "temp=X, top_p=Y" from the first ``temperature = X``
                          that is followed within 80 characters by ``top_p = Y``.
      context_hint      - the largest number of 4+ digits/commas written
                          directly before "tokens" or "context".
      benchmarks        - dict of benchmark name -> first number found within
                          30 non-digit characters after the name.
      mentions_sglang / mentions_vllm - case-insensitive substring checks.

    Returns an empty dict when ``card`` is falsy.
    """
    out = {}
    if not card:
        return out
    low = card.lower()
    out["thinking_default"] = ("thinking mode" in low and "default" in low) or "enable_thinking" in low
    # first recommended sampling line; the number pattern must end on a digit so
    # a sentence-ending period ("top_p=0.95.") is not captured as part of it
    num = r"([0-9]*\.?[0-9]+)"
    m = re.search(r"temperature\s*=\s*" + num + r".{0,80}?top_p\s*=\s*" + num, card, re.S | re.I)
    if m:
        out["sampling_hint"] = f"temp={m.group(1)}, top_p={m.group(2)}"
    # context length mentions
    ctxs = re.findall(r"([0-9][0-9,]{3,})\s*(?:tokens|context)", card, re.I)
    if ctxs:
        out["context_hint"] = max(int(c.replace(",", "")) for c in ctxs)
    # benchmark hints
    for bench in ["SWE-bench Verified", "SWE-Bench Verified", "Terminal-Bench", "BFCL", "TAU2", "LiveCodeBench"]:
        m = re.search(re.escape(bench) + r"[^0-9]{0,30}([0-9]{1,3}\.?[0-9]?)", card)
        if m:
            out.setdefault("benchmarks", {})[bench] = m.group(1)
    out["mentions_sglang"] = "sglang" in low
    out["mentions_vllm"] = "vllm" in low
    return out
def summarize(owner, repo, model_dir, kind):
    """Build, persist and return the serving summary for one discovered model.

    Reads ``config.json`` / ``generation_config.json`` from the model's
    snapshot (``kind == "hf"``) or from ``model_dir`` itself, derives format,
    quantization and the sm_120 caveat, and - when ``owner`` is known - fetches
    the model card and merges the hints extracted from it.

    Side effects: writes ``<stem>.json`` (always) and ``<stem>.md`` (when a
    card was fetched) into CARDS, where stem is ``owner__repo`` or just
    ``repo``. Path separators in the stem are replaced with "_" so the name is
    always a single file inside CARDS.

    Returns:
        The summary dict that was written to the JSON file.
    """
    snap = newest_snapshot(model_dir) if kind == "hf" else model_dir
    cfg = load_json(os.path.join(snap, "config.json")) if snap else {}
    gen = load_json(os.path.join(snap, "generation_config.json")) if snap else {}
    text_cfg = cfg.get("text_config")
    txt = text_cfg if isinstance(text_cfg, dict) else {}
    qmethod, qbits, qsig = _parse_quant(cfg)
    fmt = detect_format(snap) if snap else "?"
    sm120 = sm120_hazard(fmt, qsig, cfg, txt)
    s = dict(
        id=f"{owner}/{repo}" if owner else repo,
        owner=owner, repo=repo, local_path=model_dir, source=kind,
        size_gb=dir_size_gb(model_dir),
        format=fmt,
        architectures=cfg.get("architectures"),
        model_type=cfg.get("model_type"),
        context=cfg.get("max_position_embeddings") or txt.get("max_position_embeddings"),
        quant=qmethod, quant_bits=qbits,
        gen_sampling={k: gen[k] for k in ("temperature", "top_p", "top_k") if k in gen},
        sglang_loadable=(fmt == "safetensors" and not sm120),
        sm120_caveat=sm120,
        synced=time.strftime("%Y-%m-%d %H:%M"),
    )
    # A "/" or "\" in owner/repo would point the write at a subdirectory that
    # does not exist; flatten it so the files always land directly in CARDS.
    stem = f"{owner}__{repo}" if owner else str(repo)
    stem = stem.replace("/", "_").replace("\\", "_")

    card = fetch_card(owner, repo) if owner else None
    if card:
        with open(os.path.join(CARDS, stem + ".md"), "w", encoding="utf-8") as card_fh:
            card_fh.write(card)
        fm = parse_frontmatter(card)
        s["license"] = fm.get("license")
        s["pipeline_tag"] = fm.get("pipeline_tag")
        s["base_model"] = fm.get("base_model")
        s.update(extract_from_card(card))
        s["card_saved"] = f"cards/{stem}.md"
    else:
        s["card_saved"] = None
    with open(os.path.join(CARDS, stem + ".json"), "w", encoding="utf-8") as summary_fh:
        summary_fh.write(json.dumps(s, indent=1))
    return s
def discover():
    """List model folders under every scan root as (owner, repo, path, kind).

    For "hf" roots, each ``models--<owner>--<repo>`` directory is one model.
    For "dir" roots, each immediate subdirectory is one model; its identity is
    taken from ``config.json``'s ``_name_or_path`` only when that value has
    the exact ``owner/repo`` shape. Anything else (a local checkpoint path,
    ``./out``, a missing or non-string value) falls back to owner=None and the
    directory name as repo, so no card lookup is attempted for it.
    """
    found = []
    for root, kind in ROOTS:
        if not os.path.isdir(root):
            continue
        if kind == "hf":
            for d in glob.glob(os.path.join(root, "models--*")):
                parts = os.path.basename(d).split("--")
                if len(parts) < 3:
                    continue
                found.append((parts[1], "--".join(parts[2:]), d, "hf"))
            continue
        for d in glob.glob(os.path.join(root, "*")):
            if not os.path.isdir(d):
                continue
            cfg = load_json(os.path.join(d, "config.json"))
            nm = cfg.get("_name_or_path")
            hub_id = re.fullmatch(r"([\w.\-]+)/([\w.\-]+)", nm) if isinstance(nm, str) else None
            if hub_id and hub_id.group(1) not in (".", ".."):
                owner, repo = hub_id.groups()
            else:
                owner, repo = None, os.path.basename(d)
            found.append((owner, repo, d, "dir"))
    return found
def is_real_model_row(r):
    """SHARED 3-part real-model gate (used by write_index AND cache_prune).

    A row counts as a real servable model only when all of these hold:
      1. its owner is not "unslothai" (mirror),
      2. it is not a tiny (<0.2GB) dir of unknown format,
      3. there is POSITIVE evidence of a model: a model_type or a known
         weights format (safetensors / GGUF).
    Gate 3 is the decisive one: without it a dataset, partial download or
    tokenizer-only dir (no weights, no model_type, >=0.2GB) would pass and,
    in cache_prune, become a DELETION candidate. The three parts live in one
    place so the prune planner and the index cannot drift apart.
    """
    fmt = r.get("format")
    if r.get("owner") == "unslothai":
        return False
    is_tiny = (r.get("size_gb") or 0) < 0.2
    if is_tiny and fmt == "?":
        return False
    has_model_type = bool(r.get("model_type"))
    return has_model_type or fmt in ("safetensors", "GGUF")
def write_index(rows):
    """Write ``INDEX.md`` under HERE: one markdown table row per real model.

    Rows are filtered through is_real_model_row, then ordered with
    safetensors models first and larger models before smaller ones. Each row
    must carry an ``id``; every other field is optional and rendered with a
    placeholder when missing.
    """
    rows = [r for r in rows if is_real_model_row(r)]
    # .get: a row can pass the gate on model_type alone and carry no "format".
    rows.sort(key=lambda r: (r.get("format") != "safetensors", -(r.get("size_gb") or 0)))
    L = ["# Model Library — Index", "",
         f"_Auto-generated by `sync_model_cards.py` — {time.strftime('%Y-%m-%d %H:%M')}. "
         f"{len(rows)} models. Cards in `cards/`._", "",
         "| Model | Format | SGLang? | Params/size | Context | Quant | License | Thinking | Coding bench | Local |",
         "|---|---|---|---|---|---|---|---|---|---|"]
    for r in rows:
        bench = ""
        b = r.get("benchmarks") or {}
        for k in ("SWE-bench Verified", "SWE-Bench Verified", "Terminal-Bench"):
            if k in b:
                bench = f"{k.split('-')[0]} {b[k]}"
                break
        ctx = r.get("context") or r.get("context_hint") or ""
        # Show whole-number contexts of at least 1024 as "<n>K"; anything else verbatim.
        ctx = f"{int(ctx)//1024}K" if str(ctx).isdigit() and int(ctx) >= 1024 else (ctx or "")
        L.append("| {id} | {fmt} | {ok} | {sz} GB | {ctx} | {q} | {lic} | {th} | {bn} | {src} |".format(
            id=r["id"], fmt=r.get("format", "?"),
            ok=("✅" if r.get("sglang_loadable")
                else "⚠️ sm_120" if r.get("sm120_caveat")
                else "❌ (llama.cpp)" if r.get("format") == "GGUF"
                else "?"),
            sz=r.get("size_gb", "?"), ctx=ctx,
            q=(f"{r.get('quant') or ''} {r.get('quant_bits') or ''}".strip() or "—"),
            lic=r.get("license") or "—",
            th=("yes" if r.get("thinking_default") else "—"),
            bn=bench or "—", src=("win" if "/mnt/c" in r.get("local_path", "") else "wsl")))
    with open(os.path.join(HERE, "INDEX.md"), "w", encoding="utf-8") as index_fh:
        index_fh.write("\n".join(L) + "\n")
def main():
    """Discover models, summarize each, write the index and report new arrivals.

    A failure while summarizing one model is printed and does not stop the
    run. After writing INDEX.md, the ids of models with a known weights format
    are compared against the ids stored in STATE by the previous run; the
    difference is printed on the ``NEW_MODELS:`` line and STATE is rewritten.
    """
    models = discover()
    print(f"discovered {len(models)} model folders")
    rows = []
    for owner, repo, d, kind in models:
        try:
            s = summarize(owner, repo, d, kind)
            rows.append(s)
            print(f"  [{s.get('format'):>11}] {s['id']}  {s.get('size_gb')}GB  card={'y' if s.get('card_saved') else 'n'}")
        except Exception as e:
            # Top-level boundary: report and move on to the next model.
            print(f"  ERROR {owner}/{repo}: {e}")
    write_index(rows)
    # new-model detection (vs last run) for the Cowork analysis task
    real_ids = [r["id"] for r in rows if r.get("format") in ("safetensors", "GGUF")]
    prior_ids = load_json(STATE).get("ids")
    # Tolerate a hand-edited / corrupted state file: only string ids count.
    prior = {i for i in prior_ids if isinstance(i, str)} if isinstance(prior_ids, list) else set()
    new_ids = [i for i in real_ids if i not in prior]
    with open(STATE, "w", encoding="utf-8") as state_fh:
        json.dump({"ids": real_ids, "updated": time.strftime("%Y-%m-%d %H:%M")},
                  state_fh, indent=1)
    print(f"wrote INDEX.md + {len(rows)} summaries to {HERE}")
    print("NEW_MODELS: " + (", ".join(new_ids) if new_ids else "none"))
def _loadable(cfg, fmt="safetensors"):
    """Self-check helper: reproduce summarize()'s loadability verdict for ``cfg``.

    Returns ``(clean, caveat)`` where ``caveat`` is the sm_120 hazard string or
    None, and ``clean`` is True only for safetensors without a caveat.
    """
    qsig = _parse_quant(cfg)[2]
    text_cfg = cfg.get("text_config") or {}
    caveat = sm120_hazard(fmt, qsig, cfg, text_cfg)
    clean = fmt == "safetensors" and not caveat
    return clean, caveat
def _selfcheck():
    """Startup gate: verify the sm_120 hazard derivation on known configs.

    Raises:
        AssertionError: with the failing expectation's message. The checks use
        an explicit raise instead of ``assert`` so they still run when Python
        is started with ``-O``.
    """
    def expect(cond, msg):
        if not cond:
            raise AssertionError(msg)

    # --- REAL derivation path: compressed-tensors hides FP8-ness in the weight
    #     spec (num_bits=8/type=float), NOT in the method string. ---
    ct_fp8 = {"quant_method": "compressed-tensors", "format": "float-quantized",
              "config_groups": {"group_0": {"weights": {"num_bits": 8, "type": "float"}}}}
    # FP8 + MoE (num_local_experts, Mixtral/gpt-oss style) must NOT be clean-loadable.
    cfg_fp8_moe = {"quantization_config": ct_fp8, "num_local_experts": 128,
                   "architectures": ["Qwen3MoeForCausalLM"]}
    ok, caveat = _loadable(cfg_fp8_moe)
    expect(not ok and caveat, "compressed-tensors FP8 on MoE must caveat (not clean)")
    # FP8 + MoE via DeepSeek-V3 key n_routed_experts must also caveat.
    cfg_fp8_ds = {"quantization_config": ct_fp8, "n_routed_experts": 256}
    ok, caveat = _loadable(cfg_fp8_ds)
    expect(not ok and caveat, "compressed-tensors FP8 on DeepSeek MoE must caveat")
    # NVFP4 via weight spec (num_bits=4/type=float) caveats on any arch.
    ct_nvfp4 = {"quantization_config": {"quant_method": "compressed-tensors",
                "config_groups": {"g": {"weights": {"num_bits": 4, "type": "float"}}}}}
    ok, caveat = _loadable(ct_nvfp4)
    expect(not ok and caveat, "compressed-tensors NVFP4 must caveat on any arch")
    # gpt-oss MXFP4 is SERVABLE via triton -> must NOT be flagged.
    gptoss = {"quantization_config": {"quant_method": "mxfp4"},
              "model_type": "gpt_oss", "num_local_experts": 128}
    ok, caveat = _loadable(gptoss)
    expect(ok and caveat is None, "gpt-oss MXFP4 is servable, must not be flagged")
    # Dense AWQ (int4) stays clean.
    awq = {"quantization_config": {"quant_method": "awq",
            "config_groups": {"g": {"weights": {"num_bits": 4, "type": "int"}}}}}
    ok, caveat = _loadable(awq)
    expect(ok and caveat is None, "dense AWQ (int4) stays clean-loadable")
    # JSON null quantization_config must not crash.
    ok, _ = _loadable({"quantization_config": None})
    expect(ok, "null quantization_config -> clean, no AttributeError")

    # --- Legacy string-folding path still holds (native-fp8 repos etc.). ---
    expect(sm120_hazard("safetensors", "fp8", {"num_experts": 128}, {}), "FP8-MoE should caveat")
    expect(sm120_hazard("safetensors", "nvfp4", {}, {}), "NVFP4 caveats on any arch")
    expect(sm120_hazard("safetensors", "fp8", {}, {}) is None, "dense FP8 is fine")
    expect(sm120_hazard("safetensors", "awq", {"num_experts": 128}, {}) is None, "AWQ-MoE clean")
    expect(sm120_hazard("safetensors", "awq", {}, {}) is None, "dense AWQ clean")
    expect(sm120_hazard("GGUF", "fp8", {"num_experts": 128}, {}) is None, "GGUF handled elsewhere")


if __name__ == "__main__":
    _selfcheck()
    main()