anvil-serving 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. anvil_serving/__init__.py +2 -0
  2. anvil_serving/_aggregate_usage.py +193 -0
  3. anvil_serving/_role_split.py +91 -0
  4. anvil_serving/_sync.py +359 -0
  5. anvil_serving/benchmark.py +224 -0
  6. anvil_serving/cache_prune.py +608 -0
  7. anvil_serving/cli.py +31 -0
  8. anvil_serving/config.py +40 -0
  9. anvil_serving/deploy.py +51 -0
  10. anvil_serving/eval.py +167 -0
  11. anvil_serving/models.py +22 -0
  12. anvil_serving/multiplexer.py +773 -0
  13. anvil_serving/preflight.py +144 -0
  14. anvil_serving/profile.py +22 -0
  15. anvil_serving/py.typed +0 -0
  16. anvil_serving/router/__init__.py +102 -0
  17. anvil_serving/router/__main__.py +45 -0
  18. anvil_serving/router/backends/__init__.py +32 -0
  19. anvil_serving/router/backends/cloud.py +470 -0
  20. anvil_serving/router/backends/local.py +62 -0
  21. anvil_serving/router/backends/relay.py +72 -0
  22. anvil_serving/router/calibrate.py +402 -0
  23. anvil_serving/router/classify.py +225 -0
  24. anvil_serving/router/commit_window.py +330 -0
  25. anvil_serving/router/config.py +338 -0
  26. anvil_serving/router/decision_log.py +236 -0
  27. anvil_serving/router/dialects/__init__.py +52 -0
  28. anvil_serving/router/dialects/anthropic.py +264 -0
  29. anvil_serving/router/dialects/openai.py +216 -0
  30. anvil_serving/router/discovery.py +60 -0
  31. anvil_serving/router/fallback.py +689 -0
  32. anvil_serving/router/fingerprint.py +133 -0
  33. anvil_serving/router/front_door.py +683 -0
  34. anvil_serving/router/intent.py +263 -0
  35. anvil_serving/router/internal.py +185 -0
  36. anvil_serving/router/metrics.py +428 -0
  37. anvil_serving/router/policy.py +308 -0
  38. anvil_serving/router/prices.py +92 -0
  39. anvil_serving/router/profile_bootstrap.py +528 -0
  40. anvil_serving/router/profile_store.py +374 -0
  41. anvil_serving/router/registry.py +356 -0
  42. anvil_serving/router/seams.py +267 -0
  43. anvil_serving/router/secrets.py +233 -0
  44. anvil_serving/router/serve.py +592 -0
  45. anvil_serving/router/tier0_keywords.json +7 -0
  46. anvil_serving/router/verify.py +733 -0
  47. anvil_serving/score.py +621 -0
  48. anvil_serving/serves.py +240 -0
  49. anvil_serving-0.4.0.dist-info/METADATA +474 -0
  50. anvil_serving-0.4.0.dist-info/RECORD +54 -0
  51. anvil_serving-0.4.0.dist-info/WHEEL +5 -0
  52. anvil_serving-0.4.0.dist-info/entry_points.txt +2 -0
  53. anvil_serving-0.4.0.dist-info/licenses/LICENSE +21 -0
  54. anvil_serving-0.4.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2 @@
1
+ """anvil-serving — right-size and run a local LLM serving tier from your coding-agent usage."""
2
+ __version__ = "0.4.0"
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/env python3
2
+ """aggregate_usage.py - roll up ALL Claude Code session logs into inference-sizing metrics.
3
+
4
+ Mirrors session-retro/session_stats.py field definitions (output/input/cache tokens,
5
+ tool_use, workflow <usage> fan-out) and adds the distributions that size a local
6
+ inference server: per-call context size, generation length, concurrency, throughput.
7
+
8
+ Stdlib only. Reads ~/.claude/projects/**/*.jsonl. Writes JSON to stdout (or --out).
9
+ """
10
+ import json
11
+ import os
12
+ import re
13
+ import sys
14
+ import glob
15
+ from collections import Counter, defaultdict
16
+ from datetime import datetime
17
+
18
+ PROJECTS = os.environ.get("ANVIL_CLAUDE_LOGS") or os.path.expanduser("~/.claude/projects")
19
+
20
def pct(sorted_vals, q):
    """Return the q-th percentile of an ascending-sorted sequence.

    The value is linearly interpolated between the two neighbouring ranks and
    then rounded to an ``int``.  An empty sequence gives 0.  ``q <= 0`` and
    ``q >= 100`` return the first / last element as-is (no rounding).
    """
    if not sorted_vals:
        return 0
    if q <= 0:
        return sorted_vals[0]
    last = len(sorted_vals) - 1
    if q >= 100:
        return sorted_vals[last]
    position = last * q / 100.0
    below = int(position)
    above = min(below + 1, last)
    weight = position - below
    blended = sorted_vals[below] * (1 - weight) + sorted_vals[above] * weight
    return int(round(blended))
28
+
29
def fiso(x):
    """Parse an ISO-8601 timestamp string; return None if that fails.

    Any "Z" in the text is rewritten to "+00:00" before it is handed to
    ``datetime.fromisoformat``.  Every exception (including a non-string
    argument) is swallowed and reported as None.
    """
    try:
        normalized = x.replace("Z", "+00:00")
        parsed = datetime.fromisoformat(normalized)
    except Exception:
        return None
    return parsed
32
+
33
def main():
    """Roll up every session log under ``PROJECTS`` and print sizing metrics.

    Each ``*.jsonl`` file found recursively under ``PROJECTS`` is read line by
    line; lines that are blank, not valid JSON, or not JSON objects are
    skipped.  Assistant records feed the token totals, the per-model mix, the
    per-call context/generation distributions, the per-minute/hour/day buckets
    and the tool/skill/agent counters.  User records whose text contains
    ``<usage>`` or ``<task-notification>`` are counted as workflow dispatches.

    Command line: ``--out PATH`` additionally writes the full result as JSON
    to PATH.  A compact summary is always printed to stdout.
    """
    out_path = None
    if "--out" in sys.argv:
        pos = sys.argv.index("--out") + 1
        if pos >= len(sys.argv):
            # A dangling --out used to surface as a bare IndexError traceback.
            sys.exit("--out requires a file path")
        out_path = sys.argv[pos]
    files = glob.glob(os.path.join(PROJECTS, "**", "*.jsonl"), recursive=True)

    # global accumulators
    tot = Counter()                          # out/inp/cc/cr/asst
    by_model_calls = Counter()
    by_model_out = Counter()
    tools = Counter()
    skills = Counter()
    agent_types = Counter()
    ctx_sizes = []                           # input+cc+cr per assistant call
    out_sizes = []                           # output_tokens per assistant call
    per_min_calls = Counter()                # ts[:16] -> calls
    per_min_sessions = defaultdict(set)      # ts[:16] -> {sessionId}
    per_hour_out = Counter()                 # ts[:13] -> output tokens
    per_day = defaultdict(lambda: [0, 0])    # day -> [out_tokens, calls]
    per_day_sessions = defaultdict(set)
    wf_count = 0
    wf_agent_counts = []                     # agent_count per workflow dispatch
    wf_subagent_tokens = 0
    sess_summ = []                           # (duration_h, asst_turns, out_tokens)

    def g(pat, txt):
        m = re.search(pat, txt, re.S)
        return m.group(1) if m else None

    for f in files:
        s_first = s_last = None
        s_asst = 0
        s_out = 0
        sid = None
        try:
            fh = open(f, encoding="utf-8", errors="replace")
        except Exception:
            continue
        with fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    d = json.loads(line)
                except Exception:
                    continue
                if not isinstance(d, dict):
                    # A bare list/number/string line has no .get(); skip it
                    # instead of aborting the whole roll-up.
                    continue
                if sid is None:
                    sid = d.get("sessionId")
                ts = d.get("timestamp")
                if not isinstance(ts, str):
                    ts = None                # only string timestamps can be sliced
                if ts:
                    s_first = s_first or ts
                    s_last = ts
                m = d.get("message")
                if not isinstance(m, dict):
                    continue
                t = d.get("type")
                if t == "assistant":
                    u = m.get("usage") or {}
                    it = u.get("input_tokens", 0) or 0
                    ot = u.get("output_tokens", 0) or 0
                    cc = u.get("cache_creation_input_tokens", 0) or 0
                    cr = u.get("cache_read_input_tokens", 0) or 0
                    tot["out"] += ot
                    tot["inp"] += it
                    tot["cc"] += cc
                    tot["cr"] += cr
                    tot["asst"] += 1
                    s_asst += 1
                    s_out += ot
                    model = m.get("model") or "unknown"
                    by_model_calls[model] += 1
                    by_model_out[model] += ot
                    ctx_sizes.append(it + cc + cr)
                    out_sizes.append(ot)
                    if ts:
                        mn, hr, day = ts[:16], ts[:13], ts[:10]
                        per_min_calls[mn] += 1
                        per_hour_out[hr] += ot
                        per_day[day][0] += ot
                        per_day[day][1] += 1
                        if sid:
                            per_min_sessions[mn].add(sid)
                            per_day_sessions[day].add(sid)
                    for c in (m.get("content") or []):
                        if isinstance(c, dict) and c.get("type") == "tool_use":
                            nm = c.get("name", "?")
                            tools[nm] += 1
                            inp = c.get("input")
                            if not isinstance(inp, dict):
                                inp = {}
                            if nm == "Skill":
                                skills[inp.get("skill", "?")] += 1
                            elif nm == "Agent":
                                agent_types[inp.get("subagent_type") or "general-purpose"] += 1
                elif t == "user":
                    c = m.get("content")
                    if isinstance(c, str):
                        txt = c
                    elif isinstance(c, list):
                        txt = " ".join(x.get("text") or "" for x in c
                                       if isinstance(x, dict) and x.get("type") == "text")
                    else:
                        txt = ""
                    if "<usage>" in txt or "<task-notification>" in txt:
                        wf_count += 1
                        ac = g(r"<agent_count>(\d+)", txt)
                        st = g(r"<subagent_tokens>(\d+)", txt)
                        if ac:
                            wf_agent_counts.append(int(ac))
                        if st:
                            wf_subagent_tokens += int(st)
        if s_asst:
            dur = 0.0
            if s_first and s_last:
                a, b = fiso(s_first), fiso(s_last)
                if a and b:
                    dur = round((b - a).total_seconds() / 3600, 3)
            sess_summ.append((dur, s_asst, s_out))

    ctx_sizes.sort()
    out_sizes.sort()
    durs = sorted(x[0] for x in sess_summ)
    turns = sorted(x[1] for x in sess_summ)
    # pct() rounds to an int, so feed it millihours and scale back; passing
    # hours directly would round every duration percentile to a whole hour.
    dur_millih = [int(x * 1000) for x in durs]

    # concurrency / throughput
    peak_calls_min = per_min_calls.most_common(1)[0] if per_min_calls else ("", 0)
    min_call_counts = sorted(per_min_calls.values())
    peak_sessions_min = max(((mn, len(s)) for mn, s in per_min_sessions.items()),
                            key=lambda x: x[1], default=("", 0))
    sess_per_min = sorted(len(s) for s in per_min_sessions.values())
    peak_hour = per_hour_out.most_common(1)[0] if per_hour_out else ("", 0)

    days = sorted(per_day.keys())
    daily = [{"day": d, "out": per_day[d][0], "calls": per_day[d][1],
              "sessions": len(per_day_sessions[d])} for d in days]
    busiest = max(daily, key=lambda x: x["out"], default=None)

    wf_agent_counts.sort()
    res = dict(
        window=dict(files=len(files), sessions_with_calls=len(sess_summ),
                    first_day=days[0] if days else None,
                    last_day=days[-1] if days else None,
                    active_days=len(days)),
        totals=dict(assistant_calls=tot["asst"], output_tokens=tot["out"],
                    fresh_input_tokens=tot["inp"],
                    cache_creation_tokens=tot["cc"], cache_read_tokens=tot["cr"],
                    total_processed=tot["out"] + tot["inp"] + tot["cc"] + tot["cr"]),
        model_mix=dict(calls=dict(by_model_calls.most_common()),
                       output_tokens=dict(by_model_out.most_common())),
        context_size_per_call=dict(
            n=len(ctx_sizes), p50=pct(ctx_sizes, 50), p90=pct(ctx_sizes, 90),
            p95=pct(ctx_sizes, 95), p99=pct(ctx_sizes, 99),
            max=ctx_sizes[-1] if ctx_sizes else 0,
            mean=int(sum(ctx_sizes) / len(ctx_sizes)) if ctx_sizes else 0),
        generation_per_call=dict(
            p50=pct(out_sizes, 50), p90=pct(out_sizes, 90), p95=pct(out_sizes, 95),
            p99=pct(out_sizes, 99), max=out_sizes[-1] if out_sizes else 0,
            mean=int(sum(out_sizes) / len(out_sizes)) if out_sizes else 0),
        concurrency=dict(
            peak_calls_per_min=dict(minute=peak_calls_min[0], calls=peak_calls_min[1]),
            p99_calls_per_min=pct(min_call_counts, 99),
            p95_calls_per_min=pct(min_call_counts, 95),
            peak_parallel_sessions=dict(minute=peak_sessions_min[0],
                                        sessions=peak_sessions_min[1]),
            p99_parallel_sessions=pct(sess_per_min, 99),
            p95_parallel_sessions=pct(sess_per_min, 95),
        ),
        throughput=dict(
            peak_output_tokens_per_hour=dict(hour=peak_hour[0], tokens=peak_hour[1]),
            peak_sustained_tok_per_s=round(peak_hour[1] / 3600, 1)),
        workflows=dict(
            dispatches=wf_count, subagent_tokens=wf_subagent_tokens,
            agent_count_p50=pct(wf_agent_counts, 50),
            agent_count_p95=pct(wf_agent_counts, 95),
            agent_count_max=wf_agent_counts[-1] if wf_agent_counts else 0,
            total_subagents=sum(wf_agent_counts)),
        sessions=dict(
            count=len(sess_summ),
            median_duration_h=pct(dur_millih, 50) / 1000.0 if durs else 0,
            p95_duration_h=pct(dur_millih, 95) / 1000.0 if durs else 0,
            max_duration_h=durs[-1] if durs else 0,
            median_asst_turns=pct(turns, 50),
            max_asst_turns=turns[-1] if turns else 0,
            sessions_per_active_day=round(len(sess_summ) / max(len(days), 1), 1)),
        tools=dict(tools.most_common(25)),
        skills=dict(skills.most_common(20)),
        agent_types=dict(agent_types.most_common(20)),
        busiest_day=busiest,
        daily=daily,
    )
    txt = json.dumps(res, indent=1)
    if out_path:
        # Context manager guarantees the report is flushed and closed.
        with open(out_path, "w", encoding="utf-8") as out_fh:
            out_fh.write(txt)
        print("WROTE", out_path, len(txt), "bytes")
    # always print a compact human summary
    T = res["totals"]
    C = res["context_size_per_call"]
    G = res["generation_per_call"]
    X = res["concurrency"]
    W = res["workflows"]
    print("=== SUMMARY ===")
    print("window:", res["window"])
    print("assistant_calls:", T["assistant_calls"], "| output:", T["output_tokens"],
          "| total_processed:", T["total_processed"])
    print("model_mix calls:", res["model_mix"]["calls"])
    print("ctx/call p50/p90/p95/p99/max:", C["p50"], C["p90"], C["p95"], C["p99"], C["max"])
    print("gen/call p50/p95/max:", G["p50"], G["p95"], G["max"])
    print("peak calls/min:", X["peak_calls_per_min"],
          "| peak parallel sessions:", X["peak_parallel_sessions"])
    print("throughput:", res["throughput"])
    print("workflows:", W)
    print("sessions:", res["sessions"])
    print("busiest_day:", res["busiest_day"])
191
+
192
+ if __name__ == "__main__":
193
+ main()
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env python3
2
+ """role_split.py - split Claude Code per-call context/generation by ROLE.
3
+
4
+ Splits assistant calls by isSidechain (subagent vs main orchestrator) and by model,
5
+ so we can size the LOCAL specialist context ceiling from the subagent distribution
6
+ rather than the orchestrator's long-context tail. Stdlib only.
7
+ """
8
+ import json
9
+ import os
10
+ import glob
11
+ import sys
12
+ from collections import defaultdict
13
+
14
+ PROJECTS = os.environ.get("ANVIL_CLAUDE_LOGS") or os.path.expanduser("~/.claude/projects")
15
+
16
def pct(v, q):
    """Interpolated q-th percentile of an already-sorted sequence, as an int.

    Returns 0 for an empty sequence.  ``q <= 0`` yields the first element and
    ``q >= 100`` the last one, both unrounded; anything in between is linearly
    interpolated between adjacent ranks and rounded.
    """
    if not v:
        return 0
    if q <= 0:
        return v[0]
    if q >= 100:
        return v[-1]
    top = len(v) - 1
    rank = top * q / 100.0
    lower = int(rank)
    upper = min(lower + 1, top)
    share = rank - lower
    return int(round(v[lower] * (1 - share) + v[upper] * share))
22
+
23
def summarize(ctx, out):
    """Summarise one group of calls.

    ``ctx`` holds per-call context sizes and ``out`` per-call generation
    lengths.  Both are sorted locally (the caller's lists are not modified)
    and reduced to percentile / max / mean figures via ``pct``.  Empty input
    yields zeros throughout.
    """
    ctx_sorted = sorted(ctx)
    gen_sorted = sorted(out)

    ctx_stats = {f"p{q}": pct(ctx_sorted, q) for q in (50, 90, 95, 99)}
    ctx_stats["max"] = ctx_sorted[-1] if ctx_sorted else 0
    ctx_stats["mean"] = int(sum(ctx_sorted) / len(ctx_sorted)) if ctx_sorted else 0

    gen_stats = {f"p{q}": pct(gen_sorted, q) for q in (50, 95)}
    gen_stats["max"] = gen_sorted[-1] if gen_sorted else 0

    return {"n": len(ctx_sorted), "ctx": ctx_stats, "gen": gen_stats}
29
+
30
def main():
    """Split assistant calls by role and by model, then report the statistics.

    Every ``*.jsonl`` file found recursively under ``PROJECTS`` is scanned.
    Only records whose ``type`` is ``"assistant"`` and whose ``message`` is a
    JSON object are counted; a truthy ``isSidechain`` marks the call as
    ``"subagent"``, anything else as ``"main"``.  Blank lines, invalid JSON and
    non-object JSON lines are skipped.

    Command line: ``--out PATH`` also writes the result as JSON to PATH.  A
    text report is always printed to stdout.
    """
    out_path = None
    if "--out" in sys.argv:
        pos = sys.argv.index("--out") + 1
        if pos >= len(sys.argv):
            # A dangling --out used to surface as a bare IndexError traceback.
            sys.exit("--out requires a file path")
        out_path = sys.argv[pos]
    files = glob.glob(os.path.join(PROJECTS, "**", "*.jsonl"), recursive=True)
    groups = defaultdict(lambda: ([], []))     # role  -> (ctx_list, out_list)
    by_model = defaultdict(lambda: ([], []))   # model -> (ctx_list, out_list)
    seen_sidechain_field = 0
    total = 0
    # buckets to show what share of SUBAGENT calls fit a given local context ceiling
    ceilings = [16384, 32768, 65536, 131072, 262144]
    sub_ctx_all = []
    for f in files:
        try:
            fh = open(f, encoding="utf-8", errors="replace")
        except Exception:
            continue
        with fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    d = json.loads(line)
                except Exception:
                    continue
                # Non-object JSON has no .get(); skip it rather than crash.
                if not isinstance(d, dict) or d.get("type") != "assistant":
                    continue
                m = d.get("message")
                if not isinstance(m, dict):
                    continue
                u = m.get("usage") or {}
                ctx = ((u.get("input_tokens", 0) or 0)
                       + (u.get("cache_creation_input_tokens", 0) or 0)
                       + (u.get("cache_read_input_tokens", 0) or 0))
                ot = u.get("output_tokens", 0) or 0
                total += 1
                sc = d.get("isSidechain")
                if sc is not None:
                    seen_sidechain_field += 1
                role = "subagent" if sc else "main"
                groups[role][0].append(ctx)
                groups[role][1].append(ot)
                model = m.get("model") or "unknown"
                by_model[model][0].append(ctx)
                by_model[model][1].append(ot)
                if role == "subagent":
                    sub_ctx_all.append(ctx)
    res = dict(
        total_assistant_calls=total,
        records_with_isSidechain_field=seen_sidechain_field,
        by_role={k: summarize(*v) for k, v in groups.items()},
        by_model={k: summarize(*v)
                  for k, v in sorted(by_model.items(), key=lambda x: -len(x[1][0]))},
    )
    # what context ceiling covers what fraction of SUBAGENT calls
    sub_ctx_all.sort()
    n = len(sub_ctx_all)
    cov = {}
    for c in ceilings:
        cov[c] = round(100.0 * sum(1 for x in sub_ctx_all if x <= c) / n, 1) if n else 0
    res["subagent_context_coverage_pct"] = cov
    txt = json.dumps(res, indent=1)
    if out_path:
        # Context manager guarantees the report is flushed and closed.
        with open(out_path, "w", encoding="utf-8") as out_fh:
            out_fh.write(txt)
        print("WROTE", out_path)
    print("=== ROLE SPLIT ===")
    print("total assistant calls:", total, "| with isSidechain field:", seen_sidechain_field)
    for role, v in res["by_role"].items():
        print(f"\n[{role}] n={v['n']:,}")
        print(" ctx p50/p90/p95/p99/max:", v["ctx"]["p50"], v["ctx"]["p90"],
              v["ctx"]["p95"], v["ctx"]["p99"], v["ctx"]["max"])
        print(" gen p50/p95/max:", v["gen"]["p50"], v["gen"]["p95"], v["gen"]["max"])
    print("\n[by model] (ctx p50/p95/max | gen p50)")
    for mdl, v in list(res["by_model"].items())[:8]:
        print(f" {mdl:28} n={v['n']:>7,} ctx {v['ctx']['p50']:>7,}/{v['ctx']['p95']:>7,}/{v['ctx']['max']:>7,} gen p50 {v['gen']['p50']}")
    print("\nSUBAGENT context coverage (share of subagent calls <= ceiling):")
    for c, p in res["subagent_context_coverage_pct"].items():
        print(f" <= {c:>7,} tok : {p}%")
89
+
90
+ if __name__=="__main__":
91
+ main()
anvil_serving/_sync.py ADDED
@@ -0,0 +1,359 @@
1
+ #!/usr/bin/env python3
2
+ """sync_model_cards.py - catalog local models: pull HF cards + local config, extract serving facts.
3
+
4
+ Scans HuggingFace caches and plain model dirs, downloads each model's README card,
5
+ reads local config.json / generation_config.json, extracts the fields that matter for
6
+ SERVING (format GGUF-vs-safetensors, quant, context, license, thinking-default,
7
+ recommended sampling, benchmarks, serving notes), and writes:
8
+ cards/<owner>__<repo>.md (raw card)
9
+ cards/<owner>__<repo>.json (extracted summary)
10
+ INDEX.md (master table)
11
+
12
+ Runs on the host (WSL python): sees /home/<user>/.cache + /mnt/c/... + the internet.
13
+ Stdlib only. Public models need no token; set HF_TOKEN env for gated ones.
14
+ """
15
+ import os
16
+ import re
17
+ import json
18
+ import glob
19
+ import time
20
+ import urllib.request
21
+ import urllib.error
22
+
23
+ HERE = os.environ.get("ANVIL_MODELS_OUT") or os.path.join(os.getcwd(), "model-library")
24
+ CARDS = os.path.join(HERE, "cards")
25
+ STATE = os.path.join(HERE, "_seen.json")
26
+ os.makedirs(CARDS, exist_ok=True)
27
+
28
+ # Scan roots: (path, kind). HF caches use models--owner--repo; "dir" = plain model folders.
29
def _auto_roots():
    """Build the ordered, de-duplicated list of ``(path, kind)`` scan roots.

    Sources, in order: ``ANVIL_HF_ROOTS`` (``os.pathsep``-separated, kind
    ``"hf"``), ``~/.cache/huggingface/hub``, ``%USERPROFILE%``'s HF cache, any
    ``/mnt/c/Users/*/.cache/huggingface/hub``, then ``ANVIL_MODEL_DIRS`` and
    ``/mnt/c/Users/*/models`` (kind ``"dir"``).  Only existing directories are
    kept; a path is registered once, under the first kind that claimed it.
    """
    roots, seen = [], set()

    def add(p, kind):
        # An empty entry (unset variable, trailing or doubled separator) would
        # normalise to "." and silently register the current directory.
        if not p:
            return
        p = os.path.normpath(p)
        if p not in seen and os.path.isdir(p):
            seen.add(p)
            roots.append((p, kind))

    for p in (os.environ.get("ANVIL_HF_ROOTS") or "").split(os.pathsep):
        add(p, "hf")
    add(os.path.expanduser("~/.cache/huggingface/hub"), "hf")
    up = os.environ.get("USERPROFILE")
    if up:
        add(os.path.join(up, ".cache", "huggingface", "hub"), "hf")
    for u in glob.glob("/mnt/c/Users/*/.cache/huggingface/hub"):
        add(u, "hf")
    for d in (os.environ.get("ANVIL_MODEL_DIRS") or "").split(os.pathsep):
        add(d, "dir")
    for u in glob.glob("/mnt/c/Users/*/models"):
        add(u, "dir")
    return roots
46
+ ROOTS = _auto_roots()
47
+
48
+ # --- sm_120 (Blackwell) SGLang loadability hazards -------------------------
49
+ # A safetensors model is normally SGLang-loadable, BUT some quant+arch combos
50
+ # load "successfully" then hang or return silent zeros on sm_120. Encode that
51
+ # judgment as a small table; add a row when a new case appears (don't abstract).
52
+ # Refs: FP8-MoE hang -> sglang#16816 ; NVFP4 GEMM silent zeros ->
53
+ # flashinfer#2577, vllm#24921.
54
+ SM120_HAZARDS = [
55
+ # (quant substring, requires_moe, caveat)
56
+ ("fp8", True, "FP8-MoE hangs post-load on sm_120 (sglang#16816)"),
57
+ ("nvfp4", False, "NVFP4 GEMM broken on sm_120: silent zeros (flashinfer#2577, vllm#24921)"),
58
+ ]
59
+
60
+ # Standard MoE config keys across families: num_experts (generic), num_local_experts
61
+ # (Mixtral / gpt-oss), n_routed_experts (DeepSeek-V3). Plus model_type/arch fallback.
62
+ _MOE_KEYS = ("num_experts", "num_local_experts", "n_routed_experts")
63
+ _MOE_SUBSTRINGS = ("moe", "mixtral", "deepseek", "qwen3_moe", "gpt_oss")
64
+
65
def _is_moe(cfg, txt):
    """Decide whether a model config describes a mixture-of-experts model.

    True when either ``cfg`` or ``txt`` (the nested text config) is a dict
    containing one of ``_MOE_KEYS``; otherwise falls back to looking for any
    of ``_MOE_SUBSTRINGS`` in the lower-cased ``model_type`` plus
    ``architectures`` of ``cfg``.
    """
    for candidate in (cfg, txt):
        if isinstance(candidate, dict) and any(key in candidate for key in _MOE_KEYS):
            return True
    arch_names = " ".join(str(name) for name in (cfg.get("architectures") or []))
    haystack = (str(cfg.get("model_type") or "") + " " + arch_names).lower()
    for needle in _MOE_SUBSTRINGS:
        if needle in haystack:
            return True
    return False
71
+
72
def _parse_quant(cfg):
    """Derive ``(qmethod, qbits, qsig)`` from ``cfg["quantization_config"]``.

    * ``qmethod`` - ``quant_method`` if set, else ``format``, else None.
    * ``qbits``   - ``num_bits`` of the (last) ``config_groups[*].weights``
      entry that declares one; when no group declares a width, the top-level
      ``bits`` key (the layout AWQ/GPTQ configs use) is reported instead.
    * ``qsig``    - lower-cased "<method> <format>" signature used for hazard
      matching.  compressed-tensors reports quant_method='compressed-tensors',
      format='float-quantized' and hides the real precision in the weight
      spec, so a float 8-bit weight spec appends " fp8" and a float 4-bit one
      appends " nvfp4".  MXFP4 (gpt-oss) is servable via triton, so it is
      never folded to nvfp4.

    A missing, null or non-dict ``quantization_config`` is treated as empty,
    as are malformed ``config_groups`` entries.
    """
    quant = cfg.get("quantization_config")
    if not isinstance(quant, dict):      # JSON null / stray scalar -> no quant info
        quant = {}
    qmethod = quant.get("quant_method") or quant.get("format")

    qbits, qtype = None, ""
    groups = quant.get("config_groups")
    if isinstance(groups, dict):
        for group in groups.values():
            weights = group.get("weights") if isinstance(group, dict) else None
            if isinstance(weights, dict) and weights.get("num_bits"):
                qbits = weights["num_bits"]
                qtype = str(weights.get("type") or "").lower()

    qsig = (str(qmethod or "") + " " + str(quant.get("format") or "")).lower()
    if qbits == 8 and qtype == "float":
        qsig += " fp8"
    elif qbits == 4 and "mxfp4" not in qsig and ("float" in qtype or "float" in qsig):
        qsig += " nvfp4"

    # Fallback applied only AFTER qsig is built, so the hazard signature is
    # derived from the weight spec alone.
    if qbits is None:
        qbits = quant.get("bits") or None
    return qmethod, qbits, qsig
96
+
97
def sm120_hazard(fmt, qsig, cfg, txt):
    """Return the sm_120 caveat for a hazardous safetensors quant+arch, else None.

    Non-safetensors formats are never flagged here.  For safetensors, the
    first ``SM120_HAZARDS`` row whose substring occurs in the lower-cased
    ``qsig`` wins, provided the row either does not require MoE or the model
    is MoE according to ``_is_moe(cfg, txt)``.
    """
    if fmt != "safetensors":
        return None
    signature = (qsig or "").lower()
    moe = _is_moe(cfg, txt)
    matching = (
        caveat
        for needle, requires_moe, caveat in SM120_HAZARDS
        if needle in signature and (moe or not requires_moe)
    )
    return next(matching, None)
107
+
108
def dir_size_bytes(p):
    """Total size in bytes of the files under a model directory.

    If ``p`` has a ``blobs`` subdirectory only that subtree is walked,
    otherwise all of ``p`` is.  Each file path is resolved with ``realpath``
    and every resolved path is counted once, so several symlinks to the same
    file do not inflate the total.  Files that raise ``OSError`` are skipped.
    """
    blob_dir = os.path.join(p, "blobs")
    walk_root = blob_dir if os.path.isdir(blob_dir) else p
    counted = set()
    total = 0
    for folder, _subdirs, names in os.walk(walk_root):
        for name in names:
            try:
                real = os.path.realpath(os.path.join(folder, name))
                if real not in counted:
                    counted.add(real)
                    total += os.path.getsize(real)
            except OSError:
                pass
    return total
122
+
123
def dir_size_gb(p):
    """Size of a model dir in decimal gigabytes (1e9 bytes), rounded to 0.1.

    Intended for display; call ``dir_size_bytes`` when an exact total is needed.
    """
    total_bytes = dir_size_bytes(p)
    return round(total_bytes / 1e9, 1)
126
+
127
def newest_snapshot(model_dir):
    """Pick the best ``snapshots/*`` directory of an HF cache entry.

    Candidates are ranked by (has ``config.json``, number of entries, mtime),
    highest first, so a snapshot that actually carries a config / more files
    beats a newer but emptier one.  Returns None when there is no snapshot
    directory at all.
    """
    pattern = os.path.join(model_dir, "snapshots", "*")
    candidates = [path for path in glob.glob(pattern) if os.path.isdir(path)]
    if not candidates:
        return None

    def rank(path):
        has_config = os.path.exists(os.path.join(path, "config.json"))
        return (has_config, len(os.listdir(path)), os.path.getmtime(path))

    # max() keeps the first of equally ranked candidates, matching a stable
    # descending sort followed by taking element 0.
    return max(candidates, key=rank)
134
+
135
def load_json(p):
    """Read a JSON file and return its top-level object as a dict.

    Best-effort: an unreadable file, invalid JSON, or a top-level value that
    is not an object all produce ``{}`` so callers can use ``.get`` safely.
    """
    try:
        with open(p, encoding="utf-8") as handle:
            parsed = json.load(handle)
    except Exception:
        return {}
    if isinstance(parsed, dict):
        return parsed
    return {}
140
+
141
def detect_format(d):
    """Classify a model directory by the weight files found anywhere below it.

    Returns "GGUF" if any ``*.gguf`` file exists (GGUF wins when both kinds
    are present), "safetensors" if any ``*.safetensors`` file exists, and "?"
    otherwise.
    """
    for extension, label in (("gguf", "GGUF"), ("safetensors", "safetensors")):
        pattern = os.path.join(d, "**", "*." + extension)
        if glob.glob(pattern, recursive=True):
            return label
    return "?"
147
+
148
def fetch_card(owner, repo):
    """Download ``README.md`` of ``owner/repo`` from huggingface.co.

    Sends a bearer token when the ``HF_TOKEN`` environment variable is set.
    Returns the card text (decoded as UTF-8 with replacement), or None on any
    failure - network error, HTTP error, timeout (30 s).
    """
    headers = {"User-Agent": "model-card-sync"}
    token = os.environ.get("HF_TOKEN")
    if token:
        headers["Authorization"] = "Bearer " + token
    request = urllib.request.Request(
        f"https://huggingface.co/{owner}/{repo}/raw/main/README.md",
        headers=headers,
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            raw = response.read()
            return raw.decode("utf-8", "replace")
    except Exception:
        return None
158
+
159
def parse_frontmatter(card):
    """Extract simple ``key: value`` pairs from a card's leading ``---`` block.

    Only single-line scalar entries are picked up (keys made of letters and
    underscores, lower-cased in the result); list items and keys without an
    inline value are ignored.  Returns ``{}`` when the card is empty, does not
    start with ``---`` or has no closing ``---`` line.
    """
    if not card or not card.startswith("---"):
        return {}
    closing = card.find("\n---", 3)
    if closing <= 0:
        return {}
    fields = {}
    for raw_line in card[3:closing].splitlines():
        hit = re.match(r"\s*([A-Za-z_]+):\s*(.+?)\s*$", raw_line)
        if hit is None:
            continue
        key, value = hit.group(1), hit.group(2)
        fields[key.lower()] = value.strip()
    return fields
168
+
169
def extract_from_card(card):
    """Heuristically pull serving-relevant facts out of a model card's text.

    Returns ``{}`` for an empty card.  Otherwise the result always contains
    ``thinking_default``, ``mentions_sglang`` and ``mentions_vllm`` (bools) and,
    when the corresponding pattern is found, ``sampling_hint``,
    ``context_hint`` (largest number followed by "tokens"/"context") and
    ``benchmarks`` (benchmark name -> first number following it).
    """
    facts = {}
    if not card:
        return facts
    lowered = card.lower()

    announces_default = "thinking mode" in lowered and "default" in lowered
    facts["thinking_default"] = announces_default or "enable_thinking" in lowered

    # first recommended sampling line
    sampling = re.search(
        r"temperature\s*=\s*([0-9.]+).{0,80}?top_p\s*=\s*([0-9.]+)", card, re.S | re.I)
    if sampling is not None:
        facts["sampling_hint"] = f"temp={sampling.group(1)}, top_p={sampling.group(2)}"

    # context length mentions
    mentions = re.findall(r"([0-9][0-9,]{3,})\s*(?:tokens|context)", card, re.I)
    if mentions:
        facts["context_hint"] = max(int(token.replace(",", "")) for token in mentions)

    # benchmark hints
    bench_names = ("SWE-bench Verified", "SWE-Bench Verified", "Terminal-Bench",
                   "BFCL", "TAU2", "LiveCodeBench")
    for bench in bench_names:
        score = re.search(re.escape(bench) + r"[^0-9]{0,30}([0-9]{1,3}\.?[0-9]?)", card)
        if score is not None:
            facts.setdefault("benchmarks", {})[bench] = score.group(1)

    facts["mentions_sglang"] = "sglang" in lowered
    facts["mentions_vllm"] = "vllm" in lowered
    return facts
187
+
188
def summarize(owner, repo, model_dir, kind):
    """Build, persist and return the serving summary for one local model.

    Reads ``config.json`` / ``generation_config.json`` from the chosen snapshot
    (``newest_snapshot`` for ``kind == "hf"``, the directory itself otherwise),
    derives format / quantisation / sm_120 caveat, and - when ``owner`` is
    known - downloads the model card and merges license, pipeline tag, base
    model and the ``extract_from_card`` hints.

    Side effects: writes ``<owner>__<repo>.md`` (only when a card was fetched)
    and ``<owner>__<repo>.json`` (``<repo>.json`` without owner) into ``CARDS``.

    Returns the summary dict that was written to the ``.json`` file.
    """
    snap = newest_snapshot(model_dir) if kind == "hf" else model_dir
    cfg = load_json(os.path.join(snap, "config.json")) if snap else {}
    gen = load_json(os.path.join(snap, "generation_config.json")) if snap else {}
    text_cfg = cfg.get("text_config")
    txt = text_cfg if isinstance(text_cfg, dict) else {}
    qmethod, qbits, qsig = _parse_quant(cfg)
    fmt = detect_format(snap) if snap else "?"
    sm120 = sm120_hazard(fmt, qsig, cfg, txt)
    s = dict(
        id=f"{owner}/{repo}" if owner else repo,
        owner=owner, repo=repo, local_path=model_dir, source=kind,
        size_gb=dir_size_gb(model_dir),
        format=fmt,
        architectures=cfg.get("architectures"),
        model_type=cfg.get("model_type"),
        context=cfg.get("max_position_embeddings") or txt.get("max_position_embeddings"),
        quant=qmethod, quant_bits=qbits,
        gen_sampling={k: gen[k] for k in ("temperature", "top_p", "top_k") if k in gen},
        sglang_loadable=(fmt == "safetensors" and not sm120),
        sm120_caveat=sm120,
        synced=time.strftime("%Y-%m-%d %H:%M"),
    )
    stem = f"{owner}__{repo}" if owner else repo
    card = fetch_card(owner, repo) if owner else None
    if card:
        # Context managers make sure each file is flushed and closed.
        with open(os.path.join(CARDS, f"{stem}.md"), "w", encoding="utf-8") as card_fh:
            card_fh.write(card)
        fm = parse_frontmatter(card)
        s["license"] = fm.get("license")
        s["pipeline_tag"] = fm.get("pipeline_tag")
        s["base_model"] = fm.get("base_model")
        s.update(extract_from_card(card))
        s["card_saved"] = f"cards/{owner}__{repo}.md"
    else:
        s["card_saved"] = None
    with open(os.path.join(CARDS, f"{stem}.json"), "w", encoding="utf-8") as summary_fh:
        summary_fh.write(json.dumps(s, indent=1))
    return s
224
+
225
def discover():
    """List every model folder under ``ROOTS`` as ``(owner, repo, path, kind)``.

    * ``"hf"`` roots: each ``models--<owner>--<repo>`` entry; names with fewer
      than three ``--``-separated parts are skipped.
    * ``"dir"`` roots: each sub-directory.  ``owner/repo`` is taken from
      ``config.json``'s ``_name_or_path`` only when that value looks like a
      repo id (exactly two non-empty segments, neither "." nor "..");
      otherwise owner is None and repo is the directory name.
    """
    found = []
    for root, kind in ROOTS:
        if not os.path.isdir(root):
            continue
        if kind == "hf":
            for d in glob.glob(os.path.join(root, "models--*")):
                parts = os.path.basename(d).split("--")
                if len(parts) < 3:
                    continue
                found.append((parts[1], "--".join(parts[2:]), d, "hf"))
        else:
            for d in glob.glob(os.path.join(root, "*")):
                if not os.path.isdir(d):
                    continue
                cfg = load_json(os.path.join(d, "config.json"))
                owner, repo = None, os.path.basename(d)
                nm = cfg.get("_name_or_path")
                if isinstance(nm, str):
                    # _name_or_path is often a training-time filesystem path
                    # ("/workspace/out/ckpt").  Splitting that blindly gives an
                    # empty owner or a repo containing "/", which later breaks
                    # the card filename - so accept only a clean "owner/repo".
                    pieces = nm.split("/")
                    if (len(pieces) == 2 and all(pieces)
                            and not any(x in (".", "..") for x in pieces)):
                        owner, repo = pieces
                found.append((owner, repo, d, "dir"))
    return found
244
+
245
def is_real_model_row(r):
    """SHARED 3-part real-model gate (used by write_index AND cache_prune).

    A summary row counts as a real, servable model only when ALL hold:
      1. its owner is not ``unslothai`` (mirror repos),
      2. it is not a tiny (<0.2 GB) directory of unknown format,
      3. there is POSITIVE evidence of a model: a ``model_type`` or a known
         weights format (safetensors / GGUF).
    Gate 3 is the decisive one: without it a dataset, partial download or
    tokenizer-only directory of >=0.2 GB would pass and, in cache_prune,
    become a DELETION candidate.  Keep all three parts together here so the
    index and the prune planner cannot drift apart.
    """
    fmt = r.get("format")
    if r.get("owner") == "unslothai":
        return False
    is_tiny = (r.get("size_gb") or 0) < 0.2
    if is_tiny and fmt == "?":
        return False
    if r.get("model_type"):
        return True
    return fmt in ("safetensors", "GGUF")
260
+
261
def write_index(rows):
    """Render the master ``INDEX.md`` table into ``HERE``.

    Rows failing ``is_real_model_row`` are dropped; the rest are listed with
    safetensors models first and, within each group, largest first.  The
    caller's list is not modified.
    """
    rows = [r for r in rows if is_real_model_row(r)]
    # .get keeps a row without a "format" key from aborting the whole index.
    rows.sort(key=lambda r: (r.get("format") != "safetensors", -(r.get("size_gb") or 0)))
    L = ["# Model Library — Index", "",
         f"_Auto-generated by `sync_model_cards.py` — {time.strftime('%Y-%m-%d %H:%M')}. "
         f"{len(rows)} models. Cards in `cards/`._", "",
         "| Model | Format | SGLang? | Params/size | Context | Quant | License | Thinking | Coding bench | Local |",
         "|---|---|---|---|---|---|---|---|---|---|"]
    for r in rows:
        # First coding benchmark found, labelled by the text before the first "-".
        bench = ""
        b = r.get("benchmarks") or {}
        for k in ("SWE-bench Verified", "SWE-Bench Verified", "Terminal-Bench"):
            if k in b:
                bench = f"{k.split('-')[0]} {b[k]}"
                break
        # Config value wins over the card hint; whole numbers >= 1024 render as "<n>K".
        ctx = r.get("context") or r.get("context_hint") or ""
        ctx = f"{int(ctx)//1024}K" if str(ctx).isdigit() and int(ctx) >= 1024 else (ctx or "")
        L.append("| {id} | {fmt} | {ok} | {sz} GB | {ctx} | {q} | {lic} | {th} | {bn} | {src} |".format(
            id=r["id"], fmt=r.get("format", "?"),
            ok=("✅" if r.get("sglang_loadable")
                else "⚠️ sm_120" if r.get("sm120_caveat")
                else "❌ (llama.cpp)" if r.get("format") == "GGUF"
                else "?"),
            sz=r.get("size_gb", "?"), ctx=ctx,
            q=(f"{r.get('quant') or ''} {r.get('quant_bits') or ''}".strip() or "—"),
            lic=r.get("license") or "—",
            th=("yes" if r.get("thinking_default") else "—"),
            bn=bench or "—", src=("win" if "/mnt/c" in r.get("local_path", "") else "wsl")))
    # Context manager guarantees the index is flushed and closed.
    with open(os.path.join(HERE, "INDEX.md"), "w", encoding="utf-8") as index_fh:
        index_fh.write("\n".join(L) + "\n")
288
+
289
def main():
    """Discover local models, summarise each, write the index and report new ones.

    A failure while summarising one model is printed and does not stop the
    run.  After ``INDEX.md`` is written, the ids of models with a known
    weights format are compared against the ids stored in ``STATE`` by the
    previous run; the difference is printed as ``NEW_MODELS`` and ``STATE`` is
    rewritten with the current ids.
    """
    models = discover()
    print(f"discovered {len(models)} model folders")
    rows = []
    for owner, repo, d, kind in models:
        try:
            s = summarize(owner, repo, d, kind)
            rows.append(s)
            print(f" [{s.get('format'):>11}] {s['id']} {s.get('size_gb')}GB card={'y' if s.get('card_saved') else 'n'}")
        except Exception as e:
            print(f" ERROR {owner}/{repo}: {e}")
    write_index(rows)
    # new-model detection (vs last run) for the Cowork analysis task
    real_ids = [r["id"] for r in rows if r.get("format") in ("safetensors", "GGUF")]
    # A hand-edited or corrupted state file may hold null / non-list "ids" or
    # unhashable entries; treat anything unusable as "no prior ids".
    prior_ids = load_json(STATE).get("ids")
    prior = {i for i in prior_ids if isinstance(i, str)} if isinstance(prior_ids, list) else set()
    new_ids = [i for i in real_ids if i not in prior]
    # Context manager guarantees the state file is flushed and closed.
    with open(STATE, "w", encoding="utf-8") as state_fh:
        json.dump({"ids": real_ids, "updated": time.strftime("%Y-%m-%d %H:%M")},
                  state_fh, indent=1)
    print(f"wrote INDEX.md + {len(rows)} summaries to {HERE}")
    print("NEW_MODELS: " + (", ".join(new_ids) if new_ids else "none"))
309
+
310
def _loadable(cfg, fmt="safetensors"):
    """Reproduce summarize()'s loadability verdict for a bare config (self-check aid).

    Returns ``(clean, caveat)``: ``caveat`` is what ``sm120_hazard`` reports for
    the quant signature derived by ``_parse_quant``; ``clean`` is true only for
    safetensors with no caveat.
    """
    qsig = _parse_quant(cfg)[2]
    text_cfg = cfg.get("text_config") or {}
    caveat = sm120_hazard(fmt, qsig, cfg, text_cfg)
    clean = fmt == "safetensors" and not caveat
    return clean, caveat
315
+
316
def _selfcheck():
    """Assert the sm_120 hazard rules on a fixed set of representative configs.

    Raises AssertionError (with the case description) on the first mismatch.
    """
    # --- REAL derivation path: compressed-tensors hides FP8-ness in the weight
    # spec (num_bits=8/type=float), NOT in the method string. ---
    ct_fp8 = {"quant_method": "compressed-tensors", "format": "float-quantized",
              "config_groups": {"group_0": {"weights": {"num_bits": 8, "type": "float"}}}}

    # (config, expected clean-loadable?, assertion message)
    derivation_cases = [
        # FP8 + MoE (num_local_experts, Mixtral/gpt-oss style) must NOT be clean-loadable.
        ({"quantization_config": ct_fp8, "num_local_experts": 128,
          "architectures": ["Qwen3MoeForCausalLM"]},
         False, "compressed-tensors FP8 on MoE must caveat (not clean)"),
        # FP8 + MoE via DeepSeek-V3 key n_routed_experts must also caveat.
        ({"quantization_config": ct_fp8, "n_routed_experts": 256},
         False, "compressed-tensors FP8 on DeepSeek MoE must caveat"),
        # NVFP4 via weight spec (num_bits=4/type=float) caveats on any arch.
        ({"quantization_config": {"quant_method": "compressed-tensors",
          "config_groups": {"g": {"weights": {"num_bits": 4, "type": "float"}}}}},
         False, "compressed-tensors NVFP4 must caveat on any arch"),
        # gpt-oss MXFP4 is SERVABLE via triton -> must NOT be flagged.
        ({"quantization_config": {"quant_method": "mxfp4"},
          "model_type": "gpt_oss", "num_local_experts": 128},
         True, "gpt-oss MXFP4 is servable, must not be flagged"),
        # Dense AWQ (int4) stays clean.
        ({"quantization_config": {"quant_method": "awq",
          "config_groups": {"g": {"weights": {"num_bits": 4, "type": "int"}}}}},
         True, "dense AWQ (int4) stays clean-loadable"),
    ]
    for cfg, expect_clean, why in derivation_cases:
        ok, caveat = _loadable(cfg)
        if expect_clean:
            assert ok and caveat is None, why
        else:
            assert not ok and caveat, why

    # JSON null quantization_config must not crash.
    ok, _ = _loadable({"quantization_config": None})
    assert ok, "null quantization_config -> clean, no AttributeError"

    # --- Legacy string-folding path still holds (native-fp8 repos etc.). ---
    # (format, quant signature, config, expect a caveat?, assertion message)
    signature_cases = [
        ("safetensors", "fp8", {"num_experts": 128}, True, "FP8-MoE should caveat"),
        ("safetensors", "nvfp4", {}, True, "NVFP4 caveats on any arch"),
        ("safetensors", "fp8", {}, False, "dense FP8 is fine"),
        ("safetensors", "awq", {"num_experts": 128}, False, "AWQ-MoE clean"),
        ("safetensors", "awq", {}, False, "dense AWQ clean"),
        ("GGUF", "fp8", {"num_experts": 128}, False, "GGUF handled elsewhere"),
    ]
    for fmt, qsig, cfg, flagged, why in signature_cases:
        verdict = sm120_hazard(fmt, qsig, cfg, {})
        if flagged:
            assert verdict, why
        else:
            assert verdict is None, why
356
+
357
+ if __name__ == "__main__":
358
+ _selfcheck()
359
+ main()