quantfit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantfit/__init__.py +6 -0
- quantfit/backends/__init__.py +1 -0
- quantfit/backends/compressed_tensors.py +104 -0
- quantfit/backends/gguf.py +121 -0
- quantfit/cli.py +103 -0
- quantfit/fit.py +130 -0
- quantfit/gpufit.py +97 -0
- quantfit/quantize.py +98 -0
- quantfit/registry.py +119 -0
- quantfit/safety/__init__.py +12 -0
- quantfit/safety/verify.py +253 -0
- quantfit/spec.py +32 -0
- quantfit/verify.py +46 -0
- quantfit-0.1.0.dist-info/METADATA +121 -0
- quantfit-0.1.0.dist-info/RECORD +19 -0
- quantfit-0.1.0.dist-info/WHEEL +5 -0
- quantfit-0.1.0.dist-info/entry_points.txt +2 -0
- quantfit-0.1.0.dist-info/licenses/LICENSE +154 -0
- quantfit-0.1.0.dist-info/top_level.txt +1 -0
quantfit/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Quantization backends. Each turns a (method, scheme) into a saved artifact."""
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""compressed-tensors backend (llm-compressor): the method × scheme matrix.
|
|
2
|
+
|
|
3
|
+
awq / gptq / autoround / smoothquant calibrate; fp8 / rtn do not. All emit
|
|
4
|
+
compressed-tensors (vLLM-loadable). For the calibrated algorithms the only
|
|
5
|
+
cross-method difference is the algorithm itself — same calibration, same format
|
|
6
|
+
— so the methods are comparable, not confounded.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from quantfit.spec import QuantSpec
|
|
13
|
+
|
|
14
|
+
_TARGETS = ["Linear"]
_IGNORE = ["lm_head"]
_SMOOTHING_STRENGTH = 0.8  # SmoothQuant migration strength (standard default)


def build_recipe(method: str, scheme: str):
    """Return the llm-compressor recipe for ``method`` applied with ``scheme``.

    The result is a single modifier, except for ``"smoothquant"`` which yields
    a two-element list: a SmoothQuant modifier followed by a GPTQ modifier.
    Every quantizing modifier receives the module-level targets/ignore lists.

    Raises:
        ValueError: if ``method`` has no compressed-tensors recipe.
    """
    from llmcompressor.modifiers.awq import AWQModifier
    from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

    shared = {"targets": _TARGETS, "ignore": _IGNORE}

    def _quantizer(modifier_cls):
        # All quantizing modifiers are built from the same scheme and layer selection.
        return modifier_cls(scheme=scheme, **shared)

    if method == "smoothquant":
        smoother = SmoothQuantModifier(smoothing_strength=_SMOOTHING_STRENGTH)
        return [smoother, _quantizer(GPTQModifier)]

    single_modifier = {
        "awq": AWQModifier,
        "gptq": GPTQModifier,
        "fp8": QuantizationModifier,
        "rtn": QuantizationModifier,
    }
    if method in single_modifier:
        return _quantizer(single_modifier[method])
    raise ValueError(f"no compressed-tensors recipe for method {method!r}")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def calib_dataset(spec: QuantSpec, tokenizer, token: str | None = None):
    """Packed fixed-length calibration: concatenate text, chunk into seq-len blocks.

    Uniform-length sequences are required by AutoRound (it stacks samples and
    rejects ragged lengths) and are the standard GPTQ/AWQ calibration form, so one
    packed dataset serves every calibrated method. Deterministic under the spec.

    Args:
        spec: supplies the dataset name/config/split, the shuffle seed, the
            number of samples and the sequence length.
        tokenizer: callable whose result exposes ``input_ids`` for a text.
        token: optional Hub token forwarded to ``load_dataset``.

    Returns:
        A ``datasets.Dataset`` with ``input_ids`` and an all-ones
        ``attention_mask``; exactly ``spec.calib_samples`` rows, each of
        ``spec.calib_seqlen`` tokens.

    Raises:
        ValueError: if ``spec.calib_seqlen`` is not positive, or the dataset
            does not contain enough tokens to fill every requested block.
    """
    from datasets import Dataset, load_dataset

    seqlen = spec.calib_seqlen
    if seqlen <= 0:
        raise ValueError(f"calib_seqlen must be positive, got {seqlen}")

    ds = load_dataset(
        spec.calib_dataset, spec.calib_config, split=spec.calib_split, token=token
    )
    ds = ds.filter(lambda ex: ex["text"] is not None and ex["text"].strip() != "")
    ds = ds.shuffle(seed=spec.seed)

    needed = spec.calib_samples * seqlen
    buf: list[int] = []
    for ex in ds:
        buf.extend(tokenizer(ex["text"]).input_ids)
        if len(buf) >= needed:
            break

    # Slicing past the end of `buf` would silently produce short or empty
    # blocks, i.e. exactly the ragged input this function exists to prevent.
    if len(buf) < needed:
        raise ValueError(
            f"calibration data too small: {spec.calib_dataset}/{spec.calib_config} "
            f"[{spec.calib_split}] yields {len(buf)} tokens but "
            f"{spec.calib_samples} x {seqlen} = {needed} are required"
        )

    blocks = [buf[i : i + seqlen] for i in range(0, needed, seqlen)]
    return Dataset.from_dict(
        {"input_ids": blocks, "attention_mask": [[1] * seqlen for _ in blocks]}
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def quantize_ct(
    model_id: str,
    method: str,
    scheme: str,
    out_dir: str,
    spec: QuantSpec,
    needs_calibration: bool,
    token: str | None = None,
    offload: bool = False,
) -> Path:
    """Quantize ``model_id`` with llm-compressor's ``oneshot`` and save to ``out_dir``.

    The output directory is created if needed. Calibration arguments (dataset,
    sample count, max sequence length) are only passed when
    ``needs_calibration`` is true; ``offload`` adds the CPU offload device.
    Returns the output directory as a ``Path``.
    """
    from llmcompressor import oneshot
    from transformers import AutoTokenizer

    target = Path(out_dir)
    target.mkdir(parents=True, exist_ok=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

    oneshot_args: dict = {
        "model": model_id,
        "tokenizer": tokenizer,
        "recipe": build_recipe(method, scheme),
        "output_dir": str(target),
    }
    if offload:
        # Quantize layer-by-layer with the model held on CPU -> fits any size.
        oneshot_args["sequential_offload_device"] = "cpu"
    if needs_calibration:
        oneshot_args["dataset"] = calib_dataset(spec, tokenizer, token=token)
        oneshot_args["num_calibration_samples"] = spec.calib_samples
        oneshot_args["max_seq_length"] = spec.calib_seqlen

    oneshot(**oneshot_args)
    return target
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""GGUF backend (llama.cpp).
|
|
2
|
+
|
|
3
|
+
Produces GGUF k-quants (Q4_K_M, etc.) for the Ollama / llama.cpp / LM Studio
|
|
4
|
+
world. Quantization is CPU-only. Two tools are provisioned into a cache on first
|
|
5
|
+
use (or located via QUANTFIT_LLAMACPP pointing at a llama.cpp checkout):
|
|
6
|
+
- the prebuilt `llama-quantize` binary (from the pinned llama.cpp release zip)
|
|
7
|
+
- the repo's `convert_hf_to_gguf.py` (HF safetensors -> GGUF f16); it imports a
|
|
8
|
+
sibling `conversion` package, so a shallow clone of the repo is required.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import platform
|
|
14
|
+
import subprocess
|
|
15
|
+
import sys
|
|
16
|
+
import urllib.request
|
|
17
|
+
import zipfile
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
LLAMACPP_TAG = "b9817"  # pinned release; binary + convert script must match
# GGUF quantization type names. Not referenced elsewhere in this module;
# presumably used by the registry to validate `qtype` — TODO confirm.
GGUF_TYPES = (
    "Q2_K", "Q3_K_S", "Q3_K_M", "Q4_K_S", "Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0", "IQ4_XS",
)

# Repository that is shallow-cloned at LLAMACPP_TAG to obtain the convert script.
_REPO = "https://github.com/ggml-org/llama.cpp"
# Base URL for release assets; "/<tag>/<asset file name>" is appended on download.
_RELEASES = "https://github.com/ggml-org/llama.cpp/releases/download"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _cache_dir() -> Path:
|
|
30
|
+
root = os.environ.get("QUANTFIT_CACHE")
|
|
31
|
+
d = Path(root) if root else Path.home() / ".cache" / "quantfit"
|
|
32
|
+
d.mkdir(parents=True, exist_ok=True)
|
|
33
|
+
return d
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _exe_name() -> str:
|
|
37
|
+
return "llama-quantize.exe" if platform.system() == "Windows" else "llama-quantize"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _binary_asset() -> str:
    """Return the file name of the prebuilt llama.cpp release zip for this host.

    Only the x64 CPU builds for Windows and Linux are wired.

    Raises:
        RuntimeError: on any other OS, or on a Linux host whose CPU
            architecture is known not to be x64 (the x64 binary could not be
            executed there). The message tells the user to point
            QUANTFIT_LLAMACPP at their own llama.cpp install.
    """
    sysname = platform.system()
    if sysname == "Windows":
        return f"llama-{LLAMACPP_TAG}-bin-win-cpu-x64.zip"

    host = sysname
    if sysname == "Linux":
        machine = platform.machine()
        # An empty string means the architecture could not be determined; in
        # that case keep the previous behaviour and attempt the x64 build.
        if not machine or machine.lower() in ("x86_64", "amd64", "x64"):
            return f"llama-{LLAMACPP_TAG}-bin-ubuntu-x64.zip"
        host = f"{sysname} ({machine})"

    raise RuntimeError(
        f"no prebuilt llama.cpp binary wired for {host}; install llama.cpp and "
        f"set QUANTFIT_LLAMACPP to its directory"
    )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _first_match(root: Path, name: str) -> Path | None:
|
|
53
|
+
if (root / name).exists():
|
|
54
|
+
return root / name
|
|
55
|
+
return next(iter(root.rglob(name)), None)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def llama_quantize_bin() -> Path:
    """Locate (env) or download+extract the llama-quantize binary.

    Lookup order:
      1. the directory named by ``QUANTFIT_LLAMACPP`` (searched recursively);
      2. the per-tag binary directory inside the quantfit cache;
      3. otherwise the pinned release zip is downloaded into the cache and
         extracted into that binary directory.

    Returns:
        Path to the executable.

    Raises:
        RuntimeError: if no prebuilt asset is wired for this platform, the
            cached archive is not a valid zip, or the archive does not contain
            the executable.
    """
    exe = _exe_name()
    env = os.environ.get("QUANTFIT_LLAMACPP")
    if env and (hit := _first_match(Path(env), exe)):
        return hit

    cache = _cache_dir()
    bindir = cache / f"llamacpp-bin-{LLAMACPP_TAG}"
    hit = _first_match(bindir, exe)
    if hit is None:
        asset = _binary_asset()
        zip_path = cache / asset
        if not zip_path.exists():
            # Download under a temporary name and rename on success, so an
            # interrupted download never leaves a truncated zip that later
            # runs would mistake for a complete one.
            partial = zip_path.with_name(zip_path.name + ".part")
            try:
                urllib.request.urlretrieve(
                    f"{_RELEASES}/{LLAMACPP_TAG}/{asset}", partial
                )
                partial.replace(zip_path)
            finally:
                partial.unlink(missing_ok=True)
        bindir.mkdir(parents=True, exist_ok=True)
        try:
            with zipfile.ZipFile(zip_path) as z:
                z.extractall(bindir)
        except zipfile.BadZipFile as exc:
            # Drop the corrupt archive so the next run downloads a fresh copy.
            zip_path.unlink(missing_ok=True)
            raise RuntimeError(
                f"{asset} in {cache} is not a valid zip archive; it was removed, "
                f"re-run to download it again"
            ) from exc
        hit = _first_match(bindir, exe)
        if hit is None:
            raise RuntimeError(f"{exe} not found inside {asset}")

    # zipfile does not restore Unix permission bits on extraction, so the
    # binary comes out without its execute bit; add it before handing it out.
    if os.name != "nt" and not os.access(hit, os.X_OK):
        hit.chmod(hit.stat().st_mode | 0o111)
    return hit
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def convert_script() -> Path:
    """Locate (env) or shallow-clone the repo's convert_hf_to_gguf.py.

    The script is taken from the ``QUANTFIT_LLAMACPP`` directory when it sits
    directly inside it, else from a per-tag clone in the quantfit cache, which
    is created with ``git clone --depth 1 --branch <tag>`` when absent.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` fails.
        RuntimeError: if the script is missing after a successful clone.
    """
    import shutil

    name = "convert_hf_to_gguf.py"
    env = os.environ.get("QUANTFIT_LLAMACPP")
    if env and (Path(env) / name).exists():
        return Path(env) / name

    repo = _cache_dir() / f"llama.cpp-{LLAMACPP_TAG}"
    script = repo / name
    if script.exists():
        return script

    if repo.exists():
        # Leftover from an interrupted clone: git refuses to clone into a
        # non-empty directory, so without this every later run would fail.
        shutil.rmtree(repo, ignore_errors=True)
    subprocess.run(
        ["git", "clone", "--depth", "1", "--branch", LLAMACPP_TAG, _REPO, str(repo)],
        check=True,
    )
    if not script.exists():
        raise RuntimeError(f"{name} missing after cloning {_REPO}@{LLAMACPP_TAG}")
    return script
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def quantize_gguf(model_id: str, qtype: str, out_dir: str, token: str | None = None) -> Path:
    """HF model -> GGUF f16 -> quantized GGUF (CPU-only).

    Steps: resolve the llama.cpp tools, download the model snapshot, run the
    convert script to write ``model.f16.gguf`` in ``out_dir``, then run
    ``llama-quantize`` to write ``model.<qtype>.gguf`` next to it.

    Args:
        model_id: Hub model id passed to ``snapshot_download``.
        qtype: quantization type, passed to ``llama-quantize`` and used in the
            output file name.
        out_dir: output directory (created if missing).
        token: optional Hub token.

    Returns:
        The output directory.

    Raises:
        subprocess.CalledProcessError: if conversion or quantization fails.
    """
    from huggingface_hub import snapshot_download

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    quant_bin = llama_quantize_bin()
    convert = convert_script()
    model_dir = snapshot_download(model_id, token=token)

    f16 = out / "model.f16.gguf"
    final = out / f"model.{qtype}.gguf"
    # Force UTF-8 in the child processes regardless of the console encoding.
    env = dict(os.environ, PYTHONUTF8="1", PYTHONIOENCODING="utf-8")

    try:
        subprocess.run(
            [sys.executable, str(convert), model_dir, "--outtype", "f16", "--outfile", str(f16)],
            check=True, env=env,
        )
        subprocess.run([str(quant_bin), str(f16), str(final), qtype], check=True, env=env)
    finally:
        # Drop the large f16 intermediate even when a step fails, so a failed
        # run does not leave it behind in the output directory.
        f16.unlink(missing_ok=True)
    return out
|
quantfit/cli.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""quantfit CLI — `check`, `list`, `verify`, `verify-safety`, `quantize`."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from quantfit.registry import METHODS
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _force_utf8_stdio() -> None:
|
|
11
|
+
# llm-compressor / gptqmodel loggers emit unicode; a Windows cp1252 console
|
|
12
|
+
# otherwise crashes mid-run with UnicodeEncodeError.
|
|
13
|
+
for stream in (sys.stdout, sys.stderr):
|
|
14
|
+
try:
|
|
15
|
+
stream.reconfigure(encoding="utf-8") # type: ignore[union-attr]
|
|
16
|
+
except (AttributeError, ValueError):
|
|
17
|
+
pass
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Build the ``quantfit`` argument parser.

    A subcommand is mandatory and is stored in ``args.cmd``. Subcommands, in
    registration order: ``check``, ``list``, ``verify``, ``verify-safety``,
    ``quantize``.
    """
    parser = argparse.ArgumentParser(
        prog="quantfit", description="Quantize an LLM if it fits your GPU."
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    # check: capacity verdict for a Hub model.
    check_cmd = commands.add_parser("check", help="will this model fit your GPU?")
    check_cmd.add_argument("--model", required=True, help="HF model id")

    # list: no arguments of its own.
    commands.add_parser("list", help="list supported methods + schemes")

    # verify: smoke test of a quantized artifact.
    verify_cmd = commands.add_parser(
        "verify", help="smoke-load a quantized artifact + generate"
    )
    verify_cmd.add_argument(
        "--model", required=True, help="path to a quantized output dir or .gguf"
    )

    # verify-safety: compares the fp16 baseline against the quantized artifact.
    safety_cmd = commands.add_parser(
        "verify-safety", help="refusal preservation: fp16 baseline vs quantized"
    )
    safety_cmd.add_argument("--fp16", required=True, help="HF id of the fp16 baseline")
    safety_cmd.add_argument("--quant", required=True, help="path to the quantized artifact")
    safety_cmd.add_argument("--max-new-tokens", type=int, default=64)

    # quantize: the main job.
    quantize_cmd = commands.add_parser("quantize", help="quantize a model")
    quantize_cmd.add_argument("--model", required=True, help="HF model id (the FP16 base)")
    quantize_cmd.add_argument("--method", required=True, choices=tuple(METHODS))
    quantize_cmd.add_argument(
        "--scheme", default=None, help="override the method's default scheme"
    )
    quantize_cmd.add_argument("--out", required=True, help="output directory")
    quantize_cmd.add_argument(
        "--push", default=None, help="HF repo id to upload the result to"
    )
    quantize_cmd.add_argument(
        "--private", action="store_true", help="push as a private repo"
    )
    quantize_cmd.add_argument(
        "--offload", action="store_true", help="quantize on CPU (fits any size, slower)"
    )
    quantize_cmd.add_argument(
        "--no-check", action="store_true", help="skip the GPU pre-flight"
    )
    return parser
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the quantfit CLI and return the process exit code.

    Exit codes: 0 on success, 2 when the verdict is negative (model does not
    fit, verification failed, safety check not clean, quantization refused or
    the method/scheme combination is unsupported). Backend modules are
    imported lazily inside the branch that needs them.
    """
    _force_utf8_stdio()
    args = _build_parser().parse_args(argv)
    command = args.cmd

    if command == "check":
        from quantfit.fit import plan

        capacity = plan(args.model)
        print(capacity.reason())
        return 0 if capacity.fits else 2

    if command == "list":
        from quantfit.registry import catalog

        print(catalog())
        return 0

    if command == "verify":
        from quantfit.verify import verify

        passed, message = verify(args.model)
        prefix = "PASS: " if passed else "FAIL: "
        print(prefix + message)
        return 0 if passed else 2

    if command == "verify-safety":
        from quantfit.safety.verify import verify_safety

        tax = verify_safety(args.fp16, args.quant, max_new_tokens=args.max_new_tokens)
        print(tax.summary())  # aggregates only — never echoes raw probe prompts/completions
        return 0 if tax.clean else 2

    if command == "quantize":
        from quantfit.quantize import CannotQuantize, push, quantize
        from quantfit.registry import UnsupportedCombo

        try:
            out = quantize(
                args.model,
                args.method,
                args.out,
                scheme=args.scheme,
                run_check=not args.no_check,
                offload=args.offload,
            )
        except (CannotQuantize, UnsupportedCombo) as exc:
            # Expected refusals: print the reason instead of a traceback.
            print(exc)
            return 2
        print(f"quantized -> {out}")
        if args.push:
            print(f"pushed -> {push(str(out), args.push, private=args.private)}")
        return 0

    return 1  # unreachable: subparser is required


if __name__ == "__main__":
    raise SystemExit(main())
|
quantfit/fit.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Capacity decision: in-GPU / CPU-offload / refuse.
|
|
2
|
+
|
|
3
|
+
Three resources gate a quantization job:
|
|
4
|
+
- disk : weights must be downloaded (unless cached) + the output written. Needed
|
|
5
|
+
in BOTH gpu and offload modes, so it's a precondition.
|
|
6
|
+
- VRAM : enough -> quantize in-GPU (fast).
|
|
7
|
+
- RAM : not enough VRAM but enough RAM -> hold the model on CPU and stream
|
|
8
|
+
layers to the GPU (offload; slower; fits any size).
|
|
9
|
+
Refuse only when none of the above can be satisfied, and always name the actual
|
|
10
|
+
limiting resource.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import shutil
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import psutil
|
|
19
|
+
|
|
20
|
+
from quantfit.gpufit import (
|
|
21
|
+
CALIB_OVERHEAD_FACTOR,
|
|
22
|
+
HEADROOM_BYTES,
|
|
23
|
+
estimate_fp16_bytes,
|
|
24
|
+
gpu_free_bytes,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Offload holds the model in CPU RAM and moves one layer at a time to the GPU.
OFFLOAD_RAM_FACTOR = 1.15
# Quantized output is smaller than fp16; reserve this fraction (covers 8-bit, the
# largest common output; 4-bit is ~half this).
OUTPUT_DISK_FACTOR = 0.6
_GIB = 1024**3

MODE_GPU = "gpu"
MODE_OFFLOAD = "offload"
MODE_REFUSE = "refuse"

LIMIT_NONE = ""
LIMIT_DISK = "disk"
LIMIT_MACHINE = "machine"


@dataclass(frozen=True)
class CapacityPlan:
    """Outcome of the capacity decision, with the measurements behind it.

    ``mode`` is one of the ``MODE_*`` constants; ``limit`` is one of the
    ``LIMIT_*`` constants and names the limiting resource of a refusal.
    All sizes are in bytes.
    """

    model_id: str
    fp16_bytes: int  # estimated size of the model's weights
    gpu_free: int  # free VRAM
    ram_available: int  # available system RAM
    disk_free: int  # free disk space at the output location
    disk_need: int  # disk space required (download + output)
    mode: str
    limit: str

    @property
    def fits(self) -> bool:
        """True unless the plan is a refusal."""
        refused = self.mode == MODE_REFUSE
        return not refused

    @property
    def offload(self) -> bool:
        """True when the plan is to quantize in offload mode."""
        return self.mode == MODE_OFFLOAD

    def reason(self) -> str:
        """Return the one-line, human-readable explanation of the plan."""

        def gib(num_bytes: int) -> str:
            # Sizes are reported in GiB with one decimal place.
            return f"{num_bytes / _GIB:.1f}"

        model = self.model_id
        weights = gib(self.fp16_bytes)
        vram = gib(self.gpu_free)
        ram = gib(self.ram_available)

        if self.mode == MODE_GPU:
            return f"OK (in-GPU): {model} ~{weights} GB, {vram} GB VRAM free."
        if self.mode == MODE_OFFLOAD:
            return (
                f"OK (offload): {model} ~{weights} GB won't fit {vram} GB VRAM"
                f" — quantizing via CPU ({ram} GB RAM). Slower."
            )
        if self.limit == LIMIT_DISK:
            return (
                f"CAN'T QUANTIZE: {model} needs ~{gib(self.disk_need)} GB free disk"
                f" (download + output) but only {gib(self.disk_free)} GB is free."
            )
        return (
            f"CAN'T QUANTIZE: {model} ~{weights} GB needs more than {vram} GB VRAM"
            f" and {ram} GB RAM. Use a bigger machine."
        )
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _existing_parent(path: str) -> str:
|
|
88
|
+
p = Path(path).resolve()
|
|
89
|
+
while not p.exists():
|
|
90
|
+
p = p.parent
|
|
91
|
+
return str(p)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _cached_weight_bytes(model_id: str) -> int:
    """Bytes of this model's safetensors already in the HF cache (0 if absent).

    Every ``*.safetensors`` file under the model's ``snapshots`` directory is
    considered, but each underlying file is counted once: the Hub cache links
    snapshot entries to shared blobs, so the same shard can appear under
    several revisions. Files that cannot be stat'ed are skipped.
    """
    from huggingface_hub.constants import HF_HUB_CACHE

    snap = Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--")) / "snapshots"
    if not snap.exists():
        return 0

    sizes: dict[object, int] = {}
    for f in snap.rglob("*.safetensors"):
        try:
            st = f.stat()  # follows the symlink to the blob
        except OSError:
            continue
        # Identify the blob by (device, inode); fall back to the path itself
        # on filesystems that report no inode number.
        key = (st.st_dev, st.st_ino) if st.st_ino else f
        sizes[key] = st.st_size
    return sum(sizes.values())
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def plan(model_id: str, out_dir: str = ".", token: str | None = None) -> CapacityPlan:
    """Measure the machine and choose gpu / offload / refuse for ``model_id``.

    Disk is checked first (it is needed in every mode), then VRAM, then RAM.
    Free disk is measured at the nearest existing ancestor of ``out_dir``.
    """
    weights = estimate_fp16_bytes(model_id, token=token)
    vram_free = gpu_free_bytes()
    ram_free = int(psutil.virtual_memory().available)
    disk_free = shutil.disk_usage(_existing_parent(out_dir)).free

    # Weights already in the Hub cache do not have to be downloaded again.
    to_download = max(0, weights - _cached_weight_bytes(model_id))
    disk_need = to_download + int(weights * OUTPUT_DISK_FACTOR)
    vram_need = int(weights * CALIB_OVERHEAD_FACTOR) + HEADROOM_BYTES
    ram_need = int(weights * OFFLOAD_RAM_FACTOR) + HEADROOM_BYTES

    def _decide() -> tuple[str, str]:
        if disk_free < disk_need:
            return MODE_REFUSE, LIMIT_DISK
        if vram_need <= vram_free:
            return MODE_GPU, LIMIT_NONE
        if ram_need <= ram_free:
            return MODE_OFFLOAD, LIMIT_NONE
        return MODE_REFUSE, LIMIT_MACHINE

    mode, limit = _decide()
    return CapacityPlan(
        model_id, weights, vram_free, ram_free, disk_free, disk_need, mode, limit
    )
|
quantfit/gpufit.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""GPU capacity pre-flight.
|
|
2
|
+
|
|
3
|
+
The whole point of the tool: decide whether a model can be quantized in-GPU on
|
|
4
|
+
*this* machine BEFORE downloading weights or starting a multi-minute job. The
|
|
5
|
+
estimate is the FP16 footprint of the released weights (read from the Hub file
|
|
6
|
+
metadata, no download) times a calibration-overhead factor, plus fixed headroom,
|
|
7
|
+
compared against free VRAM. Errs toward refusal — a clear "can't quantize" beats
|
|
8
|
+
an OOM crash 20 minutes in.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
from huggingface_hub import HfApi
|
|
15
|
+
|
|
16
|
+
# In-GPU PTQ (GPTQ/AWQ) holds the FP16 model plus per-layer Hessian/activation
|
|
17
|
+
# buffers; observed peak runs ~1.25x the released FP16 size. Headroom covers the
|
|
18
|
+
# CUDA context, calibration-batch activations, and allocator fragmentation.
|
|
19
|
+
CALIB_OVERHEAD_FACTOR = 1.25
|
|
20
|
+
HEADROOM_BYTES = 2 * 1024**3
|
|
21
|
+
_GIB = 1024**3
|
|
22
|
+
# Prefer safetensors; fall back to .bin only if no safetensors shards exist
|
|
23
|
+
# (summing both would double-count repos that ship both formats).
|
|
24
|
+
_WEIGHT_SUFFIXES = (".safetensors", ".bin")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
|
|
28
|
+
class FitReport:
|
|
29
|
+
model_id: str
|
|
30
|
+
fp16_bytes: int
|
|
31
|
+
required_bytes: int
|
|
32
|
+
free_bytes: int
|
|
33
|
+
fits: bool
|
|
34
|
+
|
|
35
|
+
@property
|
|
36
|
+
def fp16_gib(self) -> float:
|
|
37
|
+
return self.fp16_bytes / _GIB
|
|
38
|
+
|
|
39
|
+
@property
|
|
40
|
+
def required_gib(self) -> float:
|
|
41
|
+
return self.required_bytes / _GIB
|
|
42
|
+
|
|
43
|
+
@property
|
|
44
|
+
def free_gib(self) -> float:
|
|
45
|
+
return self.free_bytes / _GIB
|
|
46
|
+
|
|
47
|
+
def reason(self) -> str:
|
|
48
|
+
if self.fits:
|
|
49
|
+
return (
|
|
50
|
+
f"OK: {self.model_id} is ~{self.fp16_gib:.1f} GB FP16, needs "
|
|
51
|
+
f"~{self.required_gib:.1f} GB to quantize, {self.free_gib:.1f} GB free."
|
|
52
|
+
)
|
|
53
|
+
return (
|
|
54
|
+
f"CAN'T QUANTIZE: {self.model_id} needs ~{self.required_gib:.1f} GB "
|
|
55
|
+
f"in-GPU but only {self.free_gib:.1f} GB is free. Use a bigger GPU "
|
|
56
|
+
f"or a smaller model."
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def estimate_fp16_bytes(model_id: str, token: str | None = None) -> int:
    """Sum the released weight-file sizes from Hub metadata (no weight download).

    Sizes are totalled per suffix; the total for the first suffix in
    ``_WEIGHT_SUFFIXES`` that has any bytes is returned, so ``.bin`` files are
    only used when no ``.safetensors`` sizes were found.

    Raises:
        ValueError: if the metadata lists no weight file with a known size.
    """
    info = HfApi().model_info(model_id, files_metadata=True, token=token)
    by_suffix: dict[str, int] = {}
    # `siblings` can be None in the Hub response; treat that as "no files" so
    # the caller gets the explanatory ValueError below, not a TypeError.
    for f in info.siblings or ():
        if not f.size:
            continue
        for suffix in _WEIGHT_SUFFIXES:
            if f.rfilename.endswith(suffix):
                by_suffix[suffix] = by_suffix.get(suffix, 0) + f.size
                break
    for suffix in _WEIGHT_SUFFIXES:
        if by_suffix.get(suffix):
            return by_suffix[suffix]
    raise ValueError(
        f"{model_id}: no weight-file sizes found via Hub metadata; cannot "
        "estimate footprint (model may be gated without access, or unavailable)."
    )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def gpu_free_bytes() -> int:
    """Return the free VRAM, in bytes, reported by ``torch.cuda.mem_get_info()``.

    Raises:
        RuntimeError: if torch reports that CUDA is not available.
    """
    import torch

    if torch.cuda.is_available():
        free_bytes, _ = torch.cuda.mem_get_info()
        return int(free_bytes)
    raise RuntimeError("no CUDA GPU visible; quantfit needs a GPU to quantize.")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def check_fit(
    model_id: str,
    token: str | None = None,
    overhead: float = CALIB_OVERHEAD_FACTOR,
    headroom_bytes: int = HEADROOM_BYTES,
) -> FitReport:
    """Compare the estimated in-GPU footprint of ``model_id`` with free VRAM.

    The requirement is ``int(fp16_size * overhead) + headroom_bytes``; the
    model fits when that does not exceed the free VRAM.
    """
    weights = estimate_fp16_bytes(model_id, token=token)
    needed = int(weights * overhead) + headroom_bytes
    available = gpu_free_bytes()
    return FitReport(
        model_id=model_id,
        fp16_bytes=weights,
        required_bytes=needed,
        free_bytes=available,
        fits=needed <= available,
    )
|
quantfit/quantize.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Dispatcher: validate the request, GPU pre-flight, route to a backend, card it."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from quantfit.fit import MODE_REFUSE, plan
|
|
7
|
+
from quantfit.registry import BACKEND_CT, BACKEND_GGUF, resolve
|
|
8
|
+
from quantfit.spec import DEFAULT_SPEC, QuantSpec
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CannotQuantize(RuntimeError):
    """Raised when the capacity pre-flight refuses the job.

    `quantize` raises it with the plan's `reason()` text when the plan's mode
    is refuse, which covers both the disk limit and the VRAM/RAM limit.
    """
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def quantize(
    model_id: str,
    method: str,
    out_dir: str,
    scheme: str | None = None,
    spec: QuantSpec = DEFAULT_SPEC,
    token: str | None = None,
    run_check: bool = True,
    offload: bool = False,
) -> Path:
    """Quantize ``model_id`` with ``method`` into ``out_dir`` and write a model card.

    The method/scheme pair is resolved first. For the compressed-tensors
    backend the capacity plan runs unless ``run_check`` is false: a refusal
    raises ``CannotQuantize`` and an offload plan turns ``offload`` on. The
    GGUF backend skips the plan.

    Raises:
        CannotQuantize: the capacity plan refused the job.
        NotImplementedError: the resolved method names an unknown backend.
    """
    m, resolved_scheme = resolve(method, scheme)
    backend = m.backend

    # The 3-tier capacity plan applies to GPU quantization (compressed-tensors).
    # GGUF quantization is CPU-only, so it skips the VRAM/offload plan.
    if run_check and backend == BACKEND_CT:
        cap = plan(model_id, out_dir, token=token)
        if cap.mode == MODE_REFUSE:
            raise CannotQuantize(cap.reason())
        offload = True if cap.offload else offload

    if backend == BACKEND_CT:
        from quantfit.backends.compressed_tensors import quantize_ct

        out = quantize_ct(
            model_id,
            m.name,
            resolved_scheme,
            out_dir,
            spec,
            m.needs_calibration,
            token=token,
            offload=offload,
        )
    elif backend == BACKEND_GGUF:
        from quantfit.backends.gguf import quantize_gguf

        out = quantize_gguf(model_id, resolved_scheme, out_dir, token=token)
    else:
        raise NotImplementedError(f"backend {m.backend!r} is not wired yet")

    _write_card(Path(out), model_id, m.name, resolved_scheme, spec)
    return Path(out)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _write_card(out: Path, model_id: str, method: str, scheme: str, spec: QuantSpec) -> None:
    """Write the model card to ``out / "README.md"`` (UTF-8).

    The ``gguf`` method gets a short card; every other method gets the
    compressed-tensors card, which also records the calibration settings and
    the spec fingerprint taken from ``spec``.
    """
    if method == "gguf":
        # GGUF card: front matter, title and a one-paragraph description.
        card = f"""---
base_model: {model_id}
tags: [quantized, gguf, {scheme.lower()}, llama.cpp, quantfit]
---

# {out.name}

GGUF {scheme} quantization of `{model_id}`, produced with
[quantfit](https://github.com/Sahil170595/quantfit).

Loads in llama.cpp / Ollama / LM Studio. k-quant, no calibration (no imatrix).
"""
    else:
        # compressed-tensors card. The trailing backslash in the calibration
        # bullet joins it with the following line in the rendered text.
        card = f"""---
base_model: {model_id}
tags: [quantized, {method}, {scheme.lower()}, compressed-tensors, quantfit]
---

# {out.name}

{method.upper()} quantization ({scheme}) of `{model_id}`, produced with
[quantfit](https://github.com/Sahil170595/quantfit).

## Provenance
- method: {method}, scheme: {scheme}, group_size {spec.group_size}
- calibration: {spec.calib_dataset}/{spec.calib_config} [{spec.calib_split}], \
{spec.calib_samples} samples, seq-len {spec.calib_seqlen}, seed {spec.seed}
- spec fingerprint: `{spec.fingerprint()}`

Loads in vLLM via the compressed-tensors backend.
"""
    (out / "README.md").write_text(card, encoding="utf-8")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def push(out_dir: str, repo_id: str, token: str | None = None, private: bool = False) -> str:
    """Upload the folder ``out_dir`` to the Hub model repo ``repo_id``.

    The repo is created first (an existing repo is accepted); ``private`` is
    passed to that creation call. Returns the repo's huggingface.co URL.
    """
    from huggingface_hub import HfApi

    hub = HfApi(token=token)
    repo_kind = "model"
    hub.create_repo(repo_id, exist_ok=True, private=private, repo_type=repo_kind)
    hub.upload_folder(folder_path=str(out_dir), repo_id=repo_id, repo_type=repo_kind)
    return f"https://huggingface.co/{repo_id}"
|