quantfit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quantfit/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """quantfit — quantize an LLM if it fits your GPU."""
2
+ from quantfit.gpufit import FitReport, check_fit
3
+ from quantfit.spec import DEFAULT_SPEC, QuantSpec
4
+
5
+ __version__ = "0.1.0"
6
+ __all__ = ["FitReport", "check_fit", "QuantSpec", "DEFAULT_SPEC", "__version__"]
@@ -0,0 +1 @@
1
+ """Quantization backends. Each turns a (method, scheme) into a saved artifact."""
@@ -0,0 +1,104 @@
1
+ """compressed-tensors backend (llm-compressor): the method × scheme matrix.
2
+
3
+ awq / gptq / autoround / smoothquant calibrate; fp8 / rtn do not. All emit
4
+ compressed-tensors (vLLM-loadable). For the calibrated algorithms the only
5
+ cross-method difference is the algorithm itself — same calibration, same format
6
+ — so the methods are comparable, not confounded.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+
12
+ from quantfit.spec import QuantSpec
13
+
14
+ _TARGETS = ["Linear"]
15
+ _IGNORE = ["lm_head"]
16
+ _SMOOTHING_STRENGTH = 0.8 # SmoothQuant migration strength (standard default)
17
+
18
+
19
def build_recipe(method: str, scheme: str):
    """Return the llm-compressor recipe for `method` applied with `scheme`.

    The result is a single modifier, except for "smoothquant", which yields a
    two-element list (SmoothQuant followed by GPTQ). Every quantizing modifier
    targets `_TARGETS` and ignores `_IGNORE`.

    Raises:
        ValueError: if `method` is not one of the handled method names.
    """
    from llmcompressor.modifiers.awq import AWQModifier
    from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

    shared = {"targets": _TARGETS, "ignore": _IGNORE}

    if method == "awq":
        recipe = AWQModifier(scheme=scheme, **shared)
    elif method == "gptq":
        recipe = GPTQModifier(scheme=scheme, **shared)
    elif method == "smoothquant":
        # Smooth the activations first, then quantize the weights with GPTQ.
        smoother = SmoothQuantModifier(smoothing_strength=_SMOOTHING_STRENGTH)
        quantizer = GPTQModifier(scheme=scheme, **shared)
        recipe = [smoother, quantizer]
    elif method == "fp8" or method == "rtn":
        recipe = QuantizationModifier(scheme=scheme, **shared)
    else:
        raise ValueError(f"no compressed-tensors recipe for method {method!r}")
    return recipe
38
+
39
+
40
def calib_dataset(spec: QuantSpec, tokenizer, token: str | None = None):
    """Build a packed, fixed-length calibration dataset.

    Loads `spec.calib_dataset`, drops rows whose "text" is missing or blank,
    shuffles with `spec.seed`, tokenizes rows in order into one token stream and
    cuts that stream into blocks of exactly `spec.calib_seqlen` tokens.

    Uniform-length sequences are required by AutoRound (it stacks samples and
    rejects ragged lengths) and are the standard GPTQ/AWQ calibration form, so
    one packed dataset serves every calibrated method.

    Args:
        spec: calibration settings (dataset, config, split, samples, seq-len, seed).
        tokenizer: callable returning an object with `.input_ids` for a string.
        token: optional Hub token forwarded to `load_dataset`.

    Returns:
        A `datasets.Dataset` with "input_ids" and an all-ones "attention_mask".
        It holds at most `spec.calib_samples` blocks, and fewer if the source
        text runs out first.

    Raises:
        ValueError: if `calib_samples`/`calib_seqlen` are not positive, or the
            dataset does not contain enough tokens for even one full block.
    """
    from datasets import Dataset, load_dataset

    seqlen = spec.calib_seqlen
    if seqlen <= 0 or spec.calib_samples <= 0:
        raise ValueError(
            f"calib_samples and calib_seqlen must be positive, got "
            f"{spec.calib_samples} and {seqlen}"
        )

    ds = load_dataset(
        spec.calib_dataset, spec.calib_config, split=spec.calib_split, token=token
    )
    ds = ds.filter(lambda ex: ex["text"] is not None and ex["text"].strip() != "")
    ds = ds.shuffle(seed=spec.seed)

    needed = spec.calib_samples * seqlen
    buf: list[int] = []
    for ex in ds:
        buf.extend(tokenizer(ex["text"]).input_ids)
        if len(buf) >= needed:
            break

    # Only whole blocks are usable: slicing past the end of `buf` would yield a
    # short final block followed by empty ones, i.e. a ragged dataset.
    usable = min(needed, len(buf))
    usable -= usable % seqlen
    if usable == 0:
        raise ValueError(
            f"calibration dataset yielded {len(buf)} tokens; at least {seqlen} "
            f"are needed for one block"
        )

    blocks = [buf[i : i + seqlen] for i in range(0, usable, seqlen)]
    return Dataset.from_dict(
        {"input_ids": blocks, "attention_mask": [[1] * seqlen for _ in blocks]}
    )
68
+
69
+
70
def quantize_ct(
    model_id: str,
    method: str,
    scheme: str,
    out_dir: str,
    spec: QuantSpec,
    needs_calibration: bool,
    token: str | None = None,
    offload: bool = False,
) -> Path:
    """Quantize `model_id` with llm-compressor's `oneshot` and save into `out_dir`.

    The output directory is created if missing. Calibration arguments are only
    passed when `needs_calibration` is true; `offload` asks `oneshot` to use
    "cpu" as the sequential offload device. Returns the output directory.
    """
    from llmcompressor import oneshot
    from transformers import AutoTokenizer

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tok = AutoTokenizer.from_pretrained(model_id, token=token)

    oneshot_args: dict = {
        "model": model_id,
        "tokenizer": tok,
        "recipe": build_recipe(method, scheme),
        "output_dir": str(target_dir),
    }
    if offload:
        # Quantize layer-by-layer with the model held on CPU -> fits any size.
        oneshot_args["sequential_offload_device"] = "cpu"
    if needs_calibration:
        oneshot_args["dataset"] = calib_dataset(spec, tok, token=token)
        oneshot_args["num_calibration_samples"] = spec.calib_samples
        oneshot_args["max_seq_length"] = spec.calib_seqlen

    oneshot(**oneshot_args)
    return target_dir
@@ -0,0 +1,121 @@
1
+ """GGUF backend (llama.cpp).
2
+
3
+ Produces GGUF k-quants (Q4_K_M, etc.) for the Ollama / llama.cpp / LM Studio
4
+ world. Quantization is CPU-only. Two tools are provisioned into a cache on first
5
+ use (or located via QUANTFIT_LLAMACPP pointing at a llama.cpp checkout):
6
+ - the prebuilt `llama-quantize` binary (from the pinned llama.cpp release zip)
7
+ - the repo's `convert_hf_to_gguf.py` (HF safetensors -> GGUF f16); it imports a
8
+ sibling `conversion` package, so a shallow clone of the repo is required.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ import platform
14
+ import subprocess
15
+ import sys
16
+ import urllib.request
17
+ import zipfile
18
+ from pathlib import Path
19
+
20
+ LLAMACPP_TAG = "b9817" # pinned release; binary + convert script must match
21
+ GGUF_TYPES = (
22
+ "Q2_K", "Q3_K_S", "Q3_K_M", "Q4_K_S", "Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0", "IQ4_XS",
23
+ )
24
+
25
+ _REPO = "https://github.com/ggml-org/llama.cpp"
26
+ _RELEASES = "https://github.com/ggml-org/llama.cpp/releases/download"
27
+
28
+
29
+ def _cache_dir() -> Path:
30
+ root = os.environ.get("QUANTFIT_CACHE")
31
+ d = Path(root) if root else Path.home() / ".cache" / "quantfit"
32
+ d.mkdir(parents=True, exist_ok=True)
33
+ return d
34
+
35
+
36
+ def _exe_name() -> str:
37
+ return "llama-quantize.exe" if platform.system() == "Windows" else "llama-quantize"
38
+
39
+
40
def _binary_asset() -> str:
    """Return the release-asset file name of the prebuilt binary for this host.

    Only the Windows and Linux x64 CPU builds are wired up.

    Raises:
        RuntimeError: if the OS is neither Windows nor Linux, or the reported
            CPU architecture is known and is not x64. The message tells the
            user to install llama.cpp and point QUANTFIT_LLAMACPP at it.
    """
    sysname = platform.system()
    machine = platform.machine().lower()
    # Both assets below are x64 builds; on another architecture (e.g. aarch64)
    # the download would succeed but the binary could not be executed. An empty
    # string means the architecture is unknown, so it is not rejected.
    is_x64 = machine in ("", "x86_64", "amd64", "x64")
    if sysname in ("Windows", "Linux") and not is_x64:
        raise RuntimeError(
            f"no prebuilt llama.cpp binary wired for {sysname}/{machine}; install "
            f"llama.cpp and set QUANTFIT_LLAMACPP to its directory"
        )
    if sysname == "Windows":
        return f"llama-{LLAMACPP_TAG}-bin-win-cpu-x64.zip"
    if sysname == "Linux":
        return f"llama-{LLAMACPP_TAG}-bin-ubuntu-x64.zip"
    raise RuntimeError(
        f"no prebuilt llama.cpp binary wired for {sysname}; install llama.cpp and "
        f"set QUANTFIT_LLAMACPP to its directory"
    )
50
+
51
+
52
+ def _first_match(root: Path, name: str) -> Path | None:
53
+ if (root / name).exists():
54
+ return root / name
55
+ return next(iter(root.rglob(name)), None)
56
+
57
+
58
def _ensure_executable(path: Path) -> Path:
    """Add execute permission bits to `path` on non-Windows hosts and return it."""
    if os.name != "nt":
        mode = path.stat().st_mode
        if not mode & 0o111:
            path.chmod(mode | 0o111)
    return path


def llama_quantize_bin() -> Path:
    """Locate (env) or download+extract the llama-quantize binary.

    Lookup order:
      1. the directory named by QUANTFIT_LLAMACPP (searched recursively);
      2. the extracted release under the cache directory;
      3. download the pinned release zip into the cache and extract it.

    Returns:
        Path to the llama-quantize executable.

    Raises:
        RuntimeError: if no prebuilt binary exists for this platform, or the
            downloaded archive does not contain the executable.
        zipfile.BadZipFile: if the cached archive is corrupt (it is deleted
            first so the next call downloads it again).
    """
    exe = _exe_name()
    env = os.environ.get("QUANTFIT_LLAMACPP")
    if env and (hit := _first_match(Path(env), exe)):
        return hit

    bindir = _cache_dir() / f"llamacpp-bin-{LLAMACPP_TAG}"
    if hit := _first_match(bindir, exe):
        return _ensure_executable(hit)

    asset = _binary_asset()
    zip_path = _cache_dir() / asset
    if not zip_path.exists():
        # Download to a side file and rename once complete, so an interrupted
        # download can never be mistaken for a finished archive later.
        partial = zip_path.with_name(zip_path.name + ".part")
        try:
            urllib.request.urlretrieve(f"{_RELEASES}/{LLAMACPP_TAG}/{asset}", partial)
            partial.replace(zip_path)
        finally:
            partial.unlink(missing_ok=True)

    bindir.mkdir(parents=True, exist_ok=True)
    try:
        with zipfile.ZipFile(zip_path) as z:
            z.extractall(bindir)
    except zipfile.BadZipFile:
        # Do not keep a broken archive around; it would fail on every run.
        zip_path.unlink(missing_ok=True)
        raise

    if hit := _first_match(bindir, exe):
        # zipfile.extractall does not restore Unix permission bits.
        return _ensure_executable(hit)
    raise RuntimeError(f"{exe} not found inside {asset}")
79
+
80
+
81
def convert_script() -> Path:
    """Return the path to `convert_hf_to_gguf.py`, cloning llama.cpp if needed.

    The script is taken from the QUANTFIT_LLAMACPP directory when it exists
    there, then from the cached checkout; otherwise the pinned tag is
    shallow-cloned into the cache.

    Raises:
        subprocess.CalledProcessError: if `git clone` fails.
        RuntimeError: if the script is absent from the fresh clone.
    """
    script_name = "convert_hf_to_gguf.py"

    checkout = os.environ.get("QUANTFIT_LLAMACPP")
    if checkout:
        candidate = Path(checkout) / script_name
        if candidate.exists():
            return candidate

    clone_dir = _cache_dir() / f"llama.cpp-{LLAMACPP_TAG}"
    cached = clone_dir / script_name
    if cached.exists():
        return cached

    clone_cmd = [
        "git", "clone", "--depth", "1", "--branch", LLAMACPP_TAG, _REPO, str(clone_dir),
    ]
    subprocess.run(clone_cmd, check=True)
    if cached.exists():
        return cached
    raise RuntimeError(f"{script_name} missing after cloning {_REPO}@{LLAMACPP_TAG}")
98
+
99
+
100
def quantize_gguf(model_id: str, qtype: str, out_dir: str, token: str | None = None) -> Path:
    """HF model -> GGUF f16 -> quantized GGUF (CPU-only).

    Downloads the model snapshot, converts it to `model.f16.gguf` with the
    llama.cpp convert script, then runs llama-quantize to produce
    `model.<qtype>.gguf` in `out_dir`.

    Args:
        model_id: Hub model id to download.
        qtype: llama-quantize type name, passed through unchanged.
        out_dir: output directory (created if missing).
        token: optional Hub token for the snapshot download.

    Returns:
        The output directory.

    Raises:
        subprocess.CalledProcessError: if conversion or quantization fails.
    """
    from huggingface_hub import snapshot_download

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    quant_bin = llama_quantize_bin()
    convert = convert_script()
    model_dir = snapshot_download(model_id, token=token)

    f16 = out / "model.f16.gguf"
    final = out / f"model.{qtype}.gguf"
    # Force UTF-8 in the child processes regardless of the console code page.
    env = dict(os.environ, PYTHONUTF8="1", PYTHONIOENCODING="utf-8")

    try:
        subprocess.run(
            [sys.executable, str(convert), model_dir, "--outtype", "f16", "--outfile", str(f16)],
            check=True, env=env,
        )
        subprocess.run([str(quant_bin), str(f16), str(final), qtype], check=True, env=env)
    finally:
        # Drop the large f16 intermediate even when a step above fails, so a
        # failed run does not leave a full-size file behind.
        f16.unlink(missing_ok=True)
    return out
quantfit/cli.py ADDED
@@ -0,0 +1,103 @@
1
+ """quantfit CLI — `check`, `list`, `verify`, `verify-safety`, `quantize`."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+
7
+ from quantfit.registry import METHODS
8
+
9
+
10
+ def _force_utf8_stdio() -> None:
11
+ # llm-compressor / gptqmodel loggers emit unicode; a Windows cp1252 console
12
+ # otherwise crashes mid-run with UnicodeEncodeError.
13
+ for stream in (sys.stdout, sys.stderr):
14
+ try:
15
+ stream.reconfigure(encoding="utf-8") # type: ignore[union-attr]
16
+ except (AttributeError, ValueError):
17
+ pass
18
+
19
+
20
def _build_parser() -> argparse.ArgumentParser:
    """Create the `quantfit` argument parser with all of its subcommands.

    Subcommands: check, list, verify, verify-safety, quantize. The chosen one
    is stored on the namespace as `cmd` and is mandatory.
    """
    parser = argparse.ArgumentParser(
        prog="quantfit", description="Quantize an LLM if it fits your GPU."
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    check_cmd = commands.add_parser("check", help="will this model fit your GPU?")
    check_cmd.add_argument("--model", required=True, help="HF model id")

    commands.add_parser("list", help="list supported methods + schemes")

    verify_cmd = commands.add_parser(
        "verify", help="smoke-load a quantized artifact + generate"
    )
    verify_cmd.add_argument(
        "--model", required=True, help="path to a quantized output dir or .gguf"
    )

    safety_cmd = commands.add_parser(
        "verify-safety", help="refusal preservation: fp16 baseline vs quantized"
    )
    safety_cmd.add_argument("--fp16", required=True, help="HF id of the fp16 baseline")
    safety_cmd.add_argument("--quant", required=True, help="path to the quantized artifact")
    safety_cmd.add_argument("--max-new-tokens", type=int, default=64)

    quant_cmd = commands.add_parser("quantize", help="quantize a model")
    quant_cmd.add_argument("--model", required=True, help="HF model id (the FP16 base)")
    quant_cmd.add_argument("--method", required=True, choices=tuple(METHODS))
    quant_cmd.add_argument(
        "--scheme", default=None, help="override the method's default scheme"
    )
    quant_cmd.add_argument("--out", required=True, help="output directory")
    quant_cmd.add_argument(
        "--push", default=None, help="HF repo id to upload the result to"
    )
    quant_cmd.add_argument(
        "--private", action="store_true", help="push as a private repo"
    )
    quant_cmd.add_argument(
        "--offload", action="store_true", help="quantize on CPU (fits any size, slower)"
    )
    quant_cmd.add_argument(
        "--no-check", action="store_true", help="skip the GPU pre-flight"
    )
    return parser
49
+
50
+
51
def main(argv: list[str] | None = None) -> int:
    """Run the quantfit CLI and return the process exit code.

    Returns 0 on success and 2 when the selected command reports a negative
    result (model does not fit, verification failed, quantization refused or
    the method/scheme combination is unsupported). Backend modules are imported
    lazily inside each branch.
    """
    _force_utf8_stdio()
    args = _build_parser().parse_args(argv)
    command = args.cmd

    if command == "check":
        from quantfit.fit import plan

        capacity = plan(args.model)
        print(capacity.reason())
        if capacity.fits:
            return 0
        return 2

    if command == "list":
        from quantfit.registry import catalog

        print(catalog())
        return 0

    if command == "verify":
        from quantfit.verify import verify

        passed, detail = verify(args.model)
        prefix = "PASS: " if passed else "FAIL: "
        print(prefix + detail)
        if passed:
            return 0
        return 2

    if command == "verify-safety":
        from quantfit.safety.verify import verify_safety

        report = verify_safety(args.fp16, args.quant, max_new_tokens=args.max_new_tokens)
        # aggregates only — never echoes raw probe prompts/completions
        print(report.summary())
        if report.clean:
            return 0
        return 2

    if command == "quantize":
        from quantfit.quantize import CannotQuantize, push, quantize
        from quantfit.registry import UnsupportedCombo

        try:
            result_dir = quantize(
                args.model,
                args.method,
                args.out,
                scheme=args.scheme,
                run_check=not args.no_check,
                offload=args.offload,
            )
        except (CannotQuantize, UnsupportedCombo) as exc:
            print(exc)
            return 2
        print(f"quantized -> {result_dir}")
        if args.push:
            url = push(str(result_dir), args.push, private=args.private)
            print(f"pushed -> {url}")
        return 0

    return 1  # unreachable: subparser is required
100
+
101
+
102
+ if __name__ == "__main__":
103
+ raise SystemExit(main())
quantfit/fit.py ADDED
@@ -0,0 +1,130 @@
1
+ """Capacity decision: in-GPU / CPU-offload / refuse.
2
+
3
+ Three resources gate a quantization job:
4
+ - disk : weights must be downloaded (unless cached) + the output written. Needed
5
+ in BOTH gpu and offload modes, so it's a precondition.
6
+ - VRAM : enough -> quantize in-GPU (fast).
7
+ - RAM : not enough VRAM but enough RAM -> hold the model on CPU and stream
8
+ layers to the GPU (offload; slower; fits any size).
9
+ Refuse only when none of the above can be satisfied, and always name the actual
10
+ limiting resource.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import shutil
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+
18
+ import psutil
19
+
20
+ from quantfit.gpufit import (
21
+ CALIB_OVERHEAD_FACTOR,
22
+ HEADROOM_BYTES,
23
+ estimate_fp16_bytes,
24
+ gpu_free_bytes,
25
+ )
26
+
27
+ # Offload holds the model in CPU RAM and moves one layer at a time to the GPU.
28
+ OFFLOAD_RAM_FACTOR = 1.15
29
+ # Quantized output is smaller than fp16; reserve this fraction (covers 8-bit, the
30
+ # largest common output; 4-bit is ~half this).
31
+ OUTPUT_DISK_FACTOR = 0.6
32
+ _GIB = 1024**3
33
+
34
+ MODE_GPU = "gpu"
35
+ MODE_OFFLOAD = "offload"
36
+ MODE_REFUSE = "refuse"
37
+
38
+ LIMIT_NONE = ""
39
+ LIMIT_DISK = "disk"
40
+ LIMIT_MACHINE = "machine"
41
+
42
+
43
@dataclass(frozen=True)
class CapacityPlan:
    """Outcome of the capacity decision for one model, with the measured inputs."""

    model_id: str
    fp16_bytes: int
    gpu_free: int
    ram_available: int
    disk_free: int
    disk_need: int
    mode: str
    limit: str

    @property
    def fits(self) -> bool:
        """True unless the plan's mode is the refuse mode."""
        return not self.mode == MODE_REFUSE

    @property
    def offload(self) -> bool:
        """True when the plan's mode is the offload mode."""
        return self.mode == MODE_OFFLOAD

    def reason(self) -> str:
        """Return a one-line, human-readable explanation of the decision."""

        def gib(num_bytes) -> str:
            # Bytes rendered as GiB with one decimal place.
            return f"{num_bytes / _GIB:.1f}"

        if self.mode == MODE_GPU:
            return (
                f"OK (in-GPU): {self.model_id} ~{gib(self.fp16_bytes)} GB, "
                f"{gib(self.gpu_free)} GB VRAM free."
            )
        if self.mode == MODE_OFFLOAD:
            return (
                f"OK (offload): {self.model_id} ~{gib(self.fp16_bytes)} GB won't fit "
                f"{gib(self.gpu_free)} GB VRAM — quantizing via CPU "
                f"({gib(self.ram_available)} GB RAM). Slower."
            )
        if self.limit == LIMIT_DISK:
            return (
                f"CAN'T QUANTIZE: {self.model_id} needs ~{gib(self.disk_need)} GB free "
                f"disk (download + output) but only {gib(self.disk_free)} GB is free."
            )
        return (
            f"CAN'T QUANTIZE: {self.model_id} ~{gib(self.fp16_bytes)} GB needs more than "
            f"{gib(self.gpu_free)} GB VRAM and {gib(self.ram_available)} GB RAM. "
            f"Use a bigger machine."
        )
85
+
86
+
87
+ def _existing_parent(path: str) -> str:
88
+ p = Path(path).resolve()
89
+ while not p.exists():
90
+ p = p.parent
91
+ return str(p)
92
+
93
+
94
def _cached_weight_bytes(model_id: str) -> int:
    """Bytes of this model's safetensors already in the HF cache (0 if absent).

    Every `*.safetensors` entry under the model's `snapshots` directory is
    resolved to its real file, and each real file is counted once. Entries from
    different snapshots that resolve to the same file are therefore not summed
    repeatedly.
    """
    from huggingface_hub.constants import HF_HUB_CACHE

    snap = Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--")) / "snapshots"
    if not snap.exists():
        return 0

    seen: set[Path] = set()
    total = 0
    for entry in snap.rglob("*.safetensors"):
        try:
            real = entry.resolve()
            if real in seen:
                continue
            seen.add(real)
            total += real.stat().st_size
        except (OSError, RuntimeError):
            # Best effort: an unreadable or dangling entry counts as not cached.
            continue
    return total
108
+
109
+
110
def plan(model_id: str, out_dir: str = ".", token: str | None = None) -> CapacityPlan:
    """Choose gpu / offload / refuse for quantizing `model_id`.

    Disk is checked first because it is needed in every mode; then VRAM
    (in-GPU), then RAM (offload). A refusal records which resource ran out.
    """
    weights = estimate_fp16_bytes(model_id, token=token)
    vram_free = gpu_free_bytes()
    ram_free = int(psutil.virtual_memory().available)
    disk_free = shutil.disk_usage(_existing_parent(out_dir)).free

    # Only the part of the weights not already cached still has to be fetched.
    to_download = max(0, weights - _cached_weight_bytes(model_id))
    disk_required = to_download + int(weights * OUTPUT_DISK_FACTOR)
    vram_required = int(weights * CALIB_OVERHEAD_FACTOR) + HEADROOM_BYTES
    ram_required = int(weights * OFFLOAD_RAM_FACTOR) + HEADROOM_BYTES

    if disk_free < disk_required:
        verdict = (MODE_REFUSE, LIMIT_DISK)
    elif vram_free >= vram_required:
        verdict = (MODE_GPU, LIMIT_NONE)
    elif ram_free >= ram_required:
        verdict = (MODE_OFFLOAD, LIMIT_NONE)
    else:
        verdict = (MODE_REFUSE, LIMIT_MACHINE)

    chosen_mode, chosen_limit = verdict
    return CapacityPlan(
        model_id=model_id,
        fp16_bytes=weights,
        gpu_free=vram_free,
        ram_available=ram_free,
        disk_free=disk_free,
        disk_need=disk_required,
        mode=chosen_mode,
        limit=chosen_limit,
    )
quantfit/gpufit.py ADDED
@@ -0,0 +1,97 @@
1
+ """GPU capacity pre-flight.
2
+
3
+ The whole point of the tool: decide whether a model can be quantized in-GPU on
4
+ *this* machine BEFORE downloading weights or starting a multi-minute job. The
5
+ estimate is the FP16 footprint of the released weights (read from the Hub file
6
+ metadata, no download) times a calibration-overhead factor, plus fixed headroom,
7
+ compared against free VRAM. Errs toward refusal — a clear "can't quantize" beats
8
+ an OOM crash 20 minutes in.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from dataclasses import dataclass
13
+
14
+ from huggingface_hub import HfApi
15
+
16
+ # In-GPU PTQ (GPTQ/AWQ) holds the FP16 model plus per-layer Hessian/activation
17
+ # buffers; observed peak runs ~1.25x the released FP16 size. Headroom covers the
18
+ # CUDA context, calibration-batch activations, and allocator fragmentation.
19
+ CALIB_OVERHEAD_FACTOR = 1.25
20
+ HEADROOM_BYTES = 2 * 1024**3
21
+ _GIB = 1024**3
22
+ # Prefer safetensors; fall back to .bin only if no safetensors shards exist
23
+ # (summing both would double-count repos that ship both formats).
24
+ _WEIGHT_SUFFIXES = (".safetensors", ".bin")
25
+
26
+
27
@dataclass(frozen=True)
class FitReport:
    """Verdict of the in-GPU fit check, with byte counts and GiB accessors."""

    model_id: str
    fp16_bytes: int
    required_bytes: int
    free_bytes: int
    fits: bool

    @property
    def fp16_gib(self) -> float:
        """`fp16_bytes` expressed in GiB."""
        return self.fp16_bytes / _GIB

    @property
    def required_gib(self) -> float:
        """`required_bytes` expressed in GiB."""
        return self.required_bytes / _GIB

    @property
    def free_gib(self) -> float:
        """`free_bytes` expressed in GiB."""
        return self.free_bytes / _GIB

    def reason(self) -> str:
        """Return a one-line, human-readable explanation of the verdict."""
        if not self.fits:
            return (
                f"CAN'T QUANTIZE: {self.model_id} needs ~{self.required_gib:.1f} GB "
                f"in-GPU but only {self.free_gib:.1f} GB is free. Use a bigger GPU "
                f"or a smaller model."
            )
        return (
            f"OK: {self.model_id} is ~{self.fp16_gib:.1f} GB FP16, needs "
            f"~{self.required_gib:.1f} GB to quantize, {self.free_gib:.1f} GB free."
        )
58
+
59
+
60
def estimate_fp16_bytes(model_id: str, token: str | None = None) -> int:
    """Sum the released weight-file sizes from Hub metadata (no weight download).

    File sizes are totalled per suffix in `_WEIGHT_SUFFIXES`; the total of the
    first suffix (in that order) with a non-zero sum is returned, so formats
    shipped side by side are not added together.

    Args:
        model_id: Hub model id.
        token: optional Hub token.

    Returns:
        Total size in bytes of the preferred weight format.

    Raises:
        ValueError: if the metadata lists no weight file with a known size.
    """
    info = HfApi().model_info(model_id, files_metadata=True, token=token)
    by_suffix: dict[str, int] = {}
    # `siblings` may be None when the Hub returns no file listing; treat that
    # as "no files" so the ValueError below is raised instead of a TypeError.
    for f in info.siblings or ():
        if not f.size:
            continue
        for suffix in _WEIGHT_SUFFIXES:
            if f.rfilename.endswith(suffix):
                by_suffix[suffix] = by_suffix.get(suffix, 0) + f.size
                break
    for suffix in _WEIGHT_SUFFIXES:
        if by_suffix.get(suffix):
            return by_suffix[suffix]
    raise ValueError(
        f"{model_id}: no weight-file sizes found via Hub metadata; cannot "
        "estimate footprint (model may be gated without access, or unavailable)."
    )
75
+
76
+
77
def gpu_free_bytes() -> int:
    """Return the free memory reported by `torch.cuda.mem_get_info()`, in bytes.

    Raises:
        RuntimeError: if torch reports that CUDA is not available.
    """
    import torch

    if torch.cuda.is_available():
        free_bytes, _ = torch.cuda.mem_get_info()
        return int(free_bytes)
    raise RuntimeError("no CUDA GPU visible; quantfit needs a GPU to quantize.")
85
+
86
+
87
def check_fit(
    model_id: str,
    token: str | None = None,
    overhead: float = CALIB_OVERHEAD_FACTOR,
    headroom_bytes: int = HEADROOM_BYTES,
) -> FitReport:
    """Compare the estimated quantization footprint with free VRAM.

    The requirement is the FP16 weight size times `overhead`, truncated to an
    int, plus `headroom_bytes`; the model fits when that does not exceed the
    free VRAM.
    """
    weight_bytes = estimate_fp16_bytes(model_id, token=token)
    needed_bytes = int(weight_bytes * overhead) + headroom_bytes
    available_bytes = gpu_free_bytes()
    return FitReport(
        model_id,
        weight_bytes,
        needed_bytes,
        available_bytes,
        fits=needed_bytes <= available_bytes,
    )
quantfit/quantize.py ADDED
@@ -0,0 +1,98 @@
1
+ """Dispatcher: validate the request, GPU pre-flight, route to a backend, card it."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+ from quantfit.fit import MODE_REFUSE, plan
7
+ from quantfit.registry import BACKEND_CT, BACKEND_GGUF, resolve
8
+ from quantfit.spec import DEFAULT_SPEC, QuantSpec
9
+
10
+
11
class CannotQuantize(RuntimeError):
    """Raised when the capacity pre-flight refuses the job.

    `quantize` raises it with the plan's reason text whenever the plan's mode
    is the refuse mode, so the message names the resource that ran out.
    """
13
+
14
+
15
def quantize(
    model_id: str,
    method: str,
    out_dir: str,
    scheme: str | None = None,
    spec: QuantSpec = DEFAULT_SPEC,
    token: str | None = None,
    run_check: bool = True,
    offload: bool = False,
) -> Path:
    """Quantize `model_id` with `method` (+ optional `scheme`) into `out_dir`.

    Resolves the method/scheme, optionally runs the capacity pre-flight,
    dispatches to the matching backend and writes a README.md model card into
    the output directory.

    Raises:
        CannotQuantize: if the pre-flight refuses the job.
        NotImplementedError: if the resolved method names an unknown backend.
    """
    m, resolved_scheme = resolve(method, scheme)
    backend = m.backend

    # The 3-tier capacity plan applies to GPU quantization (compressed-tensors).
    # GGUF quantization is CPU-only, so it skips the VRAM/offload plan.
    if run_check and backend == BACKEND_CT:
        capacity = plan(model_id, out_dir, token=token)
        if capacity.mode == MODE_REFUSE:
            raise CannotQuantize(capacity.reason())
        # The plan may turn offloading on; it never turns a user request off.
        offload = offload or capacity.offload

    if backend == BACKEND_CT:
        from quantfit.backends.compressed_tensors import quantize_ct

        produced = quantize_ct(
            model_id,
            m.name,
            resolved_scheme,
            out_dir,
            spec,
            m.needs_calibration,
            token=token,
            offload=offload,
        )
    elif backend == BACKEND_GGUF:
        from quantfit.backends.gguf import quantize_gguf

        produced = quantize_gguf(model_id, resolved_scheme, out_dir, token=token)
    else:
        raise NotImplementedError(f"backend {m.backend!r} is not wired yet")

    result = Path(produced)
    _write_card(result, model_id, m.name, resolved_scheme, spec)
    return result
53
+
54
+
55
def _write_card(out: Path, model_id: str, method: str, scheme: str, spec: QuantSpec) -> None:
    """Write a README.md model card into `out`.

    The "gguf" method gets a short GGUF card; every other method gets a card
    with a provenance section built from `spec`. The file is written as UTF-8
    and replaces any existing README.md.
    """
    if method == "gguf":
        # The front matter starts on the literal's first line so the file
        # begins with the `---` marker.
        card = f"""---
base_model: {model_id}
tags: [quantized, gguf, {scheme.lower()}, llama.cpp, quantfit]
---

# {out.name}

GGUF {scheme} quantization of `{model_id}`, produced with
[quantfit](https://github.com/Sahil170595/quantfit).

Loads in llama.cpp / Ollama / LM Studio. k-quant, no calibration (no imatrix).
"""
    else:
        # The trailing backslash in the calibration bullet joins it with the
        # following line, so the bullet is emitted as a single line.
        card = f"""---
base_model: {model_id}
tags: [quantized, {method}, {scheme.lower()}, compressed-tensors, quantfit]
---

# {out.name}

{method.upper()} quantization ({scheme}) of `{model_id}`, produced with
[quantfit](https://github.com/Sahil170595/quantfit).

## Provenance
- method: {method}, scheme: {scheme}, group_size {spec.group_size}
- calibration: {spec.calib_dataset}/{spec.calib_config} [{spec.calib_split}], \
{spec.calib_samples} samples, seq-len {spec.calib_seqlen}, seed {spec.seed}
- spec fingerprint: `{spec.fingerprint()}`

Loads in vLLM via the compressed-tensors backend.
"""
    (out / "README.md").write_text(card, encoding="utf-8")
89
+
90
+
91
def push(out_dir: str, repo_id: str, token: str | None = None, private: bool = False) -> str:
    """Upload the folder `out_dir` to the Hub model repo `repo_id`.

    The repo is created if it does not exist yet (private when `private` is
    true). Returns the repo's huggingface.co URL.
    """
    from huggingface_hub import HfApi

    hub = HfApi(token=token)
    kind = "model"
    hub.create_repo(repo_id, exist_ok=True, private=private, repo_type=kind)
    hub.upload_folder(folder_path=str(out_dir), repo_id=repo_id, repo_type=kind)
    return f"https://huggingface.co/{repo_id}"