nearbits 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. nearbits-0.2.0/PKG-INFO +118 -0
  2. nearbits-0.2.0/README.md +106 -0
  3. nearbits-0.2.0/pyproject.toml +25 -0
  4. nearbits-0.2.0/setup.cfg +4 -0
  5. nearbits-0.2.0/src/nearbit/__init__.py +14 -0
  6. nearbits-0.2.0/src/nearbit/analysis/__init__.py +4 -0
  7. nearbits-0.2.0/src/nearbit/analysis/analyzer.py +79 -0
  8. nearbits-0.2.0/src/nearbit/analysis/introspection.py +81 -0
  9. nearbits-0.2.0/src/nearbit/api.py +142 -0
  10. nearbits-0.2.0/src/nearbit/compression/__init__.py +3 -0
  11. nearbits-0.2.0/src/nearbit/compression/activations.py +9 -0
  12. nearbits-0.2.0/src/nearbit/compression/binary.py +18 -0
  13. nearbits-0.2.0/src/nearbit/compression/compressor.py +66 -0
  14. nearbits-0.2.0/src/nearbit/compression/kv_cache.py +15 -0
  15. nearbits-0.2.0/src/nearbit/compression/outliers.py +16 -0
  16. nearbits-0.2.0/src/nearbit/compression/residual.py +14 -0
  17. nearbits-0.2.0/src/nearbit/defaults.py +25 -0
  18. nearbits-0.2.0/src/nearbit/examples/__init__.py +1 -0
  19. nearbits-0.2.0/src/nearbit/examples/toy_transformer.py +31 -0
  20. nearbits-0.2.0/src/nearbit/integration/__init__.py +10 -0
  21. nearbits-0.2.0/src/nearbit/integration/huggingface.py +116 -0
  22. nearbits-0.2.0/src/nearbit/integration/hybrid_lowbit.py +411 -0
  23. nearbits-0.2.0/src/nearbit/packaging/__init__.py +3 -0
  24. nearbits-0.2.0/src/nearbit/packaging/builder.py +39 -0
  25. nearbits-0.2.0/src/nearbit/planning/__init__.py +3 -0
  26. nearbits-0.2.0/src/nearbit/planning/planner.py +82 -0
  27. nearbits-0.2.0/src/nearbit/runtime/__init__.py +3 -0
  28. nearbits-0.2.0/src/nearbit/runtime/arena.py +17 -0
  29. nearbits-0.2.0/src/nearbit/runtime/kernels.py +11 -0
  30. nearbits-0.2.0/src/nearbit/runtime/runner.py +29 -0
  31. nearbits-0.2.0/src/nearbit/types.py +131 -0
  32. nearbits-0.2.0/src/nearbits.egg-info/PKG-INFO +118 -0
  33. nearbits-0.2.0/src/nearbits.egg-info/SOURCES.txt +34 -0
  34. nearbits-0.2.0/src/nearbits.egg-info/dependency_links.txt +1 -0
  35. nearbits-0.2.0/src/nearbits.egg-info/requires.txt +4 -0
  36. nearbits-0.2.0/src/nearbits.egg-info/top_level.txt +1 -0
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: nearbits
3
+ Version: 0.2.0
4
+ Summary: Hybrid low-bit compression for Hugging Face models on low-RAM devices.
5
+ Author: Codex
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: transformers>=4.40.0
9
+ Requires-Dist: accelerate>=0.28.0
10
+ Requires-Dist: sentencepiece>=0.1.99
11
+ Requires-Dist: psutil>=5.9.0
12
+
13
+ # NearBit
14
+
15
+ NearBit is a Python package for compressing Hugging Face models with a single strategy: `hybrid_lowbit`.
16
+
17
+ The package keeps high-impact parts in `int8`, compresses the bulk of linear layers to `binary` or `ternary`, rescues outlier weights in `int8`, and targets the lowest practical RAM footprint without retraining.
18
+
19
+ ## What NearBit does
20
+
21
+ - Compresses `nn.Linear` layers with `binary` or `ternary` storage plus group-wise scales
22
+ - Keeps sensitive linear layers in `int8`
23
+ - Compresses `nn.Embedding` tables to `int8`
24
+ - Works directly on existing Hugging Face models
25
+ - Exposes simple chat and prompt benchmarking helpers
26
+
27
+ ## Install
28
+
29
+ Install PyTorch first for your platform, then install NearBit.
30
+
31
+ ```bash
32
+ pip install torch
33
+ pip install nearbits
34
+ ```
35
+
36
+ For local development from this repo:
37
+
38
+ ```bash
39
+ pip install torch
40
+ pip install -e .
41
+ ```
42
+
43
+ ## Quick Start
44
+
45
+ ```python
46
+ from nearbit import HybridLowBitConfig, chat, compress_hf_model, load_hf_model, model_report
47
+
48
+ model, tokenizer, model_kind = load_hf_model("Qwen/Qwen2.5-1.5B-Instruct")
49
+ print(model_report(model))
50
+
51
+ config = HybridLowBitConfig(
52
+ group_size=128,
53
+ binary_outlier_fraction=0.002,
54
+ ternary_outlier_fraction=0.001,
55
+ binary_nmse_threshold=0.045,
56
+ ternary_nmse_threshold=0.02,
57
+ )
58
+
59
+ compressed_model = compress_hf_model(model, config)
60
+ print(model_report(compressed_model))
61
+
62
+ reply = chat(compressed_model, tokenizer, "Who are you?")
63
+ print(reply)
64
+ ```
65
+
66
+ ## Google Colab Usage
67
+
68
+ Use `examples/google_collab_hybrid_lowbit.py` as the runnable Colab example.
69
+
70
+ Typical Colab cells:
71
+
72
+ ```python
73
+ !pip install torch
74
+ !pip install nearbits
75
+ ```
76
+
77
+ ```python
78
+ from nearbit import HybridLowBitConfig, benchmark_prompts, compress_hf_model, load_hf_model, model_report
79
+
80
+ MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
81
+ PROMPTS = [
82
+ "hi",
83
+ "hello",
84
+ "who are you?",
85
+ "what is 2 + 2?",
86
+ "write one sentence about India.",
87
+ "explain model compression in simple words.",
88
+ ]
89
+
90
+ model, tokenizer, model_kind = load_hf_model(MODEL_ID)
91
+ print("Before:", model_report(model))
92
+
93
+ compressed_model = compress_hf_model(model, HybridLowBitConfig())
94
+ print("After:", model_report(compressed_model))
95
+
96
+ rows = benchmark_prompts(compressed_model, tokenizer, PROMPTS)
97
+ for row in rows:
98
+ print(row["prompt"])
99
+ print(row["response"])
100
+ print(row["latency_sec"])
101
+ ```
102
+
103
+ ## Public API
104
+
105
+ - `load_hf_model(model_id, trust_remote_code=False)`
106
+ - `compress_hf_model(model, config=None)`
107
+ - `model_size_mb(model)`
108
+ - `model_report(model)`
109
+ - `chat(model, tokenizer, prompt, ...)`
110
+ - `benchmark_prompts(model, tokenizer, prompts, ...)`
111
+ - `HybridLowBitConfig(...)`
112
+
113
+ ## Notes
114
+
115
+ - NearBit is centered on one executable path: `hybrid_lowbit`
116
+ - There is no public `int8` fallback API in the package surface
117
+ - Quality depends on the model architecture and compression thresholds
118
+ - The strongest RAM reduction comes from the hybrid path, not from pure `int8`
@@ -0,0 +1,106 @@
1
+ # NearBit
2
+
3
+ NearBit is a Python package for compressing Hugging Face models with a single strategy: `hybrid_lowbit`.
4
+
5
+ The package keeps high-impact parts in `int8`, compresses the bulk of linear layers to `binary` or `ternary`, rescues outlier weights in `int8`, and targets the lowest practical RAM footprint without retraining.
6
+
7
+ ## What NearBit does
8
+
9
+ - Compresses `nn.Linear` layers with `binary` or `ternary` storage plus group-wise scales
10
+ - Keeps sensitive linear layers in `int8`
11
+ - Compresses `nn.Embedding` tables to `int8`
12
+ - Works directly on existing Hugging Face models
13
+ - Exposes simple chat and prompt benchmarking helpers
14
+
15
+ ## Install
16
+
17
+ Install PyTorch first for your platform, then install NearBit.
18
+
19
+ ```bash
20
+ pip install torch
21
+ pip install nearbits
22
+ ```
23
+
24
+ For local development from this repo:
25
+
26
+ ```bash
27
+ pip install torch
28
+ pip install -e .
29
+ ```
30
+
31
+ ## Quick Start
32
+
33
+ ```python
34
+ from nearbit import HybridLowBitConfig, chat, compress_hf_model, load_hf_model, model_report
35
+
36
+ model, tokenizer, model_kind = load_hf_model("Qwen/Qwen2.5-1.5B-Instruct")
37
+ print(model_report(model))
38
+
39
+ config = HybridLowBitConfig(
40
+ group_size=128,
41
+ binary_outlier_fraction=0.002,
42
+ ternary_outlier_fraction=0.001,
43
+ binary_nmse_threshold=0.045,
44
+ ternary_nmse_threshold=0.02,
45
+ )
46
+
47
+ compressed_model = compress_hf_model(model, config)
48
+ print(model_report(compressed_model))
49
+
50
+ reply = chat(compressed_model, tokenizer, "Who are you?")
51
+ print(reply)
52
+ ```
53
+
54
+ ## Google Colab Usage
55
+
56
+ Use `examples/google_collab_hybrid_lowbit.py` as the runnable Colab example.
57
+
58
+ Typical Colab cells:
59
+
60
+ ```python
61
+ !pip install torch
62
+ !pip install nearbits
63
+ ```
64
+
65
+ ```python
66
+ from nearbit import HybridLowBitConfig, benchmark_prompts, compress_hf_model, load_hf_model, model_report
67
+
68
+ MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
69
+ PROMPTS = [
70
+ "hi",
71
+ "hello",
72
+ "who are you?",
73
+ "what is 2 + 2?",
74
+ "write one sentence about India.",
75
+ "explain model compression in simple words.",
76
+ ]
77
+
78
+ model, tokenizer, model_kind = load_hf_model(MODEL_ID)
79
+ print("Before:", model_report(model))
80
+
81
+ compressed_model = compress_hf_model(model, HybridLowBitConfig())
82
+ print("After:", model_report(compressed_model))
83
+
84
+ rows = benchmark_prompts(compressed_model, tokenizer, PROMPTS)
85
+ for row in rows:
86
+ print(row["prompt"])
87
+ print(row["response"])
88
+ print(row["latency_sec"])
89
+ ```
90
+
91
+ ## Public API
92
+
93
+ - `load_hf_model(model_id, trust_remote_code=False)`
94
+ - `compress_hf_model(model, config=None)`
95
+ - `model_size_mb(model)`
96
+ - `model_report(model)`
97
+ - `chat(model, tokenizer, prompt, ...)`
98
+ - `benchmark_prompts(model, tokenizer, prompts, ...)`
99
+ - `HybridLowBitConfig(...)`
100
+
101
+ ## Notes
102
+
103
+ - NearBit is centered on one executable path: `hybrid_lowbit`
104
+ - There is no public `int8` fallback API in the package surface
105
+ - Quality depends on the model architecture and compression thresholds
106
+ - The strongest RAM reduction comes from the hybrid path, not from pure `int8`
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "nearbits"
7
+ version = "0.2.0"
8
+ description = "Hybrid low-bit compression for Hugging Face models on low-RAM devices."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [
12
+ { name = "Codex" }
13
+ ]
14
+ dependencies = [
15
+ "transformers>=4.40.0",
16
+ "accelerate>=0.28.0",
17
+ "sentencepiece>=0.1.99",
18
+ "psutil>=5.9.0"
19
+ ]
20
+
21
+ [tool.setuptools]
22
+ package-dir = { "" = "src" }
23
+
24
+ [tool.setuptools.packages.find]
25
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,14 @@
1
+ from .api import benchmark_prompts, chat, compress_hf_model, load_hf_model, model_report, model_size_mb
2
+ from .integration import HybridLowBitConfig, detect_model_dtypes, estimate_effective_bits
3
+
4
+ __all__ = [
5
+ "HybridLowBitConfig",
6
+ "benchmark_prompts",
7
+ "chat",
8
+ "compress_hf_model",
9
+ "detect_model_dtypes",
10
+ "estimate_effective_bits",
11
+ "load_hf_model",
12
+ "model_report",
13
+ "model_size_mb",
14
+ ]
@@ -0,0 +1,4 @@
1
+ from .analyzer import Analyzer
2
+ from .introspection import ModelInspector
3
+
4
+ __all__ = ["Analyzer", "ModelInspector"]
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterable
4
+ from statistics import mean
5
+ from typing import Any
6
+
7
+ from ..types import CalibrationSample, HardwareProfile, LayerSensitivity, PrecisionKind, SensitivityReport
8
+ from .introspection import ModelInspector
9
+
10
+
11
class Analyzer:
    """Scores each layer's sensitivity to low-bit compression.

    The scores are synthetic: they are derived from structural heuristics
    (layer size, position, fragility flags reported by ModelInspector),
    not from running the model on the calibration data.
    """

    def __init__(self, hardware_profile: HardwareProfile) -> None:
        self.hardware_profile = hardware_profile
        self.inspector = ModelInspector()

    def analyze(self, model: Any, calibration_set: Iterable[CalibrationSample]) -> SensitivityReport:
        """Produce a SensitivityReport for *model*.

        NOTE(review): the calibration samples only affect the report
        through their count (`sample_factor` and metadata); their
        contents are never inspected.
        """
        samples = list(calibration_set)
        model_name, model_type = self.inspector.identify_model(model)
        layers = self.inspector.inspect_layers(model)
        # Floor at 1 to avoid division by zero when the calibration set is empty.
        sample_factor = max(len(samples), 1)

        sensitivities: list[LayerSensitivity] = []
        for index, layer in enumerate(layers):
            # Larger layers score as more sensitive, capped at 4.0 (4M params).
            size_factor = min(layer.parameter_count / 1_000_000, 4.0)
            # First and last layers get an extra positional penalty.
            position_penalty = 0.8 if index in (0, len(layers) - 1) else 0.0
            fragile_penalty = 1.5 if layer.is_fragile else 0.0
            # Binary-friendly layers are rewarded; others penalized.
            binary_bonus = -0.3 if layer.supports_binary else 0.8
            score = round(0.4 + size_factor + position_penalty + fragile_penalty + binary_bonus, 4)

            # Synthetic activation range that widens slightly with depth.
            min_act = round(-1.0 - 0.02 * index, 4)
            max_act = round(1.0 + 0.03 * index, 4)
            outlier_fraction = round(min(0.01, 0.001 + size_factor / 100 + index / (10_000 * sample_factor)), 6)
            estimated_binary_error = round(score * 0.018 + outlier_fraction * 10, 6)

            recommended = self._recommended_precision(layer.is_fragile, layer.supports_binary, estimated_binary_error)
            sensitivities.append(
                LayerSensitivity(
                    name=layer.name,
                    op_type=layer.op_type,
                    sensitivity_score=score,
                    activation_range=(min_act, max_act),
                    outlier_fraction=outlier_fraction,
                    estimated_binary_error=estimated_binary_error,
                    is_fragile=layer.is_fragile,
                    supports_binary=layer.supports_binary,
                    recommended_precision=recommended,
                )
            )

        # Higher mean binary error -> lower baseline metric, floored at 0.
        # NOTE(review): mean() raises StatisticsError when the model has no
        # layers — confirm zero-layer models are impossible upstream.
        baseline_metric = round(max(0.0, 1.0 - mean(layer.estimated_binary_error for layer in sensitivities) / 2), 6)
        return SensitivityReport(
            model_name=model_name,
            model_type=model_type,
            layers=sensitivities,
            baseline_metric=baseline_metric,
            metadata={
                "calibration_samples": len(samples),
                "hardware_target": self.hardware_profile.device_name,
                "accelerator": self.hardware_profile.accelerator,
            },
        )

    def _recommended_precision(
        self,
        is_fragile: bool,
        supports_binary: bool,
        estimated_binary_error: float,
    ) -> PrecisionKind:
        """Map fragility/binary-support flags and the error estimate to a PrecisionKind.

        Thresholds escalate from pure BINARY, through outlier rescue and a
        low-rank residual, up to INT4/INT8 as the estimated error grows.
        """
        if is_fragile:
            return PrecisionKind.INT8
        if not supports_binary:
            return PrecisionKind.INT4 if estimated_binary_error < 0.08 else PrecisionKind.INT8
        if estimated_binary_error < 0.03:
            return PrecisionKind.BINARY
        if estimated_binary_error < 0.06:
            return PrecisionKind.BINARY_OUTLIERS
        if estimated_binary_error < 0.09:
            return PrecisionKind.BINARY_LOW_RANK
        return PrecisionKind.INT4
@@ -0,0 +1,81 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Iterable
4
+
5
+ from ..types import LayerProfile
6
+
7
+
8
class ModelInspector:
    """Adapter-style model inspection with safe fallbacks for existing models."""

    # Tokens (matched as substrings of the layer name or op type) that mark
    # layers too fragile to binarize.
    FRAGILE_OPS = {"embedding", "norm", "layernorm", "rmsnorm", "head", "lm_head"}
    # Op types eligible for binary storage.
    SAFE_BINARY_OPS = {"linear", "conv1d", "conv2d", "matmul"}

    def inspect_layers(self, model: Any) -> list[LayerProfile]:
        """Return a LayerProfile for each layer of *model*.

        Supports, in priority order: an object exposing nearbit_layers(),
        a torch-style module with named_modules(), or a plain dict with a
        "layers" entry.

        Raises:
            TypeError: when the model matches none of these shapes.
        """
        if hasattr(model, "nearbit_layers"):
            return list(model.nearbit_layers())
        if hasattr(model, "named_modules"):
            return self._from_named_modules(model.named_modules())
        if isinstance(model, dict) and "layers" in model:
            return self._from_mapping(model["layers"])
        raise TypeError("Unsupported model format. Provide nearbit_layers(), named_modules(), or a layer mapping.")

    def identify_model(self, model: Any) -> tuple[str, str]:
        """Return (model_name, model_type), defaulting to the class name and "transformer"."""
        model_name = getattr(model, "name", model.__class__.__name__)
        model_type = getattr(model, "model_type", "transformer")
        return model_name, model_type

    def _from_named_modules(self, modules: Iterable[tuple[str, Any]]) -> list[LayerProfile]:
        """Build profiles from a torch-style named_modules() iterable."""
        layers: list[LayerProfile] = []
        for name, module in modules:
            # Skip the unnamed root entry yielded by named_modules().
            if not name:
                continue
            op_type = module.__class__.__name__.lower()
            # Prefer an explicit parameter_count attribute; otherwise count
            # the module's parameter elements.
            parameter_count = int(getattr(module, "parameter_count", 0) or self._safe_numel(module))
            fragile = self._is_fragile(name, op_type)
            supports_binary = self._supports_binary(op_type) and not fragile
            layers.append(
                LayerProfile(
                    name=name,
                    op_type=op_type,
                    parameter_count=parameter_count,
                    is_fragile=fragile,
                    supports_binary=supports_binary,
                )
            )
        return layers

    def _from_mapping(self, layers_map: Iterable[dict[str, Any]]) -> list[LayerProfile]:
        """Build profiles from a list of plain dict layer descriptions."""
        layers: list[LayerProfile] = []
        for item in layers_map:
            name = item["name"]
            op_type = str(item.get("op_type", "linear")).lower()
            # Explicit flags in the mapping take precedence over the heuristics.
            fragile = bool(item.get("is_fragile", self._is_fragile(name, op_type)))
            supports_binary = bool(item.get("supports_binary", self._supports_binary(op_type) and not fragile))
            layers.append(
                LayerProfile(
                    name=name,
                    op_type=op_type,
                    parameter_count=int(item.get("parameter_count", 0)),
                    output_shape=tuple(item.get("output_shape", ())),
                    is_fragile=fragile,
                    supports_binary=supports_binary,
                )
            )
        return layers

    def _is_fragile(self, name: str, op_type: str) -> bool:
        """True when the (lowercased) name or op type contains any fragile token."""
        name_lower = name.lower()
        return any(token in name_lower or token in op_type for token in self.FRAGILE_OPS)

    def _supports_binary(self, op_type: str) -> bool:
        """True when the op type contains any binary-safe token."""
        return any(token in op_type for token in self.SAFE_BINARY_OPS)

    def _safe_numel(self, module: Any) -> int:
        """Total parameter element count, or 0 when parameters() is absent."""
        parameters = getattr(module, "parameters", None)
        if parameters is None:
            return 0
        total = 0
        for param in parameters():
            # Tolerate parameter objects that lack numel().
            total += int(getattr(param, "numel", lambda: 0)())
        return total
@@ -0,0 +1,142 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from typing import Any
5
+
6
+ import torch
7
+
8
+ from .integration import (
9
+ HybridLowBitConfig,
10
+ convert_model_to_hybrid_lowbit,
11
+ detect_model_dtypes,
12
+ estimate_effective_bits,
13
+ )
14
+
15
+
16
def load_hf_model(model_id: str, trust_remote_code: bool = False):
    """Load a Hugging Face tokenizer and model for *model_id*.

    Tries AutoModelForCausalLM first, then AutoModelForSeq2SeqLM, and
    returns ``(model, tokenizer, model_kind)`` where model_kind is
    "causal" or "seq2seq".

    Raises:
        RuntimeError: when neither loader can load the model, with the
            per-loader failure details joined into the message.
    """
    from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    # Generation needs a pad token; fall back to EOS when none is defined.
    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token

    failures: list[str] = []
    attempts = (("causal", AutoModelForCausalLM), ("seq2seq", AutoModelForSeq2SeqLM))
    for kind, auto_cls in attempts:
        try:
            loaded = auto_cls.from_pretrained(
                model_id,
                trust_remote_code=trust_remote_code,
                low_cpu_mem_usage=True,
            )
            loaded.eval()
            return loaded, tokenizer, kind
        except Exception as exc:
            # Record why this architecture family failed and try the next.
            failures.append(f"{kind}: {exc}")
    raise RuntimeError(f"Could not load model '{model_id}'. Details: {' | '.join(failures)}")
36
+
37
+
38
def compress_hf_model(model: Any, config: HybridLowBitConfig | None = None):
    """Convert *model* to the hybrid low-bit format.

    When *config* is omitted, a default HybridLowBitConfig is used.
    """
    effective_config = config or HybridLowBitConfig()
    return convert_model_to_hybrid_lowbit(model, effective_config)
40
+
41
+
42
def model_size_mb(model: Any) -> float:
    """Return the model's parameter storage size in MiB, rounded to 4 decimals.

    Entries in state_dict() that do not expose element_size()/nelement()
    (i.e. non-tensor values) are skipped.
    """
    size_bytes = sum(
        entry.element_size() * entry.nelement()
        for entry in model.state_dict().values()
        if hasattr(entry, "element_size") and hasattr(entry, "nelement")
    )
    return round(size_bytes / (1024 ** 2), 4)
48
+
49
+
50
def model_report(model: Any) -> dict[str, Any]:
    """Summarize *model*: size in MB, estimated effective bits, dtype mix,
    and the low-bit conversion summary (empty dict when never converted)."""
    summary = getattr(model, "nearbit_lowbit_summary", {})
    report: dict[str, Any] = {
        "model_size_mb": model_size_mb(model),
        "effective_bits": estimate_effective_bits(model),
        "dtypes": detect_model_dtypes(model),
        "lowbit_summary": dict(summary),
    }
    return report
57
+
58
+
59
def chat(
    model: Any,
    tokenizer: Any,
    prompt: str,
    *,
    system_prompt: str | None = "You are a concise, helpful assistant.",
    max_new_tokens: int = 128,
    do_sample: bool = False,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> str:
    """Generate one assistant reply for *prompt*.

    Sampling knobs (temperature/top_p) are only forwarded when
    *do_sample* is True. Returns the decoded reply, stripped of
    special tokens and surrounding whitespace.
    """
    from transformers import GenerationConfig

    inputs = _build_inputs(tokenizer, prompt, system_prompt)
    gen_options: dict[str, Any] = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        gen_options.update(temperature=temperature, top_p=top_p)

    with torch.inference_mode():
        output = model.generate(**inputs, generation_config=GenerationConfig(**gen_options))

    model_config = getattr(model, "config", None)
    if getattr(model_config, "is_encoder_decoder", False):
        # Encoder-decoder outputs contain only the generated tokens.
        generated = output[0]
    else:
        # Decoder-only outputs echo the prompt; drop it before decoding.
        prompt_len = inputs["input_ids"].shape[-1]
        generated = output[0][prompt_len:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()
94
+
95
+
96
def benchmark_prompts(
    model: Any,
    tokenizer: Any,
    prompts: list[str],
    *,
    system_prompt: str | None = "You are a concise, helpful assistant.",
    max_new_tokens: int = 64,
    do_sample: bool = False,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> list[dict[str, Any]]:
    """Run chat() over each prompt and record latency/throughput.

    Each result row holds the prompt, the response, wall-clock latency in
    seconds, the response token count, and tokens/sec (None when the
    measured latency is zero).
    """
    results: list[dict[str, Any]] = []
    for text in prompts:
        start = time.perf_counter()
        answer = chat(
            model,
            tokenizer,
            text,
            system_prompt=system_prompt,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
        )
        latency = time.perf_counter() - start
        # Count response tokens only, without special tokens.
        n_tokens = len(tokenizer.encode(answer, add_special_tokens=False))
        row: dict[str, Any] = {
            "prompt": text,
            "response": answer,
            "latency_sec": round(latency, 4),
            "generated_tokens": n_tokens,
            "tokens_per_sec": round(n_tokens / latency, 4) if latency > 0 else None,
        }
        results.append(row)
    return results
132
+
133
+
134
+ def _build_inputs(tokenizer: Any, prompt: str, system_prompt: str | None = None):
135
+ if getattr(tokenizer, "chat_template", None):
136
+ messages = []
137
+ if system_prompt:
138
+ messages.append({"role": "system", "content": system_prompt})
139
+ messages.append({"role": "user", "content": prompt})
140
+ rendered = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
141
+ return tokenizer(rendered, return_tensors="pt")
142
+ return tokenizer(prompt, return_tensors="pt")
@@ -0,0 +1,3 @@
1
+ from .compressor import Compressor
2
+
3
+ __all__ = ["Compressor"]
@@ -0,0 +1,9 @@
1
+ from __future__ import annotations
2
+
3
+ from ..types import PrecisionKind
4
+
5
+
6
+ def activation_policy(model_type: str) -> dict[str, str]:
7
+ if model_type == "transformer":
8
+ return {"default_precision": PrecisionKind.INT8.value, "fallback_precision": PrecisionKind.INT4.value}
9
+ return {"default_precision": PrecisionKind.INT8.value, "fallback_precision": PrecisionKind.INT8.value}
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable
4
+
5
+
6
+ def pack_signs(weights: Iterable[float], group_size: int) -> dict[str, object]:
7
+ values = list(weights)
8
+ packed = [1 if value >= 0 else 0 for value in values]
9
+ scales = []
10
+ for index in range(0, len(values), group_size):
11
+ group = values[index : index + group_size]
12
+ mean_abs = sum(abs(value) for value in group) / max(len(group), 1)
13
+ scales.append(mean_abs)
14
+ return {
15
+ "bitpack": packed,
16
+ "group_size": group_size,
17
+ "scales": scales,
18
+ }
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from ..packaging.builder import ArtifactBuilder
6
+ from ..types import CompressionArtifact, CompressionPlan, PackedLayer, PrecisionKind
7
+ from .binary import pack_signs
8
+ from .kv_cache import kv_cache_policy
9
+ from .outliers import extract_outliers
10
+ from .residual import build_low_rank_residual
11
+
12
+
13
class Compressor:
    """Packs model weights layer-by-layer according to a CompressionPlan."""

    def compress(self, model: Any, compression_plan: CompressionPlan) -> CompressionArtifact:
        """Compress *model* per *compression_plan* into a CompressionArtifact.

        Layers named in the plan but absent from the model's weight
        mapping fall back to a single-zero placeholder weight list.
        """
        weights_by_layer = self._read_weights(model)
        packed_layers: list[PackedLayer] = []

        for layer_plan in compression_plan.layers:
            # Placeholder when the model does not expose this layer's weights.
            weights = weights_by_layer.get(layer_plan.name, [0.0])
            payload = self._compress_layer(weights, layer_plan.precision, layer_plan.group_size, layer_plan.outlier_fraction, layer_plan.residual_rank)
            packed_layers.append(
                PackedLayer(
                    name=layer_plan.name,
                    precision=layer_plan.precision,
                    payload=payload,
                )
            )

        return ArtifactBuilder().build(
            compression_plan=compression_plan,
            packed_layers=packed_layers,
            kv_policy=kv_cache_policy(compression_plan.model_type),
        )

    def _compress_layer(
        self,
        weights: list[float],
        precision: PrecisionKind,
        group_size: int,
        outlier_fraction: float,
        residual_rank: int,
    ) -> dict[str, object]:
        """Build the serialized payload for one layer at the given precision.

        BINARY variants progressively add outlier rescue and a low-rank
        residual; any other precision stores the raw values alongside the
        intended storage precision.
        """
        if precision == PrecisionKind.BINARY:
            return pack_signs(weights, group_size)
        if precision == PrecisionKind.BINARY_OUTLIERS:
            return {
                "binary": pack_signs(weights, group_size),
                "outliers": extract_outliers(weights, outlier_fraction),
            }
        if precision == PrecisionKind.BINARY_LOW_RANK:
            return {
                "binary": pack_signs(weights, group_size),
                "outliers": extract_outliers(weights, outlier_fraction),
                "residual": build_low_rank_residual(weights, residual_rank),
            }
        # Non-binary precisions keep the raw values unchanged.
        return {
            "values": weights,
            "stored_precision": precision.value,
        }

    def _read_weights(self, model: Any) -> dict[str, list[float]]:
        """Extract a name -> weight-list mapping from supported model shapes.

        Supports an object exposing nearbit_weights() or a dict with a
        "weights" entry; returns an empty mapping otherwise, in which case
        compress() substitutes placeholder weights for every layer.
        """
        if hasattr(model, "nearbit_weights"):
            return dict(model.nearbit_weights())
        if isinstance(model, dict) and "weights" in model:
            return {str(key): list(value) for key, value in model["weights"].items()}
        return {}