nearbits 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nearbits-0.2.0/PKG-INFO +118 -0
- nearbits-0.2.0/README.md +106 -0
- nearbits-0.2.0/pyproject.toml +25 -0
- nearbits-0.2.0/setup.cfg +4 -0
- nearbits-0.2.0/src/nearbit/__init__.py +14 -0
- nearbits-0.2.0/src/nearbit/analysis/__init__.py +4 -0
- nearbits-0.2.0/src/nearbit/analysis/analyzer.py +79 -0
- nearbits-0.2.0/src/nearbit/analysis/introspection.py +81 -0
- nearbits-0.2.0/src/nearbit/api.py +142 -0
- nearbits-0.2.0/src/nearbit/compression/__init__.py +3 -0
- nearbits-0.2.0/src/nearbit/compression/activations.py +9 -0
- nearbits-0.2.0/src/nearbit/compression/binary.py +18 -0
- nearbits-0.2.0/src/nearbit/compression/compressor.py +66 -0
- nearbits-0.2.0/src/nearbit/compression/kv_cache.py +15 -0
- nearbits-0.2.0/src/nearbit/compression/outliers.py +16 -0
- nearbits-0.2.0/src/nearbit/compression/residual.py +14 -0
- nearbits-0.2.0/src/nearbit/defaults.py +25 -0
- nearbits-0.2.0/src/nearbit/examples/__init__.py +1 -0
- nearbits-0.2.0/src/nearbit/examples/toy_transformer.py +31 -0
- nearbits-0.2.0/src/nearbit/integration/__init__.py +10 -0
- nearbits-0.2.0/src/nearbit/integration/huggingface.py +116 -0
- nearbits-0.2.0/src/nearbit/integration/hybrid_lowbit.py +411 -0
- nearbits-0.2.0/src/nearbit/packaging/__init__.py +3 -0
- nearbits-0.2.0/src/nearbit/packaging/builder.py +39 -0
- nearbits-0.2.0/src/nearbit/planning/__init__.py +3 -0
- nearbits-0.2.0/src/nearbit/planning/planner.py +82 -0
- nearbits-0.2.0/src/nearbit/runtime/__init__.py +3 -0
- nearbits-0.2.0/src/nearbit/runtime/arena.py +17 -0
- nearbits-0.2.0/src/nearbit/runtime/kernels.py +11 -0
- nearbits-0.2.0/src/nearbit/runtime/runner.py +29 -0
- nearbits-0.2.0/src/nearbit/types.py +131 -0
- nearbits-0.2.0/src/nearbits.egg-info/PKG-INFO +118 -0
- nearbits-0.2.0/src/nearbits.egg-info/SOURCES.txt +34 -0
- nearbits-0.2.0/src/nearbits.egg-info/dependency_links.txt +1 -0
- nearbits-0.2.0/src/nearbits.egg-info/requires.txt +4 -0
- nearbits-0.2.0/src/nearbits.egg-info/top_level.txt +1 -0
nearbits-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nearbits
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Hybrid low-bit compression for Hugging Face models on low-RAM devices.
|
|
5
|
+
Author: Codex
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: transformers>=4.40.0
|
|
9
|
+
Requires-Dist: accelerate>=0.28.0
|
|
10
|
+
Requires-Dist: sentencepiece>=0.1.99
|
|
11
|
+
Requires-Dist: psutil>=5.9.0
|
|
12
|
+
|
|
13
|
+
# NearBit
|
|
14
|
+
|
|
15
|
+
NearBit is a Python package for compressing Hugging Face models with a single strategy: `hybrid_lowbit`.
|
|
16
|
+
|
|
17
|
+
The package keeps high-impact parts in `int8`, compresses the bulk of linear layers to `binary` or `ternary`, rescues outlier weights in `int8`, and targets the lowest practical RAM footprint without retraining.
|
|
18
|
+
|
|
19
|
+
## What NearBit does
|
|
20
|
+
|
|
21
|
+
- Compresses `nn.Linear` layers with `binary` or `ternary` storage plus group-wise scales
|
|
22
|
+
- Keeps sensitive linear layers in `int8`
|
|
23
|
+
- Compresses `nn.Embedding` tables to `int8`
|
|
24
|
+
- Works directly on existing Hugging Face models
|
|
25
|
+
- Exposes simple chat and prompt benchmarking helpers
|
|
26
|
+
|
|
27
|
+
## Install
|
|
28
|
+
|
|
29
|
+
Install PyTorch first for your platform, then install NearBit.
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install torch
|
|
33
|
+
pip install nearbits
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
For local development from this repo:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install torch
|
|
40
|
+
pip install -e .
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Quick Start
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from nearbit import HybridLowBitConfig, chat, compress_hf_model, load_hf_model, model_report
|
|
47
|
+
|
|
48
|
+
model, tokenizer, model_kind = load_hf_model("Qwen/Qwen2.5-1.5B-Instruct")
|
|
49
|
+
print(model_report(model))
|
|
50
|
+
|
|
51
|
+
config = HybridLowBitConfig(
|
|
52
|
+
group_size=128,
|
|
53
|
+
binary_outlier_fraction=0.002,
|
|
54
|
+
ternary_outlier_fraction=0.001,
|
|
55
|
+
binary_nmse_threshold=0.045,
|
|
56
|
+
ternary_nmse_threshold=0.02,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
compressed_model = compress_hf_model(model, config)
|
|
60
|
+
print(model_report(compressed_model))
|
|
61
|
+
|
|
62
|
+
reply = chat(compressed_model, tokenizer, "Who are you?")
|
|
63
|
+
print(reply)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Google Colab Usage
|
|
67
|
+
|
|
68
|
+
Use `examples/google_collab_hybrid_lowbit.py` as the runnable Colab example.
|
|
69
|
+
|
|
70
|
+
Typical Colab cells:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
!pip install torch
|
|
74
|
+
!pip install nearbits
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from nearbit import HybridLowBitConfig, benchmark_prompts, compress_hf_model, load_hf_model, model_report
|
|
79
|
+
|
|
80
|
+
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
|
|
81
|
+
PROMPTS = [
|
|
82
|
+
"hi",
|
|
83
|
+
"hello",
|
|
84
|
+
"who are you?",
|
|
85
|
+
"what is 2 + 2?",
|
|
86
|
+
"write one sentence about India.",
|
|
87
|
+
"explain model compression in simple words.",
|
|
88
|
+
]
|
|
89
|
+
|
|
90
|
+
model, tokenizer, model_kind = load_hf_model(MODEL_ID)
|
|
91
|
+
print("Before:", model_report(model))
|
|
92
|
+
|
|
93
|
+
compressed_model = compress_hf_model(model, HybridLowBitConfig())
|
|
94
|
+
print("After:", model_report(compressed_model))
|
|
95
|
+
|
|
96
|
+
rows = benchmark_prompts(compressed_model, tokenizer, PROMPTS)
|
|
97
|
+
for row in rows:
|
|
98
|
+
print(row["prompt"])
|
|
99
|
+
print(row["response"])
|
|
100
|
+
print(row["latency_sec"])
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Public API
|
|
104
|
+
|
|
105
|
+
- `load_hf_model(model_id, trust_remote_code=False)`
|
|
106
|
+
- `compress_hf_model(model, config=None)`
|
|
107
|
+
- `model_size_mb(model)`
|
|
108
|
+
- `model_report(model)`
|
|
109
|
+
- `chat(model, tokenizer, prompt, ...)`
|
|
110
|
+
- `benchmark_prompts(model, tokenizer, prompts, ...)`
|
|
111
|
+
- `HybridLowBitConfig(...)`
|
|
112
|
+
|
|
113
|
+
## Notes
|
|
114
|
+
|
|
115
|
+
- NearBit is centered on one executable path: `hybrid_lowbit`
|
|
116
|
+
- There is no public `int8` fallback API in the package surface
|
|
117
|
+
- Quality depends on the model architecture and compression thresholds
|
|
118
|
+
- The strongest RAM reduction comes from the hybrid path, not from pure `int8`
|
nearbits-0.2.0/README.md
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# NearBit
|
|
2
|
+
|
|
3
|
+
NearBit is a Python package for compressing Hugging Face models with a single strategy: `hybrid_lowbit`.
|
|
4
|
+
|
|
5
|
+
The package keeps high-impact parts in `int8`, compresses the bulk of linear layers to `binary` or `ternary`, rescues outlier weights in `int8`, and targets the lowest practical RAM footprint without retraining.
|
|
6
|
+
|
|
7
|
+
## What NearBit does
|
|
8
|
+
|
|
9
|
+
- Compresses `nn.Linear` layers with `binary` or `ternary` storage plus group-wise scales
|
|
10
|
+
- Keeps sensitive linear layers in `int8`
|
|
11
|
+
- Compresses `nn.Embedding` tables to `int8`
|
|
12
|
+
- Works directly on existing Hugging Face models
|
|
13
|
+
- Exposes simple chat and prompt benchmarking helpers
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
Install PyTorch first for your platform, then install NearBit.
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install torch
|
|
21
|
+
pip install nearbits
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
For local development from this repo:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install torch
|
|
28
|
+
pip install -e .
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Quick Start
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from nearbit import HybridLowBitConfig, chat, compress_hf_model, load_hf_model, model_report
|
|
35
|
+
|
|
36
|
+
model, tokenizer, model_kind = load_hf_model("Qwen/Qwen2.5-1.5B-Instruct")
|
|
37
|
+
print(model_report(model))
|
|
38
|
+
|
|
39
|
+
config = HybridLowBitConfig(
|
|
40
|
+
group_size=128,
|
|
41
|
+
binary_outlier_fraction=0.002,
|
|
42
|
+
ternary_outlier_fraction=0.001,
|
|
43
|
+
binary_nmse_threshold=0.045,
|
|
44
|
+
ternary_nmse_threshold=0.02,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
compressed_model = compress_hf_model(model, config)
|
|
48
|
+
print(model_report(compressed_model))
|
|
49
|
+
|
|
50
|
+
reply = chat(compressed_model, tokenizer, "Who are you?")
|
|
51
|
+
print(reply)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Google Colab Usage
|
|
55
|
+
|
|
56
|
+
Use `examples/google_collab_hybrid_lowbit.py` as the runnable Colab example.
|
|
57
|
+
|
|
58
|
+
Typical Colab cells:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
!pip install torch
|
|
62
|
+
!pip install nearbits
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from nearbit import HybridLowBitConfig, benchmark_prompts, compress_hf_model, load_hf_model, model_report
|
|
67
|
+
|
|
68
|
+
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
|
|
69
|
+
PROMPTS = [
|
|
70
|
+
"hi",
|
|
71
|
+
"hello",
|
|
72
|
+
"who are you?",
|
|
73
|
+
"what is 2 + 2?",
|
|
74
|
+
"write one sentence about India.",
|
|
75
|
+
"explain model compression in simple words.",
|
|
76
|
+
]
|
|
77
|
+
|
|
78
|
+
model, tokenizer, model_kind = load_hf_model(MODEL_ID)
|
|
79
|
+
print("Before:", model_report(model))
|
|
80
|
+
|
|
81
|
+
compressed_model = compress_hf_model(model, HybridLowBitConfig())
|
|
82
|
+
print("After:", model_report(compressed_model))
|
|
83
|
+
|
|
84
|
+
rows = benchmark_prompts(compressed_model, tokenizer, PROMPTS)
|
|
85
|
+
for row in rows:
|
|
86
|
+
print(row["prompt"])
|
|
87
|
+
print(row["response"])
|
|
88
|
+
print(row["latency_sec"])
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Public API
|
|
92
|
+
|
|
93
|
+
- `load_hf_model(model_id, trust_remote_code=False)`
|
|
94
|
+
- `compress_hf_model(model, config=None)`
|
|
95
|
+
- `model_size_mb(model)`
|
|
96
|
+
- `model_report(model)`
|
|
97
|
+
- `chat(model, tokenizer, prompt, ...)`
|
|
98
|
+
- `benchmark_prompts(model, tokenizer, prompts, ...)`
|
|
99
|
+
- `HybridLowBitConfig(...)`
|
|
100
|
+
|
|
101
|
+
## Notes
|
|
102
|
+
|
|
103
|
+
- NearBit is centered on one executable path: `hybrid_lowbit`
|
|
104
|
+
- There is no public `int8` fallback API in the package surface
|
|
105
|
+
- Quality depends on the model architecture and compression thresholds
|
|
106
|
+
- The strongest RAM reduction comes from the hybrid path, not from pure `int8`
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "nearbits"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Hybrid low-bit compression for Hugging Face models on low-RAM devices."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Codex" }
|
|
13
|
+
]
|
|
14
|
+
dependencies = [
|
|
15
|
+
"transformers>=4.40.0",
|
|
16
|
+
"accelerate>=0.28.0",
|
|
17
|
+
"sentencepiece>=0.1.99",
|
|
18
|
+
"psutil>=5.9.0"
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[tool.setuptools]
|
|
22
|
+
package-dir = { "" = "src" }
|
|
23
|
+
|
|
24
|
+
[tool.setuptools.packages.find]
|
|
25
|
+
where = ["src"]
|
nearbits-0.2.0/setup.cfg
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .api import benchmark_prompts, chat, compress_hf_model, load_hf_model, model_report, model_size_mb
|
|
2
|
+
from .integration import HybridLowBitConfig, detect_model_dtypes, estimate_effective_bits
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"HybridLowBitConfig",
|
|
6
|
+
"benchmark_prompts",
|
|
7
|
+
"chat",
|
|
8
|
+
"compress_hf_model",
|
|
9
|
+
"detect_model_dtypes",
|
|
10
|
+
"estimate_effective_bits",
|
|
11
|
+
"load_hf_model",
|
|
12
|
+
"model_report",
|
|
13
|
+
"model_size_mb",
|
|
14
|
+
]
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from statistics import mean
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from ..types import CalibrationSample, HardwareProfile, LayerSensitivity, PrecisionKind, SensitivityReport
|
|
8
|
+
from .introspection import ModelInspector
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Analyzer:
    """Estimate per-layer compression sensitivity from static layer metadata.

    Scores are derived from each layer's parameter count, its position in the
    inspected layer list and its ``is_fragile`` / ``supports_binary`` flags.
    The model is never executed here; the calibration set only contributes
    through the number of samples it contains.
    """

    def __init__(self, hardware_profile: HardwareProfile) -> None:
        """Remember the target hardware profile and create a model inspector."""
        self.hardware_profile = hardware_profile
        self.inspector = ModelInspector()

    def analyze(self, model: Any, calibration_set: Iterable[CalibrationSample]) -> SensitivityReport:
        """Build a sensitivity report for every layer the inspector finds.

        Args:
            model: Any object accepted by ``ModelInspector.inspect_layers``.
            calibration_set: Calibration samples; consumed once into a list.
                Only their count is used by the heuristics below.

        Returns:
            A ``SensitivityReport`` with one ``LayerSensitivity`` per layer,
            a baseline metric in ``[0.0, 1.0]`` and metadata describing the
            sample count and the hardware target.
        """
        samples = list(calibration_set)
        model_name, model_type = self.inspector.identify_model(model)
        layers = self.inspector.inspect_layers(model)
        # Avoid a zero divisor when no calibration samples were supplied.
        sample_factor = max(len(samples), 1)
        last_index = len(layers) - 1

        sensitivities: list[LayerSensitivity] = []
        for index, layer in enumerate(layers):
            # Size contribution is measured in millions of parameters, capped at 4.
            size_factor = min(layer.parameter_count / 1_000_000, 4.0)
            # The first and the last layer receive an extra penalty.
            position_penalty = 0.8 if index in (0, last_index) else 0.0
            fragile_penalty = 1.5 if layer.is_fragile else 0.0
            # Layers that support binary storage lower the score; others raise it.
            binary_bonus = -0.3 if layer.supports_binary else 0.8
            score = round(0.4 + size_factor + position_penalty + fragile_penalty + binary_bonus, 4)

            # The activation range is synthetic: it depends on the layer index only.
            min_act = round(-1.0 - 0.02 * index, 4)
            max_act = round(1.0 + 0.03 * index, 4)
            # Outlier fraction is capped at 1%.
            outlier_fraction = round(min(0.01, 0.001 + size_factor / 100 + index / (10_000 * sample_factor)), 6)
            estimated_binary_error = round(score * 0.018 + outlier_fraction * 10, 6)

            recommended = self._recommended_precision(layer.is_fragile, layer.supports_binary, estimated_binary_error)
            sensitivities.append(
                LayerSensitivity(
                    name=layer.name,
                    op_type=layer.op_type,
                    sensitivity_score=score,
                    activation_range=(min_act, max_act),
                    outlier_fraction=outlier_fraction,
                    estimated_binary_error=estimated_binary_error,
                    is_fragile=layer.is_fragile,
                    supports_binary=layer.supports_binary,
                    recommended_precision=recommended,
                )
            )

        # statistics.mean() raises StatisticsError on empty input. A model with
        # no inspectable layers has no estimated error, so treat the mean as 0.
        if sensitivities:
            mean_error = mean(layer.estimated_binary_error for layer in sensitivities)
        else:
            mean_error = 0.0
        baseline_metric = round(max(0.0, 1.0 - mean_error / 2), 6)
        return SensitivityReport(
            model_name=model_name,
            model_type=model_type,
            layers=sensitivities,
            baseline_metric=baseline_metric,
            metadata={
                "calibration_samples": len(samples),
                "hardware_target": self.hardware_profile.device_name,
                "accelerator": self.hardware_profile.accelerator,
            },
        )

    def _recommended_precision(
        self,
        is_fragile: bool,
        supports_binary: bool,
        estimated_binary_error: float,
    ) -> PrecisionKind:
        """Map a layer's flags and estimated binary error to a precision.

        Fragile layers always get INT8. Layers without binary support get INT4
        below an error of 0.08 and INT8 otherwise. Binary-capable layers step
        through BINARY (< 0.03), BINARY_OUTLIERS (< 0.06), BINARY_LOW_RANK
        (< 0.09) and finally INT4.
        """
        if is_fragile:
            return PrecisionKind.INT8
        if not supports_binary:
            return PrecisionKind.INT4 if estimated_binary_error < 0.08 else PrecisionKind.INT8
        if estimated_binary_error < 0.03:
            return PrecisionKind.BINARY
        if estimated_binary_error < 0.06:
            return PrecisionKind.BINARY_OUTLIERS
        if estimated_binary_error < 0.09:
            return PrecisionKind.BINARY_LOW_RANK
        return PrecisionKind.INT4
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Iterable
|
|
4
|
+
|
|
5
|
+
from ..types import LayerProfile
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ModelInspector:
    """Turn several supported model representations into ``LayerProfile`` lists."""

    # Substrings that mark a layer (by name or op type) as fragile.
    FRAGILE_OPS = {"embedding", "norm", "layernorm", "rmsnorm", "head", "lm_head"}
    # Substrings of op types that are considered eligible for binary storage.
    SAFE_BINARY_OPS = {"linear", "conv1d", "conv2d", "matmul"}

    def inspect_layers(self, model: Any) -> list[LayerProfile]:
        """Return layer profiles for ``model``.

        Sources are tried in order: a ``nearbit_layers()`` method, a
        ``named_modules()`` method, then a dict with a ``"layers"`` entry.

        Raises:
            TypeError: If ``model`` offers none of the supported sources.
        """
        if hasattr(model, "nearbit_layers"):
            profiles = list(model.nearbit_layers())
        elif hasattr(model, "named_modules"):
            profiles = self._from_named_modules(model.named_modules())
        elif isinstance(model, dict) and "layers" in model:
            profiles = self._from_mapping(model["layers"])
        else:
            raise TypeError("Unsupported model format. Provide nearbit_layers(), named_modules(), or a layer mapping.")
        return profiles

    def identify_model(self, model: Any) -> tuple[str, str]:
        """Return ``(name, model_type)``, defaulting to the class name and ``"transformer"``."""
        return (
            getattr(model, "name", model.__class__.__name__),
            getattr(model, "model_type", "transformer"),
        )

    def _from_named_modules(self, modules: Iterable[tuple[str, Any]]) -> list[LayerProfile]:
        """Profile every module that has a non-empty name."""
        profiles: list[LayerProfile] = []
        for module_name, module in modules:
            if module_name:
                kind = module.__class__.__name__.lower()
                # Prefer an explicit ``parameter_count``; otherwise count parameters.
                declared = getattr(module, "parameter_count", 0)
                count = int(declared or self._safe_numel(module))
                fragile = self._is_fragile(module_name, kind)
                binary_ok = self._supports_binary(kind) and not fragile
                profile = LayerProfile(
                    name=module_name,
                    op_type=kind,
                    parameter_count=count,
                    is_fragile=fragile,
                    supports_binary=binary_ok,
                )
                profiles.append(profile)
        return profiles

    def _from_mapping(self, layers_map: Iterable[dict[str, Any]]) -> list[LayerProfile]:
        """Profile layers described by dicts; explicit flags override the heuristics."""
        profiles: list[LayerProfile] = []
        for entry in layers_map:
            layer_name = entry["name"]
            kind = str(entry.get("op_type", "linear")).lower()
            inferred_fragile = self._is_fragile(layer_name, kind)
            fragile = bool(entry.get("is_fragile", inferred_fragile))
            inferred_binary = self._supports_binary(kind) and not fragile
            binary_ok = bool(entry.get("supports_binary", inferred_binary))
            count = int(entry.get("parameter_count", 0))
            shape = tuple(entry.get("output_shape", ()))
            profiles.append(
                LayerProfile(
                    name=layer_name,
                    op_type=kind,
                    parameter_count=count,
                    output_shape=shape,
                    is_fragile=fragile,
                    supports_binary=binary_ok,
                )
            )
        return profiles

    def _is_fragile(self, name: str, op_type: str) -> bool:
        """Return True when any fragile token occurs in the lowercased name or in ``op_type``."""
        lowered = name.lower()
        for token in self.FRAGILE_OPS:
            if token in lowered or token in op_type:
                return True
        return False

    def _supports_binary(self, op_type: str) -> bool:
        """Return True when ``op_type`` contains any binary-safe token."""
        for token in self.SAFE_BINARY_OPS:
            if token in op_type:
                return True
        return False

    def _safe_numel(self, module: Any) -> int:
        """Sum ``numel()`` over ``module.parameters()``, treating anything missing as 0."""
        list_parameters = getattr(module, "parameters", None)
        if list_parameters is None:
            return 0
        return sum(int(getattr(param, "numel", lambda: 0)()) for param in list_parameters())
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import torch
|
|
7
|
+
|
|
8
|
+
from .integration import (
|
|
9
|
+
HybridLowBitConfig,
|
|
10
|
+
convert_model_to_hybrid_lowbit,
|
|
11
|
+
detect_model_dtypes,
|
|
12
|
+
estimate_effective_bits,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def load_hf_model(model_id: str, trust_remote_code: bool = False):
    """Load a Hugging Face model and tokenizer, trying causal LM then seq2seq.

    Args:
        model_id: Identifier passed to the ``from_pretrained`` loaders.
        trust_remote_code: Forwarded unchanged to the tokenizer and model loaders.

    Returns:
        A ``(model, tokenizer, model_kind)`` tuple where ``model_kind`` is
        ``"causal"`` or ``"seq2seq"`` and the model has been put in eval mode.

    Raises:
        RuntimeError: If neither loader succeeds. The message lists each
            loader's error and the last underlying exception is chained as
            ``__cause__`` so its traceback is not lost.
    """
    # Imported lazily so importing this module does not require transformers.
    from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token

    errors: list[str] = []
    last_error: Exception | None = None
    for model_kind, loader in (("causal", AutoModelForCausalLM), ("seq2seq", AutoModelForSeq2SeqLM)):
        try:
            model = loader.from_pretrained(
                model_id,
                trust_remote_code=trust_remote_code,
                low_cpu_mem_usage=True,
            )
            model.eval()
            return model, tokenizer, model_kind
        except Exception as exc:
            # Deliberately broad: any failure falls through to the next loader.
            errors.append(f"{model_kind}: {exc}")
            last_error = exc
    raise RuntimeError(f"Could not load model '{model_id}'. Details: {' | '.join(errors)}") from last_error
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def compress_hf_model(model: Any, config: HybridLowBitConfig | None = None):
    """Convert ``model`` with ``convert_model_to_hybrid_lowbit``.

    A default ``HybridLowBitConfig()`` is used when ``config`` is omitted
    (or falsy). The converter's return value is passed through unchanged.
    """
    effective_config = config if config else HybridLowBitConfig()
    return convert_model_to_hybrid_lowbit(model, effective_config)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def model_size_mb(model: Any) -> float:
    """Return the size of ``model.state_dict()`` in MiB, rounded to 4 decimals.

    Entries lacking ``element_size`` or ``nelement`` are ignored.
    """
    byte_count = sum(
        entry.element_size() * entry.nelement()
        for entry in model.state_dict().values()
        if hasattr(entry, "element_size") and hasattr(entry, "nelement")
    )
    bytes_per_mib = 1024 ** 2
    return round(byte_count / bytes_per_mib, 4)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def model_report(model: Any) -> dict[str, Any]:
    """Collect size, effective bits, dtypes and the low-bit summary of ``model``.

    ``lowbit_summary`` is a copy of ``model.nearbit_lowbit_summary`` and is an
    empty dict when the model has no such attribute.
    """
    report: dict[str, Any] = {}
    report["model_size_mb"] = model_size_mb(model)
    report["effective_bits"] = estimate_effective_bits(model)
    report["dtypes"] = detect_model_dtypes(model)
    report["lowbit_summary"] = dict(getattr(model, "nearbit_lowbit_summary", {}))
    return report
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def chat(
    model: Any,
    tokenizer: Any,
    prompt: str,
    *,
    system_prompt: str | None = "You are a concise, helpful assistant.",
    max_new_tokens: int = 128,
    do_sample: bool = False,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> str:
    """Generate one reply to ``prompt`` and return it as stripped text.

    ``temperature`` and ``top_p`` are only placed in the generation config
    when ``do_sample`` is true. For models whose config does not report
    ``is_encoder_decoder``, the prompt tokens are cut off the front of the
    generated sequence before decoding.
    """
    from transformers import GenerationConfig

    encoded = _build_inputs(tokenizer, prompt, system_prompt)

    options = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if do_sample:
        options.update(temperature=temperature, top_p=top_p)
    generation_config = GenerationConfig(**options)

    with torch.inference_mode():
        output = model.generate(**encoded, generation_config=generation_config)

    model_config = getattr(model, "config", None)
    if getattr(model_config, "is_encoder_decoder", False):
        # Encoder-decoder output holds only the generated sequence.
        reply_ids = output[0]
    else:
        # Decoder-only output starts with the prompt; drop it.
        prompt_length = encoded["input_ids"].shape[-1]
        reply_ids = output[0][prompt_length:]
    decoded = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return decoded.strip()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def benchmark_prompts(
    model: Any,
    tokenizer: Any,
    prompts: list[str],
    *,
    system_prompt: str | None = "You are a concise, helpful assistant.",
    max_new_tokens: int = 64,
    do_sample: bool = False,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> list[dict[str, Any]]:
    """Run ``chat`` on each prompt and record its latency and throughput.

    Returns one dict per prompt with the keys ``prompt``, ``response``,
    ``latency_sec``, ``generated_tokens`` and ``tokens_per_sec``. The token
    count comes from re-encoding the decoded response, and
    ``tokens_per_sec`` is ``None`` when the measured time is not positive.
    """
    chat_options = dict(
        system_prompt=system_prompt,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
    )
    results: list[dict[str, Any]] = []
    for text in prompts:
        t_start = time.perf_counter()
        reply = chat(model, tokenizer, text, **chat_options)
        duration = time.perf_counter() - t_start

        n_tokens = len(tokenizer.encode(reply, add_special_tokens=False))
        throughput = None
        if duration > 0:
            throughput = round(n_tokens / duration, 4)

        results.append(
            {
                "prompt": text,
                "response": reply,
                "latency_sec": round(duration, 4),
                "generated_tokens": n_tokens,
                "tokens_per_sec": throughput,
            }
        )
    return results
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _build_inputs(tokenizer: Any, prompt: str, system_prompt: str | None = None):
    """Tokenize ``prompt`` to ``"pt"`` tensors, using the chat template if one exists.

    Without a chat template the raw prompt is tokenized and ``system_prompt``
    is ignored. With one, an optional system turn and the user turn are
    rendered with a generation prompt appended, then tokenized.
    """
    if not getattr(tokenizer, "chat_template", None):
        return tokenizer(prompt, return_tensors="pt")

    system_turn = [{"role": "system", "content": system_prompt}] if system_prompt else []
    conversation = [*system_turn, {"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    return tokenizer(text, return_tensors="pt")
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from ..types import PrecisionKind
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def activation_policy(model_type: str) -> dict[str, str]:
    """Return the default and fallback activation precisions for ``model_type``.

    The default is always INT8. The fallback is INT4 for ``"transformer"``
    and INT8 for every other model type.
    """
    fallback = PrecisionKind.INT4 if model_type == "transformer" else PrecisionKind.INT8
    return {
        "default_precision": PrecisionKind.INT8.value,
        "fallback_precision": fallback.value,
    }
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Iterable
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def pack_signs(weights: Iterable[float], group_size: int) -> dict[str, object]:
    """Reduce weights to sign flags plus one mean-absolute scale per group.

    Args:
        weights: Weight values; consumed once into a list.
        group_size: Number of consecutive weights that share a scale. The
            final group may be shorter.

    Returns:
        A dict with ``"bitpack"`` (a list with one ``1``/``0`` per weight,
        ``1`` meaning ``>= 0``), ``"group_size"`` and ``"scales"`` (the mean
        absolute value of each group).

    Raises:
        ValueError: If ``group_size`` is not positive. A negative size used to
            produce an empty ``scales`` list silently, and zero failed inside
            ``range()`` with an unrelated message.
    """
    if group_size <= 0:
        raise ValueError(f"group_size must be a positive integer, got {group_size!r}")

    values = list(weights)
    sign_flags = [1 if value >= 0 else 0 for value in values]

    scales = []
    for start in range(0, len(values), group_size):
        group = values[start : start + group_size]
        # ``group`` is never empty here because ``start < len(values)``.
        scales.append(sum(abs(value) for value in group) / len(group))

    return {
        "bitpack": sign_flags,
        "group_size": group_size,
        "scales": scales,
    }
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from ..packaging.builder import ArtifactBuilder
|
|
6
|
+
from ..types import CompressionArtifact, CompressionPlan, PackedLayer, PrecisionKind
|
|
7
|
+
from .binary import pack_signs
|
|
8
|
+
from .kv_cache import kv_cache_policy
|
|
9
|
+
from .outliers import extract_outliers
|
|
10
|
+
from .residual import build_low_rank_residual
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Compressor:
    """Apply a ``CompressionPlan`` to a model's weights and build the artifact."""

    def compress(self, model: Any, compression_plan: CompressionPlan) -> CompressionArtifact:
        """Pack every planned layer and hand the result to ``ArtifactBuilder``.

        Layers named in the plan but absent from the model's weights are
        packed from the placeholder ``[0.0]``.
        """
        weights_by_layer = self._read_weights(model)

        packed_layers: list[PackedLayer] = []
        for plan in compression_plan.layers:
            layer_weights = weights_by_layer.get(plan.name, [0.0])
            payload = self._compress_layer(
                layer_weights,
                plan.precision,
                plan.group_size,
                plan.outlier_fraction,
                plan.residual_rank,
            )
            packed = PackedLayer(name=plan.name, precision=plan.precision, payload=payload)
            packed_layers.append(packed)

        builder = ArtifactBuilder()
        return builder.build(
            compression_plan=compression_plan,
            packed_layers=packed_layers,
            kv_policy=kv_cache_policy(compression_plan.model_type),
        )

    def _compress_layer(
        self,
        weights: list[float],
        precision: PrecisionKind,
        group_size: int,
        outlier_fraction: float,
        residual_rank: int,
    ) -> dict[str, object]:
        """Build the payload for one layer according to ``precision``.

        BINARY returns the sign pack directly. BINARY_OUTLIERS adds extracted
        outliers, and BINARY_LOW_RANK additionally adds a low-rank residual.
        Every other precision stores the weights as-is with a precision tag.
        """
        if precision == PrecisionKind.BINARY:
            return pack_signs(weights, group_size)

        if precision in (PrecisionKind.BINARY_OUTLIERS, PrecisionKind.BINARY_LOW_RANK):
            payload: dict[str, object] = {
                "binary": pack_signs(weights, group_size),
                "outliers": extract_outliers(weights, outlier_fraction),
            }
            if precision == PrecisionKind.BINARY_LOW_RANK:
                payload["residual"] = build_low_rank_residual(weights, residual_rank)
            return payload

        return {
            "values": weights,
            "stored_precision": precision.value,
        }

    def _read_weights(self, model: Any) -> dict[str, list[float]]:
        """Return weights keyed by layer name, or ``{}`` if the model exposes none.

        A ``nearbit_weights()`` method takes precedence over a dict with a
        ``"weights"`` entry; dict keys are stringified and values copied to lists.
        """
        if hasattr(model, "nearbit_weights"):
            return dict(model.nearbit_weights())
        if not (isinstance(model, dict) and "weights" in model):
            return {}
        return {str(layer_name): list(layer_values) for layer_name, layer_values in model["weights"].items()}
|