compressed-tensors-nightly 0.3.3.20240514__py3-none-any.whl → 0.3.3.20240517__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- compressed_tensors/compressors/__init__.py +1 -0
- compressed_tensors/compressors/base.py +1 -0
- compressed_tensors/compressors/int_quantized.py +27 -7
- compressed_tensors/compressors/pack_quantized.py +198 -0
- compressed_tensors/config/base.py +1 -0
- compressed_tensors/quantization/lifecycle/apply.py +5 -2
- compressed_tensors/quantization/lifecycle/forward.py +4 -1
- compressed_tensors/quantization/utils/helpers.py +58 -5
- {compressed_tensors_nightly-0.3.3.20240514.dist-info → compressed_tensors_nightly-0.3.3.20240517.dist-info}/METADATA +42 -1
- {compressed_tensors_nightly-0.3.3.20240514.dist-info → compressed_tensors_nightly-0.3.3.20240517.dist-info}/RECORD +13 -12
- {compressed_tensors_nightly-0.3.3.20240514.dist-info → compressed_tensors_nightly-0.3.3.20240517.dist-info}/LICENSE +0 -0
- {compressed_tensors_nightly-0.3.3.20240514.dist-info → compressed_tensors_nightly-0.3.3.20240517.dist-info}/WHEEL +0 -0
- {compressed_tensors_nightly-0.3.3.20240514.dist-info → compressed_tensors_nightly-0.3.3.20240517.dist-info}/top_level.txt +0 -0
compressed_tensors/compressors/__init__.py
@@ -19,4 +19,5 @@ from .dense import DenseCompressor
 from .helpers import load_compressed, save_compressed, save_compressed_model
 from .int_quantized import IntQuantizationCompressor
 from .model_compressor import ModelCompressor
+from .pack_quantized import PackedQuantizationCompressor
 from .sparse_bitmask import BitmaskCompressor, BitmaskTensor

compressed_tensors/compressors/base.py
@@ -54,6 +54,7 @@ class Compressor(RegistryMixin):
 
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file
+        :param device: optional device to load intermediate weights into
         :return: compressed state dict
         """
         raise NotImplementedError()

compressed_tensors/compressors/int_quantized.py
@@ -18,7 +18,9 @@ from typing import Dict, Generator, Tuple
 import torch
 from compressed_tensors.compressors import Compressor
 from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import QuantizationArgs
 from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
+from compressed_tensors.quantization.utils import can_quantize
 from compressed_tensors.utils import get_nested_weight_mappings, merge_names
 from safetensors import safe_open
 from torch import Tensor
@@ -40,8 +42,20 @@ class IntQuantizationCompressor(Compressor):
 
     COMPRESSION_PARAM_NAMES = ["weight", "weight_scale", "weight_zero_point"]
 
-    def compress(
-
+    def compress(
+        self,
+        model_state: Dict[str, Tensor],
+        model_quant_args: Dict[str, QuantizationArgs],
+        **kwargs,
+    ) -> Dict[str, Tensor]:
+        """
+        Compresses a dense state dict
+
+        :param model_state: state dict of uncompressed model
+        :param model_quant_args: quantization args for each quantized weight, needed for
+            quantize function to calculate bit depth
+        :return: compressed state dict
+        """
         compressed_dict = {}
         _LOGGER.debug(
             f"Compressing model with {len(model_state)} parameterized layers..."
@@ -55,11 +69,7 @@ class IntQuantizationCompressor(Compressor):
                 if scale is not None and zp is not None:
                     # weight is quantized, compress it
                     quant_args = model_quant_args[prefix]
-                    try:
-                        bit_depth = torch.finfo(value.dtype).bits
-                    except TypeError:
-                        bit_depth = torch.iinfo(value.dtype).bits
-                    if bit_depth > quant_args.num_bits:
+                    if can_quantize(value, quant_args):
                         # only quantize if not already quantized
                         value = quantize(
                             x=value,
@@ -76,6 +86,16 @@ class IntQuantizationCompressor(Compressor):
     def decompress(
         self, path_to_model_or_tensors: str, device: str = "cpu"
     ) -> Generator[Tuple[str, Tensor], None, None]:
+        """
+        Reads a compressed state dict located at path_to_model_or_tensors
+        and returns a generator for sequentially decompressing back to a
+        dense state dict
+
+        :param model_path: path to compressed safetensors model (directory with
+            one or more safetensors files) or compressed tensors file
+        :param device: optional device to load intermediate weights into
+        :return: compressed state dict
+        """
         weight_mappings = get_nested_weight_mappings(
             path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES
         )

compressed_tensors/compressors/pack_quantized.py (new file)
@@ -0,0 +1,198 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import math
+from typing import Dict, Generator, Tuple
+
+import numpy as np
+import torch
+from compressed_tensors.compressors import Compressor
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
+from compressed_tensors.quantization.utils import can_quantize
+from compressed_tensors.utils import get_nested_weight_mappings, merge_names
+from safetensors import safe_open
+from torch import Tensor
+from tqdm import tqdm
+
+
+__all__ = ["PackedQuantizationCompressor", "pack_4bit_ints", "unpack_4bit_ints"]
+
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
+@Compressor.register(name=CompressionFormat.pack_quantized.value)
+class PackedQuantizationCompressor(Compressor):
+    """
+    Compresses a quantized model by packing every eight 4-bit weights into an int32
+    """
+
+    COMPRESSION_PARAM_NAMES = [
+        "weight",
+        "weight_scale",
+        "weight_zero_point",
+        "weight_shape",
+    ]
+
+    def compress(
+        self,
+        model_state: Dict[str, Tensor],
+        model_quant_args: Dict[str, QuantizationArgs],
+        **kwargs,
+    ) -> Dict[str, Tensor]:
+        """
+        Compresses a dense state dict
+
+        :param model_state: state dict of uncompressed model
+        :param model_quant_args: quantization args for each quantized weight, needed for
+            quantize function to calculate bit depth
+        :return: compressed state dict
+        """
+        compressed_dict = {}
+        _LOGGER.debug(
+            f"Compressing model with {len(model_state)} parameterized layers..."
+        )
+
+        for name, value in tqdm(model_state.items(), desc="Compressing model"):
+            if name.endswith(".weight"):
+                prefix = name.removesuffix(".weight")
+                scale = model_state.get(merge_names(prefix, "weight_scale"), None)
+                zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
+                shape = torch.tensor(value.shape)
+                if scale is not None and zp is not None:
+                    # weight is quantized, compress it
+                    # weight is quantized, compress it
+                    quant_args = model_quant_args[prefix]
+                    if can_quantize(value, quant_args):
+                        # convert weight to an int if not already compressed
+                        value = quantize(
+                            x=value,
+                            scale=scale,
+                            zero_point=zp,
+                            args=quant_args,
+                            dtype=torch.int8,
+                        )
+                    value = pack_4bit_ints(value.cpu())
+                    compressed_dict[merge_names(prefix, "weight_shape")] = shape
+
+            compressed_dict[name] = value.to("cpu")
+
+        return compressed_dict
+
+    def decompress(
+        self, path_to_model_or_tensors: str, device: str = "cpu"
+    ) -> Generator[Tuple[str, Tensor], None, None]:
+        """
+        Reads a compressed state dict located at path_to_model_or_tensors
+        and returns a generator for sequentially decompressing back to a
+        dense state dict
+
+        :param model_path: path to compressed safetensors model (directory with
+            one or more safetensors files) or compressed tensors file
+        :param device: optional device to load intermediate weights into
+        :return: compressed state dict
+        """
+        weight_mappings = get_nested_weight_mappings(
+            path_to_model_or_tensors, self.COMPRESSION_PARAM_NAMES
+        )
+        for weight_name in weight_mappings.keys():
+            weight_data = {}
+            for param_name, safe_path in weight_mappings[weight_name].items():
+                full_name = merge_names(weight_name, param_name)
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    weight_data[param_name] = f.get_tensor(full_name)
+
+            if len(weight_data) == len(self.COMPRESSION_PARAM_NAMES):
+                weight = weight_data["weight"]
+                original_shape = torch.Size(weight_data["weight_shape"])
+                unpacked = unpack_4bit_ints(weight, original_shape)
+                decompressed = dequantize(
+                    x_q=unpacked,
+                    scale=weight_data["weight_scale"],
+                    zero_point=weight_data["weight_zero_point"],
+                )
+                yield merge_names(weight_name, "weight"), decompressed
+
+
+def pack_4bit_ints(value: torch.Tensor) -> torch.Tensor:
+    """
+    Packs a tensor of int4 weights stored in int8 into int32s with padding
+
+    :param value: tensor to pack
+    :returns: packed int32 tensor
+    """
+    if value.dtype is not torch.int8:
+        raise ValueError("Tensor must be quantized to torch.int8 before packing")
+
+    # need to convert to unsigned 8bit to use numpy's pack/unpack
+    temp = (value - 8).to(torch.uint8)
+    bits = np.unpackbits(temp.numpy(), axis=-1, bitorder="little")
+    ranges = np.array([range(x, x + 4) for x in range(0, bits.shape[1], 8)]).flatten()
+    only_4_bits = bits[:, ranges]  # top 4 bits are 0 because we're really uint4
+
+    # pad each row to fill a full 32bit int
+    pack_depth = 32
+    padding = (
+        math.ceil(only_4_bits.shape[1] / pack_depth) * pack_depth - only_4_bits.shape[1]
+    )
+    padded_bits = np.pad(
+        only_4_bits, pad_width=[(0, 0), (0, padding)], constant_values=0
+    )
+
+    # after packbits each uint8 is two packed uint4s
+    # then we keep the bit pattern the same but convert to int32
+    compressed = np.packbits(padded_bits, axis=-1, bitorder="little")
+    compressed = np.ascontiguousarray(compressed).view(np.int32)
+
+    return torch.from_numpy(compressed)
+
+
+def unpack_4bit_ints(value: torch.Tensor, shape: torch.Size) -> torch.Tensor:
+    """
+    Unpacks a tensor packed int4 weights into individual int8s, maintaining the
+    original their int4 range
+
+    :param value: tensor to upack
+    :param shape: shape to unpack into, used to remove padding
+    :returns: unpacked int8 tensor
+    """
+    if value.dtype is not torch.int32:
+        raise ValueError(
+            f"Expected {torch.int32} but got {value.dtype}, Aborting unpack."
+        )
+
+    # unpack bits and undo padding to nearest int32 bits
+    individual_depth = 4
+    as_uint8 = value.numpy().view(np.uint8)
+    bits = np.unpackbits(as_uint8, axis=-1, bitorder="little")
+    original_row_size = int(shape[1] * individual_depth)
+    bits = bits[:, :original_row_size]
+
+    # reformat each packed uint4 to a uint8 by filling to top 4 bits with zeros
+    # (uint8 format is required by np.packbits)
+    shape_8bit = (bits.shape[0], bits.shape[1] * 2)
+    bits_as_8bit = np.zeros(shape_8bit, dtype=np.uint8)
+    ranges = np.array([range(x, x + 4) for x in range(0, shape_8bit[1], 8)]).flatten()
+    bits_as_8bit[:, ranges] = bits
+
+    # repack the bits to uint8
+    repacked = np.packbits(bits_as_8bit, axis=-1, bitorder="little")
+
+    # bits are packed in unsigned format, reformat to signed
+    # update the value range from uint4 to int4
+    final = repacked.astype(np.int8) - 8
+
+    return torch.from_numpy(final)
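
The two module-level helpers in this new file do the actual bit packing: `pack_4bit_ints` maps int4 values stored in int8 (range [-8, 7]) onto offset-binary nibbles and packs eight of them into each int32, padding rows up to a 32-bit boundary, while `unpack_4bit_ints` reverses the process, using the saved `weight_shape` to drop the padding. A minimal round-trip sketch, assuming the 0.3.3.20240517 wheel is installed (the tensor shape here is arbitrary):

```python
import torch

from compressed_tensors.compressors.pack_quantized import pack_4bit_ints, unpack_4bit_ints

# fake int4 weights already stored as int8, restricted to the int4 range [-8, 7]
weights = torch.randint(low=-8, high=8, size=(64, 128), dtype=torch.int8)

packed = pack_4bit_ints(weights)                    # (64, 16) tensor, dtype torch.int32
restored = unpack_4bit_ints(packed, weights.shape)  # back to (64, 128) int8

assert packed.dtype == torch.int32
assert torch.equal(restored, weights)               # lossless round trip
```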

compressed_tensors/quantization/lifecycle/apply.py
@@ -30,7 +30,10 @@ from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,
 )
-from compressed_tensors.quantization.utils import iter_named_leaf_modules
+from compressed_tensors.quantization.utils import (
+    infer_quantization_status,
+    iter_named_leaf_modules,
+)
 from compressed_tensors.utils.safetensors_load import get_safetensors_folder
 from torch.nn import Module
 
@@ -121,7 +124,7 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
     :param model: model to apply quantization to
     :param status: status to update the module to
     """
-    current_status =
+    current_status = infer_quantization_status(model)
 
     if status >= QuantizationStatus.INITIALIZED > current_status:
         model.apply(initialize_module_for_quantization)

compressed_tensors/quantization/lifecycle/forward.py
@@ -229,7 +229,10 @@ def _process_quantization(
 def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
     # expects a module already initialized and injected with the parameters in
     # initialize_module_for_quantization
-    forward_func_orig = module.forward.__func__
+    if hasattr(module.forward, "__func__"):
+        forward_func_orig = module.forward.__func__
+    else:
+        forward_func_orig = module.forward.func
 
     @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
     def wrapped_forward(self, *args, **kwargs):
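
The new `hasattr` branch distinguishes an ordinary bound method, whose underlying function is reachable via `__func__`, from a `functools.partial`-style callable, which exposes it via `.func`, so the wrapper no longer fails when `module.forward` has already been swapped out. A small sketch of the two cases (the partial assignment below only simulates a forward that was previously replaced):

```python
import functools

import torch

module = torch.nn.Linear(4, 4)
print(hasattr(module.forward, "__func__"))              # True: plain bound method

# simulate a forward that was previously replaced with a partial
module.forward = functools.partial(torch.nn.Linear.forward, module)
print(hasattr(module.forward, "__func__"))              # False
print(module.forward.func is torch.nn.Linear.forward)   # True: recovered via .func
```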

compressed_tensors/quantization/utils/helpers.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Tuple
+import logging
+from typing import Optional, Tuple
 
 import torch
 from compressed_tensors.quantization.observers.base import Observer
@@ -21,13 +22,33 @@ from tqdm import tqdm
 
 
 __all__ = [
+    "infer_quantization_status",
     "is_module_quantized",
     "is_model_quantized",
     "iter_named_leaf_modules",
     "module_type",
     "calculate_compression_ratio",
+    "get_torch_bit_depth",
+    "can_quantize",
 ]
 
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
+def infer_quantization_status(model: Module) -> Optional["QuantizationStatus"]:  # noqa
+    """
+    Checks the quantization status of a model. Assumes all modules in the model have
+    the same status, so only the first quantized model is checked.
+
+    :param model: model to check quantization status for
+    :return: quantization status if the model is quantized, otherwise None
+    """
+    for module in model.modules():
+        status = getattr(module, "quantization_status", None)
+        if status is not None:
+            return status
+    return None
+
 
 def is_module_quantized(module: Module) -> bool:
     """
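
`infer_quantization_status` simply walks `model.modules()` and returns the first `quantization_status` attribute it encounters, so an unquantized model yields `None`. A quick sketch of both outcomes, assuming the package is installed (the manual attribute assignment stands in for what the quantization lifecycle would normally set):

```python
import torch

from compressed_tensors.quantization import QuantizationStatus
from compressed_tensors.quantization.utils import infer_quantization_status

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8))
print(infer_quantization_status(model))  # None: no module carries a status yet

# manually tag one leaf module with a status
model[0].quantization_status = QuantizationStatus.INITIALIZED
print(infer_quantization_status(model))  # QuantizationStatus.INITIALIZED
```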
@@ -100,6 +121,41 @@ def iter_named_leaf_modules(model: Module) -> Tuple[str, Module]:
             yield name, submodule
 
 
+def get_torch_bit_depth(value: torch.Tensor) -> int:
+    """
+    Determine the number of bits used to represent the dtype of a tensor
+
+    :param value: tensor to check bit depth of
+    :return: bit depth of each element in the value tensor
+    """
+    try:
+        bit_depth = torch.finfo(value.dtype).bits
+    except TypeError:
+        bit_depth = torch.iinfo(value.dtype).bits
+
+    return bit_depth
+
+
+def can_quantize(value: torch.Tensor, quant_args: "QuantizationArgs") -> bool:  # noqa
+    """
+    Checks if value can be quantized by quant_args.
+
+    :param value: tensor to check for quantization
+    :param quant_args: QuantizationArgs to use for quantization
+    :return: False if value is already quantized to quant_args or value is incompatible
+        with quant_args, True if value can be quantized with quant_args
+    """
+    bit_depth = get_torch_bit_depth(value)
+    requested_depth = quant_args.num_bits
+    if bit_depth < quant_args.num_bits:
+        _LOGGER.warn(
+            f"Can't quantize tensor with bit depth {bit_depth} to {requested_depth}."
+            "The QuantizationArgs provided are not compatible with the input tensor."
+        )
+
+    return bit_depth > quant_args.num_bits
+
+
 def calculate_compression_ratio(model: Module) -> float:
     """
     Calculates the quantization compression ratio of a pytorch model, based on the
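
`get_torch_bit_depth` tries `torch.finfo` first and falls back to `torch.iinfo`, so it covers both floating-point and integer dtypes, and `can_quantize` returns True only when the tensor's current bit depth is strictly greater than the requested `num_bits`. A small sketch of how the compressors use this to skip weights that are already quantized, assuming the package is installed:

```python
import torch

from compressed_tensors.quantization import QuantizationArgs
from compressed_tensors.quantization.utils import can_quantize, get_torch_bit_depth

args = QuantizationArgs(num_bits=8)  # 8-bit integer quantization args

fp16_weight = torch.zeros(4, 4, dtype=torch.float16)
int8_weight = torch.zeros(4, 4, dtype=torch.int8)

print(get_torch_bit_depth(fp16_weight))  # 16 (via torch.finfo)
print(get_torch_bit_depth(int8_weight))  # 8  (via torch.iinfo)
print(can_quantize(fp16_weight, args))   # True:  16 > 8, not yet quantized
print(can_quantize(int8_weight, args))   # False: already at 8 bits
```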
@@ -116,10 +172,7 @@ def calculate_compression_ratio(model: Module) -> float:
         desc="Calculating quantization compression ratio",
     ):
         for parameter in model.parameters():
-            try:
-                uncompressed_bits = torch.finfo(parameter.dtype).bits
-            except TypeError:
-                uncompressed_bits = torch.iinfo(parameter.dtype).bits
+            uncompressed_bits = get_torch_bit_depth(parameter)
             compressed_bits = uncompressed_bits
             if is_module_quantized(submodule):
                 compressed_bits = submodule.quantization_scheme.weights.num_bits

{compressed_tensors_nightly-0.3.3.20240514.dist-info → compressed_tensors_nightly-0.3.3.20240517.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: compressed-tensors-nightly
-Version: 0.3.3.20240514
+Version: 0.3.3.20240517
 Summary: Library for utilization of compressed safetensors of neural network models
 Home-page: https://github.com/neuralmagic/compressed-tensors
 Author: Neuralmagic, Inc.
@@ -103,3 +103,44 @@ state_dict = dict(load_compressed("compressed_model.safetensors", compression_co
 ```
 
 For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).
+
+
+## Saving a Compressed Model with PTQ
+
+We can use compressed-tensors to run basic post training quantization (PTQ) and save the quantized model compressed on disk
+
+```python
+model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0")
+
+config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
+config.quantization_status = QuantizationStatus.CALIBRATION
+apply_quantization_config(model, config)
+
+dataset = load_dataset("ptb_text_only")["train"]
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+def tokenize_function(examples):
+    return tokenizer(examples["sentence"], padding=False, truncation=True, max_length=1024)
+
+tokenized_dataset = dataset.map(tokenize_function, batched=True)
+data_loader = DataLoader(tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator())
+
+with torch.no_grad():
+    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
+        sample = {key: value.to(device) for key,value in sample.items()}
+        _ = model(**sample)
+
+        if idx >= 512:
+            break
+
+model.apply(freeze_module_quantization)
+model.apply(compress_quantized_weights)
+
+output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
+compressor = ModelCompressor(quantization_config=config)
+compressed_state_dict = compressor.compress(model)
+model.save_pretrained(output_dir, state_dict=compressed_state_dict)
+```
+
+For more in-depth tutorial on quantization compression, refer to the [notebook](./examples/quantize_and_pack_int4.ipynb).
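
The README example added above is abridged: it never shows its imports or the `device` variable used inside the calibration loop. A hedged sketch of the setup it appears to assume, inferred from the names it uses (none of this appears in the diff itself):

```python
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator

from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import (
    QuantizationConfig,
    QuantizationStatus,
    apply_quantization_config,
    compress_quantized_weights,
    freeze_module_quantization,
)

device = "cuda:0"  # the calibration loop moves each batch to this device
```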

{compressed_tensors_nightly-0.3.3.20240514.dist-info → compressed_tensors_nightly-0.3.3.20240517.dist-info}/RECORD
@@ -1,15 +1,16 @@
 compressed_tensors/__init__.py,sha256=SV1csvHUVCd8kHXz6UDZim1HZ_fAVG3vfk-j_4Bb6hY,789
 compressed_tensors/base.py,sha256=OA2TOLP1gP3LSH7gp508eqr2ZtDQ-pqRHElCp-aB0vs,755
 compressed_tensors/version.py,sha256=V8krJZctm43D4AGQhJY6dB0MvP1-T9TJ8BcGa8kESrI,1512
-compressed_tensors/compressors/__init__.py,sha256=
-compressed_tensors/compressors/base.py,sha256=
+compressed_tensors/compressors/__init__.py,sha256=3yyoNICHll3F4HS6Yu-cgNZpDhfuobFNWCs6DrPcUyQ,992
+compressed_tensors/compressors/base.py,sha256=LWEgbpgTxzmoqQ7Xhq2OQszUgWoDtFuGCiV1Y8nlBGw,2134
 compressed_tensors/compressors/dense.py,sha256=G_XHbvuENyupIKlXSITOQgvPkNkcMEOLcLWQr70V9EE,1257
 compressed_tensors/compressors/helpers.py,sha256=k9avlkmeYj6vkOAvl-MgcixtP7ib24SCfhzZ-RusXfw,5403
-compressed_tensors/compressors/int_quantized.py,sha256=
+compressed_tensors/compressors/int_quantized.py,sha256=I0FqnjtwCiJvQxi9YyfA8aBeaR5csqtq1bOrVvRqJ1I,4744
 compressed_tensors/compressors/model_compressor.py,sha256=teohd0xTbcIDIuEfZrH-bZyAzHn2UZH2KJXT-7Gk3sw,10426
+compressed_tensors/compressors/pack_quantized.py,sha256=K03l8kFqejpapgcMU5hMm1-JIX1cUVvU-VybGSN6RWA,7885
 compressed_tensors/compressors/sparse_bitmask.py,sha256=TH77NDFJwvQeySY75YV6w1zskZC-JcUGpua4zCFOgTY,8632
 compressed_tensors/config/__init__.py,sha256=ZBqWn3r6ku1qfmlHHYp0mQueY0i7Pwhr9rbQk9dDlMc,704
-compressed_tensors/config/base.py,sha256=
+compressed_tensors/config/base.py,sha256=grf5tDaLep8i2-W_p7H-fW9DOGXDi4Zz7su7zjs1Qqc,1454
 compressed_tensors/config/dense.py,sha256=NgSxnFCnckU9-iunxEaqiFwqgdO7YYxlWKR74jNbjks,1317
 compressed_tensors/config/sparse_bitmask.py,sha256=pZUboRNZTu6NajGOQEFExoPknak5ynVAUeiiYpS1Gt8,1308
 compressed_tensors/quantization/__init__.py,sha256=83J5bPB7PavN2TfCoW7_vEDhfYpm4TDrqYO9vdSQ5bk,760
@@ -17,10 +18,10 @@ compressed_tensors/quantization/quant_args.py,sha256=A6b2V8lhsM8Ho8RjlPBQdxRUDNW
 compressed_tensors/quantization/quant_config.py,sha256=U6oEzheNK1d-0kHARzwepasnmS7HHqU_zGwoDBJ-lxU,8042
 compressed_tensors/quantization/quant_scheme.py,sha256=X3oqmZPiIKtX5tEKKUj-0N6hB68NeiU2b1GcQEQPadQ,1480
 compressed_tensors/quantization/lifecycle/__init__.py,sha256=ggRGWRqhCxCaTTDWRcgTVX3axnS2xV6rc5YvdzK7fSg,798
-compressed_tensors/quantization/lifecycle/apply.py,sha256=
+compressed_tensors/quantization/lifecycle/apply.py,sha256=whKfNGC_EZm0BC23AP7qWfjRe5OJVWmcZOpX7lryZZc,7625
 compressed_tensors/quantization/lifecycle/calibration.py,sha256=mLns4jlaWmBwOW8Jtlm5bMX-JET1AiZYUBO7qa-XuxI,1776
 compressed_tensors/quantization/lifecycle/compressed.py,sha256=VreB10xPwgSLQQlTu20UCrFpRS--cA7-lx5s7nrPPrg,2247
-compressed_tensors/quantization/lifecycle/forward.py,sha256=
+compressed_tensors/quantization/lifecycle/forward.py,sha256=sXo7ReS2ehHFwbtwUbhPnsnnj-CZ3iyAZKmUzHxjTKc,11373
 compressed_tensors/quantization/lifecycle/frozen.py,sha256=h1XYt89MouBTf3jTYLG_6OdFxIu5q2N8tPjsy6J4E6Y,1726
 compressed_tensors/quantization/lifecycle/initialize.py,sha256=U6g9qifSF6pagQZQZEwd-rwWC6uQ_dZXn1wg6nr1Abg,3697
 compressed_tensors/quantization/observers/__init__.py,sha256=DNH31NQYrIBBcmHsMyFA6whh4pbRsLwuNa6L8AeXaGc,745
@@ -29,14 +30,14 @@ compressed_tensors/quantization/observers/helpers.py,sha256=JwALNfBYY9Eyl8Q180t0
 compressed_tensors/quantization/observers/memoryless.py,sha256=ZHTPh4aURE8LvHBFaP--HIC2JanMX5-VRdIkE2JHthw,1859
 compressed_tensors/quantization/observers/min_max.py,sha256=s2I40pzTXrVAjIsavNt6TLAl7-qDUmdc43Xd5rb4XAY,3071
 compressed_tensors/quantization/utils/__init__.py,sha256=VdtEmP0bvuND_IGQnyqUPc5lnFp-1_yD7StKSX4x80w,656
-compressed_tensors/quantization/utils/helpers.py,sha256=
+compressed_tensors/quantization/utils/helpers.py,sha256=NzAH18Cn_-mTAR87y6IlcQU5gC393XSjgNKC9CRkr78,6017
 compressed_tensors/registry/__init__.py,sha256=FwLSNYqfIrb5JD_6OK_MT4_svvKTN_nEhpgQlQvGbjI,658
 compressed_tensors/registry/registry.py,sha256=fxjOjh2wklCvJhQxwofdy-zV8q7MkQ85SLG77nml2iA,11890
 compressed_tensors/utils/__init__.py,sha256=5DrYjoZbaEvSkJcC-GRSbM_RBHVF4tG9gMd3zsJnjLw,665
 compressed_tensors/utils/helpers.py,sha256=h0jfl9drs5FAx40tCHRcVtJqXixB5hT5yq_IG2aY_-w,1735
 compressed_tensors/utils/safetensors_load.py,sha256=wo9UirGrGlenBqZeqotvpCT7D5MEdjCo2J3HeRaIFoU,8502
-compressed_tensors_nightly-0.3.3.
-compressed_tensors_nightly-0.3.3.
-compressed_tensors_nightly-0.3.3.
-compressed_tensors_nightly-0.3.3.
-compressed_tensors_nightly-0.3.3.
+compressed_tensors_nightly-0.3.3.20240517.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+compressed_tensors_nightly-0.3.3.20240517.dist-info/METADATA,sha256=qFN4Sop3DqhucuhVNwQIsDKJMQwa5f2aZRCwFL79lgU,5633
+compressed_tensors_nightly-0.3.3.20240517.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+compressed_tensors_nightly-0.3.3.20240517.dist-info/top_level.txt,sha256=w2i-GyPs2s1UwVxvutSvN_lM22SXC2hQFBmoMcPnV7Y,19
+compressed_tensors_nightly-0.3.3.20240517.dist-info/RECORD,,
File without changes
|
File without changes
|