llm-compression 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llm_compression-0.1.0 → llm_compression-0.1.1}/PKG-INFO +1 -1
- {llm_compression-0.1.0 → llm_compression-0.1.1}/llm_compression/__init__.py +1 -1
- {llm_compression-0.1.0 → llm_compression-0.1.1}/llm_compression/arithmetic_coding.py +44 -15
- {llm_compression-0.1.0 → llm_compression-0.1.1}/llm_compression/llama_model.py +77 -32
- {llm_compression-0.1.0 → llm_compression-0.1.1}/llm_compression/probability_model.py +15 -11
- {llm_compression-0.1.0 → llm_compression-0.1.1}/pyproject.toml +1 -1
- {llm_compression-0.1.0 → llm_compression-0.1.1}/LICENSE +0 -0
- {llm_compression-0.1.0 → llm_compression-0.1.1}/README.md +0 -0
llm_compression/arithmetic_coding.py

@@ -3,12 +3,24 @@ import math
 
 from probability_model import ProbabilityModel
 
-def encode(input_arr: np.ndarray, model: ProbabilityModel):
+
+def encode(input_arr: np.ndarray, model: ProbabilityModel) -> list[int]:
+    """
+    Encodes an input array of symbols into a list of bits using arithmetic coding.
+
+    Args:
+        input_arr (np.ndarray): An array of input symbols to encode.
+        model (ProbabilityModel): The probability model used to get the symbol probabilities.
+
+    Returns:
+        list[int]: A list of bits representing the encoded input symbols.
+    """
+
     BIT_PRECISION = 64
     MAX = (1 << BIT_PRECISION) - 1
     HALF = 1 << (BIT_PRECISION - 1)
     QUARTER = 1 << (BIT_PRECISION - 2)
-
+
     low = 0
     high = MAX
     output = []
@@ -17,18 +29,18 @@ def encode(input_arr: np.ndarray, model: ProbabilityModel):
 
     for symbol in input_arr:
         symbols, cdfs = model.get_prob(input_arr[:k])
-
-        symbol_idx = np.where(symbols == symbol)[0]
+
+        symbol_idx = np.where(symbols == symbol)[0]  # symbols.index(symbol)
 
         # Calculate probability bounds using floating-point CDFs
-        cdf_low = cdfs[symbol_idx-1][0] if symbol_idx > 0 else 0.0
+        cdf_low = cdfs[symbol_idx - 1][0] if symbol_idx > 0 else 0.0
         cdf_high = cdfs[symbol_idx][0]
-
+
         # Convert to integer ranges with careful rounding
         range_size = high - low + 1
         new_low = low + math.floor(cdf_low * range_size)
         new_high = low + math.ceil(cdf_high * range_size) - 1
-
+
         low, high = new_low, new_high
 
         # Interval scaling and bit emission
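For orientation, here is a minimal standalone sketch (not code from the package) of the integer interval update that the hunk above performs. The CDF bounds are made up; they roughly correspond to the symbol "b" under the StaticModel probabilities [0.4, 0.3, 0.3] used in the test block further down.

import math

# Narrow the current 64-bit interval [low, high] to the sub-interval that the
# encoded symbol occupies under the model's CDF, as in the hunk above.
BIT_PRECISION = 64
MAX = (1 << BIT_PRECISION) - 1

low, high = 0, MAX
cdf_low, cdf_high = 0.4, 0.7  # hypothetical CDF bounds for symbol "b"

range_size = high - low + 1
low, high = low + math.floor(cdf_low * range_size), low + math.ceil(cdf_high * range_size) - 1

# The interval shrank to ~0.3 of its former size, i.e. about -log2(0.3) ≈ 1.7 bits
# of information were committed for this symbol.
print(f"{(high - low + 1) / range_size:.3f}")

Each symbol narrows the interval by a factor equal to its modeled probability, which is where arithmetic coding gets its compression.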
@@ -64,7 +76,22 @@ def encode(input_arr: np.ndarray, model: ProbabilityModel):
 
     return output
 
-def decode(encoded_bits: np.ndarray, model: ProbabilityModel, num_symbols: int):
+
+def decode(
+    encoded_bits: np.ndarray, model: ProbabilityModel, num_symbols: int
+) -> np.ndarray:
+    """
+    Decodes a list of encoded bits into an array of symbols using arithmetic decoding.
+
+    Args:
+        encoded_bits (np.ndarray): Numpy array of bits representing the encoded input symbols.
+        model (ProbabilityModel): The probability model used to get the symbol probabilities.
+        num_symbols (int): The number of symbols in encoded in the input array.
+
+    Returns:
+        np.ndarray: Array of decoded symbols.
+    """
+
     import math
     import bisect
 
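The decode hunk above pulls in bisect, which suggests the decoder locates each symbol by binary-searching the model's CDF array for the interval containing the current code value. A minimal standalone sketch of that lookup with made-up probabilities (not the package's implementation):

import bisect

import numpy as np

# CDF returned by a model, in the same (descending-probability) order as its symbols.
symbols = np.array(["a", "b", "c"])
cdfs = np.array([0.4, 0.7, 1.0])

# Hypothetical position of the coder's value inside the current interval, rescaled
# to [0, 1): it falls in [0.4, 0.7), so the decoded symbol is "b".
scaled_value = 0.55
symbol_idx = bisect.bisect_right(cdfs, scaled_value)
print(symbols[symbol_idx])  # -> b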
@@ -135,16 +162,16 @@ def decode(encoded_bits: np.ndarray, model: ProbabilityModel, num_symbols: int):
 
     return decoded
 
+
 # Testing
 if __name__ == "__main__":
     from llama_model import LlamaModel
     from probability_model import StaticModel
 
-
-    model = StaticModel(3, ['a', 'b', 'c'], [0.4, 0.3, 0.3])
+    model = StaticModel(3, ["a", "b", "c"], [0.4, 0.3, 0.3])
     test_str = "abcabc"
     print(len(test_str), " symbols")
-
+
     test_arr = np.asarray([test_str[i] for i in range(len(test_str))])
     print(test_arr)
     encoded_bin = encode(test_arr, model)
@@ -153,18 +180,20 @@ if __name__ == "__main__":
 
     decoded = decode(encoded_bin, model, len(test_arr))
     print(decoded)
-
+
     print("LLM TEST")
 
-    model = LlamaModel(
+    model = LlamaModel(
+        model_path="../Llama-3.2-1B-Instruct-Q4_K_M.gguf", top_p=0.99, max_context=50
+    )
     wiki_str = "Weissman var på 1920-talet en av Finlands mest kända kuplettsångare och var en mycket aktiv skådespelare med både operetter och lustspel på sin repertoar. Hans skådespelarkarriär inleddes omkring 1913 och varade fram till 1930-talet. Under den tiden var han verksam vid flera teatrar och skådespelarensembler. Som kuplettsångare uppträdde han på biografer, kaféer och restauranger runt om i landet. På 1910- och 1920-talen gjorde han en stor mängd skivinspelningar och var en aktiv sångare under grammofonfebern 1929. När kuplettgenren gick ur mode på slutet av 1920-talet försökte Weissman anpassa sig till schlagermusiken, men övergav inom kort den konstnärliga banan för att ägna sig åt reklamverksamhet och diverse affärer"
     wiki_str_short = "Weissman var på 1920-talet en av Finlands mest kända kuplettsångare och var en mycket aktiv skådespelare med både operetter och lustspel på sin repertoar."
     wiki_str_short2 = "The building began as a movie theater in 1973, was converted into the Jet Set nightclub in 1994, and underwent renovations in 2010 and 2015"
-    prompt = wiki_str_short2.encode(
+    prompt = wiki_str_short2.encode("utf-8")
     prompt_tkn = np.asarray(model.tokenize(prompt))
     print(len(prompt_tkn), " symbols")
     encoded_bin = encode(prompt_tkn, model)
     print(len(encoded_bin), " bits in encoding")
     decoded = decode(encoded_bin, model, len(prompt_tkn))
     outstr = model.detokenize(decoded)
-    print(outstr.decode(
+    print(outstr.decode("utf-8"))
llm_compression/llama_model.py

@@ -4,8 +4,33 @@ import time
 
 from probability_model import ProbabilityModel
 
+
 class LlamaModel(ProbabilityModel):
-    def __init__(
+    def __init__(
+        self,
+        model_path: str,
+        top_p: float = 0.99,
+        max_context: int = 50,
+    ):
+        """
+        Initialize a LlamaModel.
+
+        Parameters
+        ----------
+        model_path : str
+            File path to the LLaMA model .gguf file.
+        top_p : float, optional
+            The top [0, 1] percentage of the most likely tokens to consider when computing the probability distribution.
+            Higher values will generally result in better compression for sequences that the LLM can easily predict.
+        max_context : int, optional
+            The maximum number of tokens to keep in the model's context. Higher values will generally lead to better compression but slower performance.
+
+        Raises
+        ------
+        ValueError
+            If the provided max_context is too large for the model.
+        """
+
         t1 = time.perf_counter()
         self.llm = Llama(
             model_path=model_path,
@@ -15,7 +40,9 @@ class LlamaModel(ProbabilityModel):
             verbose=False,
         )
         if self.llm.n_ctx() < max_context:
-            raise ValueError(
+            raise ValueError(
+                f"Provided max_context is too large for the model. Provided max_context is {max_context}, but model max context is {self.llm.n_ctx}"
+            )
         t2 = time.perf_counter()
         print(f"Model loaded in {t2 - t1} seconds")
         self.N = self.llm.n_vocab()
@@ -25,9 +52,24 @@ class LlamaModel(ProbabilityModel):
         super().__init__(self.N)
 
     def get_prob(self, prior_symbols: np.ndarray[int]) -> tuple[np.ndarray, np.ndarray]:
-
+        """
+        Get cumalitive probability distribution of the next token given the prior tokens.
+
+        Parameters
+        ----------
+        prior_symbols : np.ndarray[int]
+            The sequence of prior tokens.
+
+        Returns
+        -------
+        (tokens, cdfs)
+            tokens : np.ndarray[int]
+                The symbols in descending order of probability.
+            cdfs : np.ndarray[float]
+                The cumulative probabilities of the tokens in the same order.
+        """
         print(f"Prior symbols: {len(prior_symbols)}")
-
+
         # If no prior tokens, return uniform distribution and clear cache
         if len(prior_symbols) == 0:
             self.reset()
@@ -40,16 +82,16 @@ class LlamaModel(ProbabilityModel):
                 tokens[token_id] = token_id
                 cdfs[token_id] = cumulative
             return tokens, cdfs
-
+
         # If there are more symbols cached than context, clear oldest half of cache
         if len(self.cache) >= self.max_context:
-            #self.reset()
-            self.cache = self.cache[self.max_context // 2:]
+            # self.reset()
+            self.cache = self.cache[self.max_context // 2 :]
             self.llm.reset()
             # evaluate what is left of cache
             self.llm.eval(self.cache)
             print("Cache cleared")
-
+
         # Evaluate latest token
         t1 = time.perf_counter()
         self.llm.eval([prior_symbols[-1]])
@@ -62,12 +104,12 @@ class LlamaModel(ProbabilityModel):
         probs /= probs.sum()
 
         t4 = time.perf_counter()
-
+
         # Get cdf distribution of 90% most likely tokens
         ts1 = time.perf_counter()
         topk = np.argsort(-probs)
         ts2 = time.perf_counter()
-
+
         tokens = np.zeros(self.N, dtype=np.int64)
         cdfs = np.zeros(self.N, dtype=np.float64)
 
@@ -76,9 +118,9 @@ class LlamaModel(ProbabilityModel):
         # Compute cumulative probabilities
         cum_probs = np.cumsum(probs_sorted)
         # Find cutoff index of top_p probability
-        cutoff_index = np.searchsorted(cum_probs, self.top_p, side=
+        cutoff_index = np.searchsorted(cum_probs, self.top_p, side="right")
        # Get slice of topk
-        topk_slice = topk[:cutoff_index+1]
+        topk_slice = topk[: cutoff_index + 1]
         n_topk = cutoff_index + 1
 
         tokens[:n_topk] = topk_slice
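A standalone sketch of the top-p cutoff used above, with toy probabilities (not the package's code): sort the distribution in descending order, accumulate it, and keep the shortest head whose cumulative mass covers at least top_p.

import numpy as np

top_p = 0.9
probs = np.array([0.05, 0.5, 0.08, 0.3, 0.07])  # made-up next-token distribution

order = np.argsort(-probs)               # token ids, most likely first
cum_probs = np.cumsum(probs[order])      # ~[0.5, 0.8, 0.88, 0.95, 1.0]
cutoff_index = np.searchsorted(cum_probs, top_p, side="right")
kept = order[: cutoff_index + 1]         # shortest head with mass >= top_p

print(kept, cum_probs[cutoff_index])     # [1 3 2 4] 0.95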
@@ -92,39 +134,42 @@ class LlamaModel(ProbabilityModel):
         n_remaining = len(remaining_tokens)
         if n_remaining > 0:
             uniform_prob = (1.0 - cum_probs[cutoff_index]) / n_remaining
-            tokens[n_topk:n_topk + n_remaining] = remaining_tokens
-            cdfs[n_topk:n_topk + n_remaining] = (
+            tokens[n_topk : n_topk + n_remaining] = remaining_tokens
+            cdfs[n_topk : n_topk + n_remaining] = (
                 uniform_prob * np.arange(1, n_remaining + 1) + cum_probs[cutoff_index]
             )
-
+
         t5 = time.perf_counter()
-
+
         # Cache new token
         self.cache.append(prior_symbols[-1])
 
         # returns sorted tokens and cdfs
         return (tokens, cdfs)
-
-    def reset(self):
+
+    def reset(self) -> None:
+        """Clear cache and reset LLM. Needed when starting a new compression/decrompression"""
         self.cache = []
         self.llm.reset()
-        print("Cache cleared")
 
     def tokenize(self, text: bytes) -> list[int]:
-
+        """
+        Tokenize a string of bytes into a sequence of token IDs.
 
-
-        return self.llm.detokenize(tokens)
+        This function is a wrapper around Llama's `tokenize` method without adding the BOS token.
 
+        Parameters
+        ----------
+        text : bytes
+            The string of bytes to tokenize.
 
-
-
-
+        Returns
+        -------
+        list[int]
+            A list of token IDs.
+        """
 
-
-
-
-
-    for i in range(10):
-        print(tokens[i])
-        print(model.detokenize([tokens[i]]), cdfs[i])
+        return self.llm.tokenize(text, add_bos=False)
+
+    def detokenize(self, tokens: list[int]) -> bytes:
+        return self.llm.detokenize(tokens)
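Taken together, the get_prob hunks assemble the returned CDF from an exact top-p head plus a uniform tail over the rest of the vocabulary, so every token keeps a nonzero coding probability. A standalone numpy sketch of that construction on a toy six-token vocabulary (an assumed shape for illustration, not the package's code):

import numpy as np

top_p = 0.9
probs = np.array([0.02, 0.55, 0.05, 0.30, 0.05, 0.03])  # toy next-token distribution
N = len(probs)

# Head: tokens covering the top_p probability mass, with their exact CDF values.
topk = np.argsort(-probs)
cum_probs = np.cumsum(probs[topk])
cutoff_index = np.searchsorted(cum_probs, top_p, side="right")
n_topk = cutoff_index + 1

tokens = np.zeros(N, dtype=np.int64)
cdfs = np.zeros(N, dtype=np.float64)
tokens[:n_topk] = topk[:n_topk]
cdfs[:n_topk] = cum_probs[:n_topk]

# Tail: leftover mass spread uniformly over the remaining tokens.
remaining = topk[n_topk:]
n_remaining = len(remaining)
if n_remaining > 0:
    uniform_prob = (1.0 - cum_probs[cutoff_index]) / n_remaining
    tokens[n_topk:] = remaining
    cdfs[n_topk:] = uniform_prob * np.arange(1, n_remaining + 1) + cum_probs[cutoff_index]

print(tokens)  # most likely tokens first
print(cdfs)    # monotone, ending at 1.0 (up to float rounding)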
llm_compression/probability_model.py

@@ -1,14 +1,15 @@
 import numpy as np
 
+
 class ProbabilityModel:
     def __init__(self, N):
         self.N = N
-
+
     def get_prob(self, prior_symbols: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
         """
         Args:
             prior_symbols: numpy array of previous symbols
-
+
         Returns:
             tokens: numpy array of symbols in descending order of probability
             cdfs: numpy array of the cumulative probabilities of the tokens in the same order
@@ -24,39 +25,42 @@ class StaticModel(ProbabilityModel):
         probs = np.array(probs)
         symbols = np.array(symbols)
         # sort in descending order of probability
-        sorted_indices = np.argsort(-probs)
+        sorted_indices = np.argsort(-probs)
         self.symbols = symbols[sorted_indices]
         self.probs = probs[sorted_indices]
-
+
     def get_prob(self, prior_symbols: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
         cdfs = np.cumsum(self.probs)
         # Ensure cdfs sum to 1
         cdfs /= cdfs[-1]
         return (np.array(self.symbols), cdfs)
 
+
 # Simple adaptive model - places higher probability of symbol that appears more
 class AdaptiveModel(ProbabilityModel):
     def __init__(self, N, symbols):
         super().__init__(N)
         self.symbols = symbols
-
+
     def get_prob(self, prior_symbols: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
         probs = np.zeros(self.N)
-
+
         for i in range(len(probs)):
             probs[i] = np.sum(prior_symbols == self.symbols[i]) + 0.10
         probs /= probs.sum()
-
-        combined_sort = sorted(
+
+        combined_sort = sorted(
+            zip(self.symbols, probs), key=lambda x: x[1], reverse=True
+        )
         tokens = [x[0] for x in combined_sort]
         sorted_probs = [x[1] for x in combined_sort]
-
+
         cdfs = np.zeros(self.N)
         cumalative = 0
         for i in range(len(sorted_probs)):
             cumalative += sorted_probs[i]
             cdfs[i] = cumalative
-
+
         cdfs /= cdfs[-1]
 
-        return (np.array(tokens), cdfs)
+        return (np.array(tokens), cdfs)
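The AdaptiveModel above estimates probabilities from symbol counts with a 0.10 additive-smoothing term, so unseen symbols keep a nonzero probability. A standalone sketch of the same counting-and-CDF construction on a toy alphabet (not the package's code):

import numpy as np

alphabet = np.array(["a", "b", "c"])
prior = np.array(["a", "b", "a", "a"])  # symbols seen so far

# Count occurrences, add 0.10 smoothing, and normalize.
counts = np.array([np.sum(prior == s) for s in alphabet]) + 0.10
probs = counts / counts.sum()

# Sort symbols by descending probability and build the cumulative distribution.
order = np.argsort(-probs)
tokens = alphabet[order]
cdfs = np.cumsum(probs[order])
cdfs /= cdfs[-1]  # guard against floating-point drift

print(tokens)  # ['a' 'b' 'c']
print(cdfs)    # roughly [0.72, 0.98, 1.0]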
LICENSE: file without changes
README.md: file without changes