embedding-condensation 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- embedding_condensation-0.1.0/.gitignore +31 -0
- embedding_condensation-0.1.0/LICENSE +1 -0
- embedding_condensation-0.1.0/PKG-INFO +71 -0
- embedding_condensation-0.1.0/README.md +45 -0
- embedding_condensation-0.1.0/embedding_condensation/__init__.py +235 -0
- embedding_condensation-0.1.0/pyproject.toml +42 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
.vscode
|
|
2
|
+
data
|
|
3
|
+
**/__pycache__
|
|
4
|
+
**/.DS_Store
|
|
5
|
+
**/results/
|
|
6
|
+
|
|
7
|
+
# Models
|
|
8
|
+
**/*.pt
|
|
9
|
+
**/*.pkl
|
|
10
|
+
**/*.pth
|
|
11
|
+
**/*.pth.tar
|
|
12
|
+
|
|
13
|
+
# Data
|
|
14
|
+
**/*.npy
|
|
15
|
+
**/*.npz
|
|
16
|
+
|
|
17
|
+
# local env configs
|
|
18
|
+
env.sh
|
|
19
|
+
|
|
20
|
+
slurm_out/
|
|
21
|
+
*.csv
|
|
22
|
+
*.old
|
|
23
|
+
wandb/
|
|
24
|
+
*.out
|
|
25
|
+
*.png
|
|
26
|
+
old/
|
|
27
|
+
*.arrow
|
|
28
|
+
*.json
|
|
29
|
+
eval_log.txt
|
|
30
|
+
model_cache/
|
|
31
|
+
results_old/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Non-Commercial License — see LICENSE.md in the LM-Dispersion repository (https://github.com/ChenLiu-1996/LM-Dispersion).
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: embedding-condensation
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Measure layer-wise token embedding cosine similarity (embedding condensation diagnostic).
|
|
5
|
+
Project-URL: Homepage, https://chenliu-1996.github.io/projects/LM-Dispersion/
|
|
6
|
+
Project-URL: Repository, https://github.com/ChenLiu-1996/LM-Dispersion
|
|
7
|
+
Author: Chen Liu
|
|
8
|
+
License: Non-Commercial License — see ../LICENSE.md in the LM-Dispersion repository.
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: embeddings,language-models,representation-geometry,transformers
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Requires-Dist: datasets>=2.14
|
|
17
|
+
Requires-Dist: matplotlib>=3.7
|
|
18
|
+
Requires-Dist: nltk>=3.8
|
|
19
|
+
Requires-Dist: numpy>=1.24
|
|
20
|
+
Requires-Dist: torch>=2.0
|
|
21
|
+
Requires-Dist: tqdm>=4.65
|
|
22
|
+
Requires-Dist: transformers<4.48,>=4.40
|
|
23
|
+
Provides-Extra: test
|
|
24
|
+
Requires-Dist: pytest>=7.0; extra == 'test'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# embedding-condensation
|
|
28
|
+
|
|
29
|
+
Minimal library for the **embedding condensation** diagnostic from [LM-Dispersion](https://github.com/ChenLiu-1996/LM-Dispersion): layer-wise token cosine-similarity matrices and optional heatmaps.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
cd pypi
|
|
35
|
+
pip install -e ".[test]"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from transformers import AutoModel, AutoTokenizer
|
|
42
|
+
from embedding_condensation import measure_embedding_condensation
|
|
43
|
+
|
|
44
|
+
model = AutoModel.from_pretrained("gpt2").eval()
|
|
45
|
+
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
|
46
|
+
|
|
47
|
+
result = measure_embedding_condensation(
|
|
48
|
+
model,
|
|
49
|
+
tokenizer,
|
|
50
|
+
texts=["Your long input text here. " * 200],
|
|
51
|
+
repetitions=1,
|
|
52
|
+
plot=False,
|
|
53
|
+
)
|
|
54
|
+
print(result.mean_cossim_by_layer)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## PyPI upload
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
cd pypi
|
|
61
|
+
pip install build twine
|
|
62
|
+
python -m build
|
|
63
|
+
twine upload dist/*
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Test
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
cd pypi
|
|
70
|
+
pytest
|
|
71
|
+
```
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# embedding-condensation
|
|
2
|
+
|
|
3
|
+
Minimal library for the **embedding condensation** diagnostic from [LM-Dispersion](https://github.com/ChenLiu-1996/LM-Dispersion): layer-wise token cosine-similarity matrices and optional heatmaps.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
cd pypi
|
|
9
|
+
pip install -e ".[test]"
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Usage
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
from transformers import AutoModel, AutoTokenizer
|
|
16
|
+
from embedding_condensation import measure_embedding_condensation
|
|
17
|
+
|
|
18
|
+
model = AutoModel.from_pretrained("gpt2").eval()
|
|
19
|
+
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
|
20
|
+
|
|
21
|
+
result = measure_embedding_condensation(
|
|
22
|
+
model,
|
|
23
|
+
tokenizer,
|
|
24
|
+
texts=["Your long input text here. " * 200],
|
|
25
|
+
repetitions=1,
|
|
26
|
+
plot=False,
|
|
27
|
+
)
|
|
28
|
+
print(result.mean_cossim_by_layer)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## PyPI upload
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
cd pypi
|
|
35
|
+
pip install build twine
|
|
36
|
+
python -m build
|
|
37
|
+
twine upload dist/*
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Test
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
cd pypi
|
|
44
|
+
pytest
|
|
45
|
+
```
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""Embedding condensation measurement for Hugging Face transformer models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import List, Optional, Sequence
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import matplotlib.pyplot as plt
|
|
10
|
+
import numpy as np
|
|
11
|
+
import torch
|
|
12
|
+
from datasets import load_dataset
|
|
13
|
+
from nltk.tokenize import word_tokenize
|
|
14
|
+
from tqdm import tqdm
|
|
15
|
+
|
|
16
|
+
# Probe the NLTK word tokenizer once at import time and download the
# 'punkt_tab' resource if the probe cannot find its data.
try:
    word_tokenize('Arbitrary sentence.')
except LookupError:
    # NLTK signals a missing resource with LookupError. Catching only that
    # (rather than a bare `except:`) keeps KeyboardInterrupt, SystemExit and
    # unrelated failures from being silently swallowed.
    import nltk
    nltk.download('punkt_tab')


# Names exported by `from embedding_condensation import *`.
__all__ = [
    "CondensationResult",
    "measure_embedding_condensation",
    "plot_similarity_heatmap",
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class CondensationResult:
    """Per-layer token cosine-similarity matrices (L, S, S)."""

    # One (S, S) cosine-similarity matrix per layer, stacked along axis 0.
    cossim_by_layer: np.ndarray

    @property
    def mean_cossim_by_layer(self) -> np.ndarray:
        """Return the mean pairwise cosine similarity of every layer.

        Only the strict upper triangle of each (S, S) matrix is averaged, so
        the diagonal (self-similarity) and the mirrored lower triangle do not
        contribute. When S < 2 there are no token pairs and NaN is returned
        for every layer.

        Returns:
            Array with one value per layer (shape ``(L,)`` for an
            ``(L, S, S)`` input).
        """
        mats = np.asarray(self.cossim_by_layer)
        rows, cols = np.triu_indices(mats.shape[-1], k=1)
        if rows.size == 0:
            # No off-diagonal pairs: avoid NumPy's "mean of empty slice" warning.
            return np.full(mats.shape[:-2], np.nan)
        return mats[..., rows, cols].mean(axis=-1)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_random_long_text(
    dataset_name: str,
    min_word_count: int = 1024,
    max_word_count: int = 1280,
    split: str = "train",
    random_seed: int = 0,
) -> str:
    """Concatenate consecutive dataset rows into one long text.

    A start row is drawn (seeded by `random_seed`) from the first 95% of the
    chosen split; following rows are appended until the NLTK word count
    reaches `min_word_count`. Rows are concatenated without a separator.

    Args:
        dataset_name: One of "wikipedia", "pubmed", "imdb" or "squad".
        min_word_count: Stop appending once the text has at least this many
            `word_tokenize` tokens.
        max_word_count: Also stop as soon as the text exceeds this many tokens.
        split: Dataset split to read rows from.
        random_seed: Seed for the NumPy generator that picks the start row.

    Returns:
        The concatenated text. It may be shorter than `min_word_count` words
        if the whole split does not contain enough text.

    Raises:
        ValueError: If `dataset_name` is unknown or the split has no rows.
    """
    # dataset_name -> (positional arguments for load_dataset, text column)
    sources = {
        "wikipedia": (("wikitext", "wikitext-103-v1"), "text"),
        "pubmed": (("pubmed_qa", "pqa_labeled"), "long_answer"),
        "imdb": (("imdb",), "text"),
        "squad": (("squad",), "context"),
    }
    try:
        load_args, key = sources[dataset_name]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown dataset {dataset_name!r}. "
            "Use wikipedia, pubmed, imdb, squad, or pass `texts=` directly."
        ) from None

    rows = load_dataset(*load_args)[split]
    n_rows = len(rows)
    if n_rows == 0:
        raise ValueError(f"Split {split!r} of dataset {dataset_name!r} is empty.")

    rng = np.random.default_rng(seed=random_seed)
    # Size the index range from the split that is actually read (the original
    # always measured "train"); keep the upper bound at least 1 so the draw is
    # valid for very small splits.
    idx = rng.integers(0, max(int(n_rows * 0.95), 1)).item()

    text = ""
    n_words = 0
    consumed = 0
    # `consumed` bounds the loop to one full pass over the split so it
    # terminates even when the split holds fewer than `min_word_count` words.
    while n_words < min_word_count and consumed < n_rows:
        # Wrap around instead of indexing past the last row.
        text += rows[idx % n_rows][key]
        idx += 1
        consumed += 1
        # Tokenize once per appended row and reuse the count for both checks.
        n_words = len(word_tokenize(text))
        if n_words > max_word_count:
            break
    return text
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _normalize(x: np.ndarray, p: int = 2, axis: int = 1, eps: float = 1e-3) -> np.ndarray:
|
|
73
|
+
norm = np.linalg.norm(x, ord=p, axis=axis, keepdims=True)
|
|
74
|
+
return x / np.maximum(norm, eps)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def organize_embeddings(embeddings: Sequence[torch.Tensor]) -> List[np.ndarray]:
    """Convert a sequence of tensors into float32 NumPy arrays on the CPU.

    Each tensor is detached from the autograd graph, has a leading size-1
    dimension removed (`squeeze(0)` leaves other shapes untouched), is cast to
    float32 and copied to host memory.

    Args:
        embeddings: Tensors to convert, e.g. a model's per-layer hidden states.

    Returns:
        One NumPy array per input tensor, in the same order.
    """
    # `.detach()` is required because `.numpy()` raises on tensors that
    # require grad; it is a no-op for tensors produced under `torch.no_grad()`.
    return [z.detach().squeeze(0).float().cpu().numpy() for z in embeddings]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def compute_cosine_similarities(embeddings: Sequence[np.ndarray]) -> List[np.ndarray]:
    """Return one row-by-row cosine-similarity matrix per input array.

    Every array is normalized along axis 1, multiplied by its own transpose,
    and the product is clipped to [-1, 1].
    """
    unit_rows_per_layer = (_normalize(layer, axis=1) for layer in embeddings)
    return [np.clip(unit @ unit.T, -1, 1) for unit in unit_rows_per_layer]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def plot_similarity_heatmap(
    cossim_matrix_by_layer: Sequence[np.ndarray],
    save_path: Optional[str] = None,
    step: int = 1,
    bins: int = 128,
):
    """Draw a layer-versus-cosine-similarity density heatmap.

    For every `step`-th layer, all entries of that layer's similarity matrix
    are histogrammed over [-1, 1] as a probability density. The histograms
    form the columns of an image whose x-axis is the layer fraction (layer
    index divided by the last layer index) and whose y-axis is the cosine
    similarity. The colour scale is fixed to the range [0, 10].

    Args:
        cossim_matrix_by_layer: One similarity matrix per layer.
        save_path: File to write the figure to (parent directories are
            created). Nothing is written when it is None or empty.
        step: Keep only layers whose index is a multiple of `step`.
        bins: Number of histogram bins across [-1, 1].

    Returns:
        The Matplotlib figure. It has already been passed to `plt.close`.

    Raises:
        ValueError: If no matrices are given or `step` is smaller than 1.
    """
    n_layers = len(cossim_matrix_by_layer)
    if n_layers == 0:
        raise ValueError("`cossim_matrix_by_layer` must contain at least one matrix.")
    if step < 1:
        raise ValueError("`step` must be a positive integer.")

    denom = max(n_layers - 1, 1)
    layer_fractions, hist_data = [], []
    for layer_idx, cossim_matrix in enumerate(cossim_matrix_by_layer):
        if layer_idx % step != 0:
            continue
        hist, _ = np.histogram(
            np.asarray(cossim_matrix).ravel(), bins=bins, density=True, range=(-1, 1)
        )
        hist_data.append(hist)
        layer_fractions.append(layer_idx / denom)
    hist_matrix = np.array(hist_data)

    # With a single selected layer the last fraction is 0, which would give
    # the image a zero-width extent; fall back to the full [0, 1] axis.
    x_max = layer_fractions[-1] or 1.0

    # Scope the font override to this figure instead of permanently mutating
    # the caller's global rcParams.
    with plt.rc_context({"font.family": "sans-serif"}):
        fig = plt.figure(figsize=(12, 10))
        ax = fig.add_subplot(1, 1, 1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        im = ax.imshow(
            hist_matrix.T,
            aspect="auto",
            origin="lower",
            cmap="Reds",
            extent=[0, x_max, -1, 1],
            vmin=0,
            vmax=10,
        )
        ax.tick_params(axis="both", which="major", labelsize=26)
        ax.set_xlabel("Layer Fraction", fontsize=36)
        ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
        ax.set_xticklabels([0, 0.2, 0.4, 0.6, 0.8, 1])
        ax.set_ylabel("Cosine Similarity", fontsize=36)
        cbar = fig.colorbar(im, ax=ax)
        cbar.ax.tick_params(axis="both", which="major", labelsize=26)
        cbar.ax.set_title("Probability\nDensity", fontsize=20, pad=20)
        fig.tight_layout(pad=2)

        # `save_path` defaults to None; os.path.dirname(None) would raise
        # TypeError, so only touch the filesystem when a path was given.
        if save_path:
            os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
            fig.savefig(save_path, dpi=300)

    plt.close(fig)
    return fig
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _mean_cossim_per_layer(cossim_by_layer: Sequence[np.ndarray]) -> np.ndarray:
|
|
136
|
+
return np.array([float(m[np.triu_indices(m.shape[0], k=1)].mean()) for m in cossim_by_layer])
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def measure_embedding_condensation(
    model: torch.nn.Module,
    tokenizer,
    *,
    texts: Optional[Sequence[str]] = None,
    dataset: str = "wikipedia",
    repetitions: int = 100,
    max_length: int = 512,
    min_word_count: int = 1024,
    max_word_count: int = 1280,
    include_logits_layer: bool = False,
    plot: bool = True,
    save_path: Optional[str] = "./test_embedding_condensation.png",
    show_progress: bool = True,
) -> CondensationResult:
    """
    Run the LM-Dispersion embedding condensation measurement pipeline.

    Pass `texts` to use fixed inputs. Otherwise samples
    random long text from a Hugging Face dataset each repetition.

    Each repetition runs one forward pass and yields one token cosine-similarity
    matrix per layer; the matrices are averaged over repetitions. Averaging
    stacks the matrices, so every repetition must produce the same number of
    tokens (e.g. inputs long enough to be truncated to `max_length`).

    Args:
        model: Model to evaluate; it is switched to eval mode and inputs are
            moved to the device of its first parameter.
        tokenizer: Tokenizer called on each text.
        texts: Fixed inputs, cycled through when `repetitions` exceeds their
            number. When None, text is sampled from `dataset`.
        dataset: Dataset name passed to `get_random_long_text`.
        repetitions: Number of forward passes to average over.
        max_length: Truncation length passed to the tokenizer.
        min_word_count: Lower word-count bound for sampled text.
        max_word_count: Upper word-count bound for sampled text.
        include_logits_layer: Also measure similarity of the output logits.
        plot: Render the heatmap. It is written to `save_path`, so nothing is
            rendered when `plot` is False or `save_path` is None/empty.
        save_path: Output file for the heatmap.
        show_progress: Wrap the repetition loop in a tqdm progress bar.

    Returns:
        CondensationResult holding the repetition-averaged matrices stacked
        across layers.

    Raises:
        ValueError: If `repetitions` is smaller than 1 or `texts` is empty.
    """
    # Validate with real exceptions up front: `assert` is stripped under -O.
    if repetitions < 1:
        raise ValueError("`repetitions` must be at least 1.")
    if texts is not None and len(texts) == 0:
        raise ValueError("`texts` must be non-empty when provided.")

    model.eval()
    device = next(model.parameters()).device

    rep_iter = range(repetitions)
    if show_progress:
        rep_iter = tqdm(rep_iter, desc="condensation")

    stacked: Optional[List[np.ndarray]] = None
    for rep in rep_iter:
        # The repetition index doubles as the seed for torch and for sampling.
        torch.manual_seed(rep)
        if texts is not None:
            text = texts[rep % len(texts)]
        else:
            text = get_random_long_text(
                dataset,
                random_seed=rep,
                min_word_count=min_word_count,
                max_word_count=max_word_count,
            )
        curr = _forward_cossim(
            model, tokenizer, text, device, max_length, include_logits_layer
        )
        stacked = _stack_repetition(stacked, curr)

    averaged = [m.mean(axis=0) for m in stacked]
    result = CondensationResult(
        cossim_by_layer=np.stack(averaged, axis=0),
    )

    # Previously `plot or save_path`: because `save_path` has a truthy default,
    # `plot=False` never disabled plotting. Require both so the flag is honoured
    # and a missing path does not reach the file-writing code.
    if plot and save_path:
        plot_similarity_heatmap(averaged, save_path=save_path)
    return result
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _forward_cossim(
    model: torch.nn.Module,
    tokenizer,
    text: str,
    device: torch.device,
    max_length: int,
    include_logits_layer: bool,
) -> List[np.ndarray]:
    """Run one gradient-free forward pass and return per-layer similarity matrices.

    `text` is tokenized with truncation to `max_length`, moved to `device`, and
    fed to `model` with hidden states requested. One cosine-similarity matrix is
    produced per hidden state. When `include_logits_layer` is set and the model
    output exposes `logits`, the similarity matrix of the row-normalized logits
    is appended as a final entry.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**inputs, output_hidden_states=True)

    layer_arrays = organize_embeddings(output.hidden_states)
    similarities = compute_cosine_similarities(layer_arrays)

    wants_logits = include_logits_layer and hasattr(output, "logits")
    if not wants_logits:
        return similarities

    unit_logits = torch.nn.functional.normalize(output.logits.squeeze(0).float(), dim=1)
    logit_similarity = torch.matmul(unit_logits, unit_logits.T).clamp(-1, 1)
    similarities.append(logit_similarity.cpu().numpy())
    return similarities
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _stack_repetition(
|
|
226
|
+
stacked: Optional[List[np.ndarray]],
|
|
227
|
+
curr: List[np.ndarray],
|
|
228
|
+
) -> List[np.ndarray]:
|
|
229
|
+
clipped = [m.clip(-1, 1) for m in curr]
|
|
230
|
+
if stacked is None:
|
|
231
|
+
return [m[None, ...] for m in clipped]
|
|
232
|
+
return [
|
|
233
|
+
np.concatenate((stacked[i], clipped[i][None, ...]), axis=0)
|
|
234
|
+
for i in range(len(stacked))
|
|
235
|
+
]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "embedding-condensation"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Measure layer-wise token embedding cosine similarity (embedding condensation diagnostic)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
authors = [{ name = "Chen Liu" }]
|
|
13
|
+
keywords = ["language-models", "embeddings", "transformers", "representation-geometry"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"numpy>=1.24",
|
|
22
|
+
"torch>=2.0",
|
|
23
|
+
"transformers>=4.40,<4.48",
|
|
24
|
+
"datasets>=2.14",
|
|
25
|
+
"nltk>=3.8",
|
|
26
|
+
"matplotlib>=3.7",
|
|
27
|
+
"tqdm>=4.65",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
test = ["pytest>=7.0"]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://chenliu-1996.github.io/projects/LM-Dispersion/"
|
|
35
|
+
Repository = "https://github.com/ChenLiu-1996/LM-Dispersion"
|
|
36
|
+
|
|
37
|
+
[tool.hatch.build.targets.wheel]
|
|
38
|
+
packages = ["embedding_condensation"]
|
|
39
|
+
|
|
40
|
+
[tool.pytest.ini_options]
|
|
41
|
+
testpaths = ["tests"]
|
|
42
|
+
addopts = "-q"
|