embedding-condensation 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ .vscode
2
+ data
3
+ **/__pycache__
4
+ **/.DS_Store
5
+ **/results/
6
+
7
+ # Models
8
+ **/*.pt
9
+ **/*.pkl
10
+ **/*.pth
11
+ **/*.pth.tar
12
+
13
+ # Data
14
+ **/*.npy
15
+ **/*.npz
16
+
17
+ # local env configs
18
+ env.sh
19
+
20
+ slurm_out/
21
+ *.csv
22
+ *.old
23
+ wandb/
24
+ *.out
25
+ *.png
26
+ old/
27
+ *.arrow
28
+ *.json
29
+ eval_log.txt
30
+ model_cache/
31
+ results_old/
@@ -0,0 +1 @@
1
+ Non-Commercial License — see LICENSE.md at the root of the LM-Dispersion repository (https://github.com/ChenLiu-1996/LM-Dispersion).
@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.4
2
+ Name: embedding-condensation
3
+ Version: 0.1.0
4
+ Summary: Measure layer-wise token embedding cosine similarity (embedding condensation diagnostic).
5
+ Project-URL: Homepage, https://chenliu-1996.github.io/projects/LM-Dispersion/
6
+ Project-URL: Repository, https://github.com/ChenLiu-1996/LM-Dispersion
7
+ Author: Chen Liu
8
+ License: Non-Commercial License — see ../LICENSE.md in the LM-Dispersion repository.
9
+ License-File: LICENSE
10
+ Keywords: embeddings,language-models,representation-geometry,transformers
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: >=3.9
16
+ Requires-Dist: datasets>=2.14
17
+ Requires-Dist: matplotlib>=3.7
18
+ Requires-Dist: nltk>=3.8
19
+ Requires-Dist: numpy>=1.24
20
+ Requires-Dist: torch>=2.0
21
+ Requires-Dist: tqdm>=4.65
22
+ Requires-Dist: transformers<4.48,>=4.40
23
+ Provides-Extra: test
24
+ Requires-Dist: pytest>=7.0; extra == 'test'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # embedding-condensation
28
+
29
+ Minimal library for the **embedding condensation** diagnostic from [LM-Dispersion](https://github.com/ChenLiu-1996/LM-Dispersion): layer-wise token cosine-similarity matrices and optional heatmaps.
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ cd pypi
35
+ pip install -e ".[test]"
36
+ ```
37
+
38
+ ## Usage
39
+
40
+ ```python
41
+ from transformers import AutoModel, AutoTokenizer
42
+ from embedding_condensation import measure_embedding_condensation
43
+
44
+ model = AutoModel.from_pretrained("gpt2").eval()
45
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
46
+
47
+ result = measure_embedding_condensation(
48
+ model,
49
+ tokenizer,
50
+ texts=["Your long input text here. " * 200],
51
+ repetitions=1,
52
+ plot=False,
53
+ )
54
+ print(result.mean_cossim_by_layer)
55
+ ```
56
+
57
+ ## PyPI upload
58
+
59
+ ```bash
60
+ cd pypi
61
+ pip install build twine
62
+ python -m build
63
+ twine upload dist/*
64
+ ```
65
+
66
+ ## Test
67
+
68
+ ```bash
69
+ cd pypi
70
+ pytest
71
+ ```
@@ -0,0 +1,45 @@
1
+ # embedding-condensation
2
+
3
+ Minimal library for the **embedding condensation** diagnostic from [LM-Dispersion](https://github.com/ChenLiu-1996/LM-Dispersion): layer-wise token cosine-similarity matrices and optional heatmaps.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ cd pypi
9
+ pip install -e ".[test]"
10
+ ```
11
+
12
+ ## Usage
13
+
14
+ ```python
15
+ from transformers import AutoModel, AutoTokenizer
16
+ from embedding_condensation import measure_embedding_condensation
17
+
18
+ model = AutoModel.from_pretrained("gpt2").eval()
19
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
20
+
21
+ result = measure_embedding_condensation(
22
+ model,
23
+ tokenizer,
24
+ texts=["Your long input text here. " * 200],
25
+ repetitions=1,
26
+ plot=False,
27
+ )
28
+ print(result.mean_cossim_by_layer)
29
+ ```
30
+
31
+ ## PyPI upload
32
+
33
+ ```bash
34
+ cd pypi
35
+ pip install build twine
36
+ python -m build
37
+ twine upload dist/*
38
+ ```
39
+
40
+ ## Test
41
+
42
+ ```bash
43
+ cd pypi
44
+ pytest
45
+ ```
@@ -0,0 +1,235 @@
1
+ """Embedding condensation measurement for Hugging Face transformer models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import List, Optional, Sequence
7
+
8
+ import os
9
+ import matplotlib.pyplot as plt
10
+ import numpy as np
11
+ import torch
12
+ from datasets import load_dataset
13
+ from nltk.tokenize import word_tokenize
14
+ from tqdm import tqdm
15
+
16
# Probe the NLTK tokenizer once at import time so a missing tokenizer
# resource is fetched up front rather than failing in the middle of a run.
# NLTK signals a missing resource with LookupError; only that case triggers a
# download, so unrelated failures (and KeyboardInterrupt / SystemExit) are no
# longer swallowed the way the previous bare ``except:`` did.
try:
    word_tokenize('Arbitrary sentence.')
except LookupError:
    import nltk
    nltk.download('punkt_tab')


# Public API of this module.
__all__ = [
    "CondensationResult",
    "measure_embedding_condensation",
    "plot_similarity_heatmap",
]
28
+
29
+
30
@dataclass
class CondensationResult:
    """Per-layer token cosine-similarity matrices (L, S, S)."""

    # One S x S token cosine-similarity matrix per layer, stacked on axis 0.
    cossim_by_layer: np.ndarray

    @property
    def mean_cossim_by_layer(self) -> np.ndarray:
        """Return the mean pairwise cosine similarity of each layer.

        Only the strict upper triangle of every S x S matrix is averaged, so
        the diagonal (each token compared with itself) and the duplicated
        lower triangle do not influence the value.

        Returns:
            A 1-D array with one float per layer. A layer whose matrix is
            1 x 1 has no token pairs and yields ``nan``.
        """
        return np.array(
            [
                float(layer[np.triu_indices(layer.shape[0], k=1)].mean())
                for layer in self.cossim_by_layer
            ]
        )
34
+
35
+
36
def get_random_long_text(
    dataset_name: str,
    min_word_count: int = 1024,
    max_word_count: int = 1280,
    split: str = "train",
    random_seed: int = 0,
) -> str:
    """Build a long passage by concatenating consecutive dataset rows.

    A start row is drawn with a generator seeded by ``random_seed`` from the
    first 95% of the requested split. Following rows are appended (with no
    separator) until the NLTK word count reaches ``min_word_count``.

    Args:
        dataset_name: One of ``"wikipedia"``, ``"pubmed"``, ``"imdb"`` or
            ``"squad"``.
        min_word_count: Stop appending once the text has at least this many
            word tokens.
        max_word_count: Stop appending as soon as the text exceeds this many
            word tokens.
        split: Dataset split the rows are read from.
        random_seed: Seed for the start-row draw.

    Returns:
        The concatenated text. It can be shorter than ``min_word_count`` if
        the split runs out of rows first.

    Raises:
        ValueError: If ``dataset_name`` is not recognised or the split is
            empty.
    """
    # dataset name -> (load_dataset positional args, text column)
    sources = {
        "wikipedia": (("wikitext", "wikitext-103-v1"), "text"),
        "pubmed": (("pubmed_qa", "pqa_labeled"), "long_answer"),
        "imdb": (("imdb",), "text"),
        "squad": (("squad",), "context"),
    }
    if dataset_name not in sources:
        raise ValueError(
            f"Unknown dataset {dataset_name!r}. "
            "Use wikipedia, pubmed, imdb, squad, or pass `texts=` directly."
        )
    load_args, key = sources[dataset_name]
    rows = load_dataset(*load_args)[split]

    n_rows = len(rows)
    if n_rows == 0:
        raise ValueError(f"Split {split!r} of dataset {dataset_name!r} is empty.")

    rng = np.random.default_rng(seed=random_seed)
    # Bound the start index by the split actually being read (previously the
    # "train" split was always used here), and keep the upper bound >= 1 so
    # very small splits do not make `integers` fail.
    idx = rng.integers(0, max(int(n_rows * 0.95), 1)).item()

    text = ""
    word_count = 0
    # `idx < n_rows` stops cleanly at the end of the split instead of raising
    # IndexError when the remaining rows are too short to reach the minimum.
    while word_count < min_word_count and idx < n_rows:
        text += rows[idx][key]
        idx += 1
        # Tokenize once per appended row and reuse the count for both checks.
        word_count = len(word_tokenize(text))
        if word_count > max_word_count:
            break
    return text
70
+
71
+
72
+ def _normalize(x: np.ndarray, p: int = 2, axis: int = 1, eps: float = 1e-3) -> np.ndarray:
73
+ norm = np.linalg.norm(x, ord=p, axis=axis, keepdims=True)
74
+ return x / np.maximum(norm, eps)
75
+
76
+
77
def organize_embeddings(embeddings: Sequence[torch.Tensor]) -> List[np.ndarray]:
    """Convert each tensor to a float32 NumPy array on the CPU.

    A leading dimension of size 1 is squeezed away from every tensor before
    the conversion.
    """
    arrays = []
    for hidden in embeddings:
        per_token = hidden.squeeze(0).float()
        arrays.append(per_token.cpu().numpy())
    return arrays
79
+
80
+
81
def compute_cosine_similarities(embeddings: Sequence[np.ndarray]) -> List[np.ndarray]:
    """Return one row-by-row cosine-similarity matrix per input array.

    Rows of each array are normalized with ``_normalize`` and multiplied
    against their own transpose; the result is clipped to ``[-1, 1]``.
    """
    unit_rows = (_normalize(layer, axis=1) for layer in embeddings)
    return [np.matmul(rows, rows.T).clip(-1, 1) for rows in unit_rows]
87
+
88
+
89
def plot_similarity_heatmap(
    cossim_matrix_by_layer: Sequence[np.ndarray],
    save_path: Optional[str] = None,
    step: int = 1,
    bins: int = 128,
):
    """Draw a layer-by-layer histogram heatmap of cosine similarities.

    Each selected layer's matrix is flattened into a density histogram over
    ``[-1, 1]``; the histograms are shown side by side with the layer index
    expressed as a fraction of the deepest layer index.

    Args:
        cossim_matrix_by_layer: One cosine-similarity matrix per layer.
        save_path: Where to write the figure. ``None`` (the default) skips
            saving; parent directories are created when needed.
        step: Keep only layers whose index is a multiple of ``step``.
        bins: Number of histogram bins over ``[-1, 1]``.

    Returns:
        The Matplotlib figure. It has already been closed via ``plt.close``.

    Raises:
        ValueError: If no layers are given or ``step`` is zero.
    """
    if step == 0:
        raise ValueError("`step` must be non-zero.")
    n_layers = len(cossim_matrix_by_layer)
    if n_layers == 0:
        raise ValueError("`cossim_matrix_by_layer` must contain at least one layer.")

    denom = max(n_layers - 1, 1)
    selected = [(i, data) for i, data in enumerate(cossim_matrix_by_layer) if i % step == 0]
    layer_fractions, hist_data = [], []
    for layer_idx, cossim_matrix in selected:
        hist, _ = np.histogram(cossim_matrix.flatten(), bins=bins, density=True, range=(-1, 1))
        hist_data.append(hist)
        layer_fractions.append(layer_idx / denom)
    hist_matrix = np.array(hist_data)

    # With a single plotted layer the last fraction is 0, which would give the
    # image a zero-width extent; span the full axis in that case.
    x_max = layer_fractions[-1] or 1.0

    # NOTE: this changes the process-wide Matplotlib font setting.
    plt.rcParams["font.family"] = "sans-serif"
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    im = ax.imshow(
        hist_matrix.T,
        aspect="auto",
        origin="lower",
        cmap="Reds",
        extent=[0, x_max, -1, 1],
        vmin=0,
        vmax=10,
    )
    ax.tick_params(axis="both", which="major", labelsize=26)
    ax.set_xlabel("Layer Fraction", fontsize=36)
    ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    ax.set_xticklabels([0, 0.2, 0.4, 0.6, 0.8, 1])
    ax.set_ylabel("Cosine Similarity", fontsize=36)
    cbar = fig.colorbar(im, ax=ax)
    cbar.ax.tick_params(axis="both", which="major", labelsize=26)
    cbar.ax.set_title("Probability\nDensity", fontsize=20, pad=20)
    fig.tight_layout(pad=2)

    # `save_path` defaults to None; only touch the filesystem when a path was
    # actually supplied (previously None crashed in os.path.dirname).
    if save_path is not None:
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        fig.savefig(save_path, dpi=300)
    plt.close(fig)
    return fig
133
+
134
+
135
+ def _mean_cossim_per_layer(cossim_by_layer: Sequence[np.ndarray]) -> np.ndarray:
136
+ return np.array([float(m[np.triu_indices(m.shape[0], k=1)].mean()) for m in cossim_by_layer])
137
+
138
+
139
def measure_embedding_condensation(
    model: torch.nn.Module,
    tokenizer,
    *,
    texts: Optional[Sequence[str]] = None,
    dataset: str = "wikipedia",
    repetitions: int = 100,
    max_length: int = 512,
    min_word_count: int = 1024,
    max_word_count: int = 1280,
    include_logits_layer: bool = False,
    plot: bool = True,
    save_path: Optional[str] = "./test_embedding_condensation.png",
    show_progress: bool = True,
) -> CondensationResult:
    """
    Run the LM-Dispersion embedding condensation measurement pipeline.

    Pass `texts` to use fixed inputs. Otherwise samples
    random long text from a Hugging Face dataset each repetition.

    Args:
        model: Model to measure. It is switched to eval mode and inputs are
            moved to the device of its first parameter.
        tokenizer: Tokenizer called on each text with truncation to
            ``max_length``.
        texts: Fixed inputs, cycled through when ``repetitions`` exceeds
            their number. Per-layer matrices of all repetitions are
            concatenated, so every text must produce the same number of
            tokens.
        dataset: Dataset name passed to ``get_random_long_text`` when
            ``texts`` is not given.
        repetitions: Number of forward passes to average over.
        max_length: Maximum number of tokens per input.
        min_word_count: Forwarded to ``get_random_long_text``.
        max_word_count: Forwarded to ``get_random_long_text``.
        include_logits_layer: Also append a similarity matrix computed from
            the model's logits when the output provides them.
        plot: Whether to render the heatmap. ``False`` disables plotting
            even though ``save_path`` has a non-empty default.
        save_path: Destination of the heatmap image when ``plot`` is true.
        show_progress: Wrap the repetition loop in a tqdm progress bar.

    Returns:
        A ``CondensationResult`` holding the per-layer similarity matrices
        averaged over all repetitions.

    Raises:
        ValueError: If ``repetitions`` is smaller than 1 or ``texts`` is
            provided but empty.
    """
    # Explicit validation instead of a trailing `assert`, which is stripped
    # under `python -O`.
    if repetitions < 1:
        raise ValueError("`repetitions` must be at least 1.")
    if texts is not None and len(texts) == 0:
        raise ValueError("`texts` must be non-empty when provided.")

    model.eval()
    device = next(model.parameters()).device
    stacked: Optional[List[np.ndarray]] = None

    rep_iter = range(repetitions)
    if show_progress:
        rep_iter = tqdm(rep_iter, desc="condensation")
    for rep in rep_iter:
        # The repetition index doubles as the seed for torch and, when
        # sampling, for the dataset draw.
        torch.manual_seed(rep)
        if texts is not None:
            text = texts[rep % len(texts)]
        else:
            text = get_random_long_text(
                dataset,
                random_seed=rep,
                min_word_count=min_word_count,
                max_word_count=max_word_count,
            )
        curr = _forward_cossim(
            model, tokenizer, text, device, max_length, include_logits_layer
        )
        stacked = _stack_repetition(stacked, curr)

    averaged = [m.mean(axis=0) for m in stacked]
    result = CondensationResult(cossim_by_layer=np.stack(averaged, axis=0))

    # Previously `plot or save_path` meant the default save path forced a
    # plot even with `plot=False`. Require both an opt-in and a destination.
    if plot and save_path:
        plot_similarity_heatmap(averaged, save_path=save_path)
    return result
204
+
205
+
206
def _forward_cossim(
    model: torch.nn.Module,
    tokenizer,
    text: str,
    device: torch.device,
    max_length: int,
    include_logits_layer: bool,
) -> List[np.ndarray]:
    """Run one forward pass on ``text`` and return per-layer similarity matrices.

    The text is tokenized with truncation to ``max_length``, the model is run
    without gradient tracking, and a cosine-similarity matrix is computed for
    every entry of ``output.hidden_states``. When ``include_logits_layer`` is
    set and the output has a ``logits`` attribute, one more matrix computed
    from the normalized logits is appended.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**inputs, output_hidden_states=True)

    hidden_arrays = organize_embeddings(output.hidden_states)
    layer_sims = compute_cosine_similarities(hidden_arrays)

    # Nothing more to add unless logits were requested and are available.
    if not include_logits_layer or not hasattr(output, "logits"):
        return layer_sims

    unit_logits = torch.nn.functional.normalize(output.logits.squeeze(0).float(), dim=1)
    logit_sim = torch.matmul(unit_logits, unit_logits.T).clamp(-1, 1)
    layer_sims.append(logit_sim.cpu().numpy())
    return layer_sims
223
+
224
+
225
+ def _stack_repetition(
226
+ stacked: Optional[List[np.ndarray]],
227
+ curr: List[np.ndarray],
228
+ ) -> List[np.ndarray]:
229
+ clipped = [m.clip(-1, 1) for m in curr]
230
+ if stacked is None:
231
+ return [m[None, ...] for m in clipped]
232
+ return [
233
+ np.concatenate((stacked[i], clipped[i][None, ...]), axis=0)
234
+ for i in range(len(stacked))
235
+ ]
@@ -0,0 +1,42 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "embedding-condensation"
7
+ version = "0.1.0"
8
+ description = "Measure layer-wise token embedding cosine similarity (embedding condensation diagnostic)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { file = "LICENSE" }
12
+ authors = [{ name = "Chen Liu" }]
13
+ keywords = ["language-models", "embeddings", "transformers", "representation-geometry"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Science/Research",
17
+ "Programming Language :: Python :: 3",
18
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
19
+ ]
20
+ dependencies = [
21
+ "numpy>=1.24",
22
+ "torch>=2.0",
23
+ "transformers>=4.40,<4.48",
24
+ "datasets>=2.14",
25
+ "nltk>=3.8",
26
+ "matplotlib>=3.7",
27
+ "tqdm>=4.65",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ test = ["pytest>=7.0"]
32
+
33
+ [project.urls]
34
+ Homepage = "https://chenliu-1996.github.io/projects/LM-Dispersion/"
35
+ Repository = "https://github.com/ChenLiu-1996/LM-Dispersion"
36
+
37
+ [tool.hatch.build.targets.wheel]
38
+ packages = ["embedding_condensation"]
39
+
40
+ [tool.pytest.ini_options]
41
+ testpaths = ["tests"]
42
+ addopts = "-q"