expressivity 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.4
2
+ Name: expressivity
3
+ Version: 0.1.0
4
+ Summary: A package made to objectively compare the predicting power of neural network architectures implemented with torch.
5
+ Author-email: Clustery <bigarnaque@gmail.com>
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: matplotlib>=3.10.0
9
+ Requires-Dist: torch>=2.5.1
10
+ Provides-Extra: dev
11
+ Requires-Dist: black>=24.10.0; extra == "dev"
12
+ Requires-Dist: ruff>=0.9.0; extra == "dev"
13
+
14
+ Pour comparer 2 architectures, il faut définir une distribution de probabilité sur les paramètres de chaque réseau.
15
+ Cette distribution doit être définie directement lors de la création du réseau dans la méthode init.
16
+ Afin de comparer 2 architectures de manière équitable, il faut s'assurer qu'il y ait autant de paramètres pour chaque paire prise à indice égale dans les listes space.architecture.
17
+ Il n'y a pas ce problème lorsque l'on compare avec un réseau tier.
18
+ L'argument 'mesure' dans un espace sert à quantifier la 'taille' d'un réseau. Le mode 'parameters' pour l'argument 'automatic_mesurement_mode' permet de comptabiliser automatiquement le nombre de paramètres apprenables dans le modèle. Le mode par défaut 'information' permet lui de prendre en compte également la précision des paramètres pour comptabiliser l'intégralité des bits nécessaires pour encoder l'information des poids apprenables. L'utilisateur pourra lui-même définir ses propres métriques en définissant manuellement l'argument 'mesure' pour l'intégralité des modèles de l'espace.
19
+
20
+
21
+ Tests :
22
+ - Le fichier split_transformers_test.py permet de comparer un transformer classique à un transformer appliquant une fonction d'activation juste avant le produit entre Q et K. On effectue un test A/B entre les deux modèles.
23
+ - Le fichier n_diagonal_test.py compare l'usage d'une matrice n_diagonale, comparée à une matrice écrite sous la forme d'une LoRA. Le nombre de paramètres n'étant pas rigoureusement identique dans les 2 architectures, on se propose de comparer ces dernières à une architecture tier qui englobe les 2 architectures. On compare donc à un réseau constitué de matrices de poids pleines.
24
+
25
+ TODO:
26
+ - In the next version it would be great to let the user define its training routine directly in the ArchitecturalSpace class
27
+ - Lors de l'entrainement avec une architecture tier, on recalcule 2 fois des passes avant pour le modèle tier. On pourrait optimiser le temps de calcul en s'assurant que l'on génère une target, puis les 2 modèles concurrents doivent l'approximer (au prix d'un code plus long, à moins de repenser la structure logique)
@@ -0,0 +1,14 @@
1
+ Pour comparer 2 architectures, il faut définir une distribution de probabilité sur les paramètres de chaque réseau.
2
+ Cette distribution doit être définie directement lors de la création du réseau dans la méthode init.
3
+ Afin de comparer 2 architectures de manière équitable, il faut s'assurer qu'il y ait autant de paramètres pour chaque paire prise à indice égale dans les listes space.architecture.
4
+ Il n'y a pas ce problème lorsque l'on compare avec un réseau tier.
5
+ L'argument 'mesure' dans un espace sert à quantifier la 'taille' d'un réseau. Le mode 'parameters' pour l'argument 'automatic_mesurement_mode' permet de comptabiliser automatiquement le nombre de paramètres apprenables dans le modèle. Le mode par défaut 'information' permet lui de prendre en compte également la précision des paramètres pour comptabiliser l'intégralité des bits nécessaires pour encoder l'information des poids apprenables. L'utilisateur pourra lui-même définir ses propres métriques en définissant manuellement l'argument 'mesure' pour l'intégralité des modèles de l'espace.
6
+
7
+
8
+ Tests :
9
+ - Le fichier split_transformers_test.py permet de comparer un transformer classique à un transformer appliquant une fonction d'activation juste avant le produit entre Q et K. On effectue un test A/B entre les deux modèles.
10
+ - Le fichier n_diagonal_test.py compare l'usage d'une matrice n_diagonale, comparée à une matrice écrite sous la forme d'une LoRA. Le nombre de paramètres n'étant pas rigoureusement identique dans les 2 architectures, on se propose de comparer ces dernières à une architecture tier qui englobe les 2 architectures. On compare donc à un réseau constitué de matrices de poids pleines.
11
+
12
+ TODO:
13
+ - In the next version it would be great to let the user define its training routine directly in the ArchitecturalSpace class
14
+ - Lors de l'entrainement avec une architecture tier, on recalcule 2 fois des passes avant pour le modèle tier. On pourrait optimiser le temps de calcul en s'assurant que l'on génère une target, puis les 2 modèles concurrents doivent l'approximer (au prix d'un code plus long, à moins de repenser la structure logique)
@@ -0,0 +1,2 @@
1
+ from .probabilistic_density import ArchitectureComparator
2
+ from .space import ArchitecturalSpace
@@ -0,0 +1,391 @@
1
+ import torch
2
+ from torch import nn, optim
3
+ from expressivity.space import ArchitecturalSpace
4
+ import matplotlib.pyplot as plt
5
+
6
+
7
class ArchitectureComparator:
    """Compare two architectural spaces by repeatedly sampling random target
    networks and measuring how well freshly initialized networks from the
    other space can fit them (an expressivity A/B test).
    """

    def __init__(
        self,
        A_space: ArchitecturalSpace,
        B_space: ArchitecturalSpace,
        base_space: ArchitecturalSpace = None,
        criterion=nn.MSELoss(),
        law=torch.distributions.Normal(0, 1),
    ) -> None:
        """
        Initialize the ArchitectureComparator.

        Parameters:
        - A_space (ArchitecturalSpace): The first architectural space.
        - B_space (ArchitecturalSpace): The second architectural space.
        - base_space (ArchitecturalSpace, optional): A third, common space that
          both A and B fit instead of fitting each other.
        - criterion (nn.Module): Loss function used for training (default: nn.MSELoss).
        - law (torch.distributions.Distribution): Input-data distribution for sampling (default: Normal(0, 1)).

        Raises:
        - AssertionError: if the spaces have mismatched input sizes,
          architecture counts, or measurement modes.
        - Exception: any error raised by the shape dry-run below is re-raised
          after being reported.
        """
        self.A_space = A_space
        self.B_space = B_space
        self.base_space = base_space
        self.criterion = criterion
        self.law = law

        assert (
            A_space.input_size == B_space.input_size
        ), "The input size of the two models must be the same"

        self.input_size = A_space.input_size

        assert len(A_space.parameters) == len(
            B_space.parameters
        ), "The number of architectures must be the same in space A and B"
        self.count = len(A_space.parameters)

        assert (
            A_space.automatic_mesurement_mode == B_space.automatic_mesurement_mode
            or A_space.automatic_mesurement_mode is None
            or B_space.automatic_mesurement_mode is None
        ), "The automatic mesurement mode must be the same in space A and B"

        if A_space.mesurement != B_space.mesurement:
            print(
                "Warning: The mesurements of space A and B are different, you may not compare both model on an equal footing"
            )

        try:
            # Dry-run every space on a dummy batch once, up front, so shape
            # mismatches surface here instead of deep inside compare().
            test_tensor = torch.zeros((1, *self.input_size))
            A_output_size = self._create_model(A_space, 0)(test_tensor).shape
            B_output_size = self._create_model(B_space, 0)(test_tensor).shape

            assert (
                A_output_size == B_output_size
            ), "The output size of the two models must be the same"

            self.output_size = A_output_size[1:]

            if base_space is not None:
                assert (
                    self.input_size == base_space.input_size
                ), "The input size of the two models must be the same"
                base_output_size = self._create_model(base_space, 0)(test_tensor).shape
                assert (
                    self.output_size == base_output_size[1:]
                ), "The output size of the two models must be the same"
                assert len(base_space.parameters) == self.count

        except Exception as e:
            print("The input size is not correct", e)
            # BUGFIX: previously the error was swallowed here, leaving a
            # half-initialized comparator (no `output_size`) that failed
            # later with a confusing message. Re-raise instead.
            raise

    def compare(
        self,
        max_iterations: int = 10,
        sub_iterations: int = 1,
        variance_threashold: float | None = None,
        plot_mode: str | None = None,
    ) -> tuple[list[float], list[float], list[float], list[float]]:
        """
        Compare architectures by fitting one to the other and evaluating performance.

        Parameters:
        - max_iterations (int): Maximum number of target re-sampling iterations.
        - sub_iterations (int): Number of attempts of the source architecture to minimize error at each iteration.
        - variance_threashold (float, optional): Threshold to stop iterations based on variance.
        - plot_mode (str, optional): Plot comparison results; "min" or "mean".

        Returns:
        - tuple[list[float], list[float], list[float], list[float]]:
          (min_A_fit, mean_A_fit, min_B_fit, mean_B_fit), one entry per
          architecture index.
        """

        self.max_iterations = max_iterations
        self.sub_iterations = sub_iterations
        # A threshold of 0 disables early stopping (variance is never negative).
        self.variance_threashold = (
            0 if variance_threashold is None else variance_threashold
        )

        self.min_A_fit = [None for _ in range(self.count)]
        self.mean_A_fit = [None for _ in range(self.count)]
        self.min_B_fit = [None for _ in range(self.count)]
        self.mean_B_fit = [None for _ in range(self.count)]

        for i in range(self.count):
            print(f"Fitting model {i+1} out of {self.count}")
            if self.base_space is None:
                # No reference space: each architecture fits the other.
                print(f"{self.A_space.name} fits {self.B_space.name}")
                self.min_A_fit[i], self.mean_A_fit[i] = self._fit_source_to_target(
                    self.A_space, self.B_space, i
                )
                print(f"{self.B_space.name} fits {self.A_space.name}")
                self.min_B_fit[i], self.mean_B_fit[i] = self._fit_source_to_target(
                    self.B_space, self.A_space, i
                )
            else:
                # Reference space: both architectures fit the same base targets.
                print(f"{self.A_space.name} fits {self.base_space.name}")
                self.min_A_fit[i], self.mean_A_fit[i] = self._fit_source_to_target(
                    self.A_space, self.base_space, i
                )
                print(f"{self.base_space.name} fits {self.B_space.name}")
                self.min_B_fit[i], self.mean_B_fit[i] = self._fit_source_to_target(
                    self.B_space, self.base_space, i
                )

            # Lower loss wins.
            if self.min_B_fit[i] > self.min_A_fit[i]:
                self.winner = "A"
                print(f"Model {self.A_space.name} is better than {self.B_space.name}")
            else:
                self.winner = "B"
                print(f"Model {self.B_space.name} is better than {self.A_space.name}")
            # BUGFIX: the attribute used to be misspelled `winnner`; keep the
            # old name as an alias so existing callers do not break.
            self.winnner = self.winner

            if self.mean_B_fit[i] > self.mean_A_fit[i]:
                if self.winner == "A":
                    print(
                        f"Model {self.A_space.name} is better than {self.B_space.name} by any mean"
                    )
                else:
                    print(
                        f"However, model {self.A_space.name} shows better convergence in mean than {self.B_space.name}"
                    )
            else:
                if self.winner == "B":
                    print(
                        f"Model {self.B_space.name} is better than {self.A_space.name} by any mean"
                    )
                else:
                    print(
                        f"However, model {self.B_space.name} shows better convergence in mean than {self.A_space.name}"
                    )

        if plot_mode is not None:
            self.plot(plot_mode)

        return self.min_A_fit, self.mean_A_fit, self.min_B_fit, self.mean_B_fit

    def _create_model(self, space: ArchitecturalSpace, index: int) -> nn.Module:
        """
        Create a model from a given architecture and a set of parameters.

        Parameters:
        - space (ArchitecturalSpace): The architectural space.
        - index (int): Index of the model within the space.

        Returns:
        - nn.Module: The created model (freshly initialized).
        """

        return space.architecture(**space.parameters[index])

    def _fit_source_to_target(
        self,
        source_space: ArchitecturalSpace,
        target_space: ArchitecturalSpace,
        model_index: int,
    ) -> tuple[float, float]:
        """
        Fit a source model to match the behavior of a target model.

        Parameters:
        - source_space (ArchitecturalSpace): The source architectural space.
        - target_space (ArchitecturalSpace): The target architectural space.
        - model_index (int): Index of the model being compared.

        Returns:
        - tuple[float, float]: (minimum, mean) losses of the source model
          fitting the target, averaged over the completed iterations.
        """

        minimum = torch.tensor([torch.inf] * self.max_iterations)
        mean = torch.zeros(self.max_iterations)

        # Initialize epochs, grad_clamp and criterion
        epochs = source_space.epoch[model_index]
        grad_clamp = source_space.grad_clamp[model_index]
        criterion = self.criterion

        # We initialize mini_batch_count with both the target_space batch size
        # and the source_space mini batch size. The source mini-batch size is a
        # training hyperparameter of the source; the target batch size tells us
        # how many samples are needed to pin down the target's degrees of
        # freedom. When comparing without a base space, both spaces should have
        # similar measurements.
        mini_batch_count = (
            target_space.batch_size[model_index]
            // source_space.mini_batch_size[model_index]
        )
        mini_batch_size = source_space.mini_batch_size[model_index]
        shape = (
            mini_batch_count,
            mini_batch_size,
            *self.input_size,
        )

        completed = 0  # number of finished iterations (for early-stop stats)
        for i in range(self.max_iterations):
            print(f"Iteration {i+1}/{self.max_iterations}")
            # Generate data. BUGFIX: `X.detach()` used to be a no-op because
            # detach() is not in-place; keep the detached tensor.
            X = self.law.sample(shape).detach()

            # Initialize target model
            target_model = self._create_model(target_space, model_index)
            target_model.eval()

            # Forward pass into target model
            with torch.no_grad():
                target_output = target_model(
                    X.view(mini_batch_count * mini_batch_size, *self.input_size)
                )
                target_output = target_output.view(
                    mini_batch_count, mini_batch_size, *self.output_size
                )

            for j in range(self.sub_iterations):
                print(f"Sub-iteration {j+1}/{self.sub_iterations}")
                # Initialize source model
                source_model = self._create_model(source_space, model_index)
                optimizer = source_space.optimizer(
                    source_model.parameters(), source_space.lr[model_index]
                )
                # Informational pre-training score (return value intentionally
                # unused; the call prints the loss).
                self.test_model(
                    source_model,
                    criterion,
                    X,
                    target_output,
                )

                # Train source model to fit target model
                self.train_model(
                    source_model,
                    epochs,
                    criterion,
                    optimizer,
                    grad_clamp,
                    X,
                    target_output,
                )

                # Compute loss on the whole batch
                print("Computing score on the eval set...")
                loss = self.test_model(
                    source_model,
                    criterion,
                    X,
                    target_output,
                )

                minimum[i] = min(minimum[i], loss)
                mean[i] += loss

            mean[i] /= self.sub_iterations
            completed = i + 1

            # Empirical variance over the COMPLETED iterations only.
            # BUGFIX: the variance used to include the inf/zero placeholders
            # of iterations that had not run yet, producing nan/garbage.
            if completed >= 2:
                min_var = torch.var(minimum[:completed], unbiased=True)
                mean_var = torch.var(mean[:completed], unbiased=True)
                if max(min_var, mean_var) < self.variance_threashold:
                    break

        # BUGFIX: average over completed iterations only; averaging the full
        # buffers after an early break used to return inf for the minimum.
        return minimum[:completed].mean().item(), mean[:completed].mean().item()

    def test_model(
        self,
        model: nn.Module,
        criterion: nn.Module,
        X: torch.Tensor,
        y: torch.Tensor,
    ) -> float:
        """
        Evaluate a model on a given dataset.

        Parameters:
        - model (nn.Module): The model to evaluate.
        - criterion (nn.Module): Loss function.
        - X (torch.Tensor): Inputs, shape (mini_batch_count, mini_batch_size, *input_size).
        - y (torch.Tensor): Targets, same leading shape as X.

        Returns:
        - float: Mean loss over the mini-batches.
        """

        model.eval()
        loss = 0
        # No gradients are needed for scoring; skip graph construction.
        with torch.no_grad():
            for mini_batch, target in zip(X, y):
                output = model(mini_batch)
                loss += criterion(output, target)
        loss /= X.shape[0]
        print(f"Score on the whole set, loss: {loss}")
        return loss.item()

    def train_model(
        self,
        model: nn.Module,
        epochs: int,
        criterion: nn.Module,
        optimizer: optim.Optimizer,
        grad_clamp: float,
        X: torch.Tensor,
        y: torch.Tensor,
    ) -> None:
        """
        Train a model to minimize the loss between predicted and target outputs.

        Parameters:
        - model (nn.Module): The model to train.
        - epochs (int): Number of training epochs.
        - criterion (nn.Module): Loss function.
        - optimizer (optim.Optimizer): Optimizer for gradient updates.
        - grad_clamp (float): Maximum gradient value for clipping.
        - X (torch.Tensor): Inputs, shape (mini_batch_count, mini_batch_size, *input_size).
        - y (torch.Tensor): Targets, same leading shape as X.
        """
        model.train()
        for epoch in range(epochs):
            for mini_batch, target in zip(X, y):
                optimizer.zero_grad()
                output = model(mini_batch)
                loss = criterion(output, target)
                loss.backward()
                # Clamp gradients element-wise to stabilize training.
                torch.nn.utils.clip_grad_value_(model.parameters(), grad_clamp)
                optimizer.step()

            # Loss reported is the one of the last mini-batch of the epoch.
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

    def plot(self, mode: str) -> None:
        """
        Plot comparison results between architectures.

        Parameters:
        - mode (str): Plot type, "min" for minimum loss or "mean" for average loss.

        Raises:
        - ValueError: If the mode is not "min" or "mean".
        """

        if mode not in ["min", "mean"]:
            raise ValueError("Mode must be 'min' or 'mean'")

        if mode == "min":
            values_A = self.min_A_fit
            values_B = self.min_B_fit
        elif mode == "mean":
            values_A = self.mean_A_fit
            values_B = self.mean_B_fit

        plt.figure(figsize=(10, 5))
        plt.plot(
            self.A_space.mesurement,
            values_A,
            label=f"Architecture {self.A_space.name} ({mode})",
            marker="o",
        )
        plt.plot(
            self.B_space.mesurement,
            values_B,
            label=f"Architecture {self.B_space.name} ({mode})",
            marker="o",
        )
        plt.xlabel("Number of Parameters")
        plt.ylabel(f"{mode.capitalize()} Value")
        plt.title(f"Comparison of {mode.capitalize()} Values for Architectures A and B")
        plt.legend()
        plt.grid(True)
        plt.show()

    def get_densities(self):
        """
        Compute and return the density of the comparison.

        Returns:
        - To be implemented if mathematically cool.
        """
        pass
@@ -0,0 +1,142 @@
1
+ from copy import deepcopy
2
+ import torch
3
+ import torch.optim as optim
4
+ import torch.nn as nn
5
+ from typing import Dict, Any
6
+
7
+
8
+ class ArchitecturalSpace:
9
+ def __init__(
10
+ self,
11
+ input_size: tuple | torch.Size,
12
+ name: str = None,
13
+ architecture: nn.Module = None,
14
+ parameters: Dict[str, Any] | list[Dict[str, Any]] | None = None,
15
+ lr: float | list[float] = 0.001,
16
+ epoch: int | list[int] = 3,
17
+ batch_size: int | list[int] | None = None,
18
+ automatic_batch_size_scale: float | None = 1.0,
19
+ mesurement: float | list[float] | None = None,
20
+ automatic_mesurement_mode: str | None = "information",
21
+ mini_batch_size: int | list[int] = 16,
22
+ optimizer=optim.AdamW,
23
+ grad_clamp: int | list[int] = 1,
24
+ ) -> None:
25
+ """
26
+ Initializes an instance of the ArchitecturalSpace class.
27
+
28
+ Parameters:
29
+ - input_size (tuple | torch.Size): The size of the input data.
30
+ - name (str, optional): The name of the architectural space. Defaults to None.
31
+ - architecture (nn.Module, optional): The neural network architecture. Defaults to None.
32
+ - parameters (Dict[str, Any] | list[Dict[str, Any]] | None, optional): The parameters needed when initilizing the architecture. Defaults to None.
33
+ - lr (float | list[float], optional): Learning rate(s) for the optimizer. Defaults to 0.001.
34
+ - epoch (int | list[int], optional): Number of epochs for training. Defaults to 10.
35
+ - batch_size (int | list[int] | None, optional): Batch size(s) for training. Defaults to None.
36
+ - automatic_batch_size_scale (float | None, optional): Scale factor for automatic batch size calculation. Defaults to 10.0.
37
+ - mesurement (float | list[float] | None, optional): Measurement(s) for the architecture. Defaults to None.
38
+ - automatic_mesurement_mode (str | None, optional): Mode for automatic measurement calculation. Defaults to "information".
39
+ - mini_batch_size (int | list[int], optional): Mini-batch size(s) for training. Defaults to 16.
40
+ - optimizer (optional): Optimizer for training. Defaults to optim.AdamW.
41
+ - grad_clamp (int | list[int], optional): Gradient clamp value(s). Defaults to 1.
42
+
43
+ Returns:
44
+ - None
45
+ """
46
+
47
+ assert (
48
+ batch_size is not None or automatic_batch_size_scale is not None
49
+ ), "Either batch_size or automatic_batch_size_scale must be defined"
50
+ assert (
51
+ mesurement is not None or automatic_mesurement_mode is not None
52
+ ), "Either mesurement or automatic_mesurement_mode must be defined"
53
+
54
+ if mesurement is None:
55
+ assert automatic_mesurement_mode in [
56
+ "information",
57
+ "parameters",
58
+ ], "automatic_mesurement_mode must be either 'information' or 'parameters'"
59
+
60
+ self.name = name
61
+ self.architecture = architecture
62
+ self.parameters = parameters
63
+ self.lr = lr
64
+ self.epoch = epoch
65
+ self.automatic_mesurement_mode = automatic_mesurement_mode
66
+ self.mini_batch_size = mini_batch_size
67
+ self.optimizer = optimizer
68
+ self.input_size = input_size
69
+ self.grad_clamp = grad_clamp
70
+
71
+ if type(parameters) is not list:
72
+ self.parameters = [parameters]
73
+
74
+ list_size = len(self.parameters)
75
+
76
+ if automatic_mesurement_mode == "information":
77
+ self.mesurement_method = self.count_information
78
+ elif automatic_mesurement_mode == "parameters":
79
+ self.mesurement_method = self.count_parameters
80
+ else:
81
+ self.mesurement_method = None
82
+
83
+ if mesurement is None:
84
+ self.mesurement = [
85
+ self.mesurement_method(architecture(**params))
86
+ for params in self.parameters
87
+ ]
88
+ else:
89
+ self.mesurement = mesurement
90
+
91
+ if automatic_batch_size_scale is None:
92
+ self.batch_size = batch_size
93
+ else:
94
+ self.batch_size = [
95
+ int(automatic_batch_size_scale * mesure) for mesure in self.mesurement
96
+ ]
97
+
98
+ for attr_name, attr_value in vars(self).items():
99
+ if type(attr_value) is list:
100
+ assert (
101
+ len(attr_value) == list_size
102
+ ), "You should have as much elements in each list in your parameters"
103
+ elif attr_name in [
104
+ "lr",
105
+ "epoch",
106
+ "mini_batch_size",
107
+ "grad_clamp",
108
+ "batch_size",
109
+ "mesurement",
110
+ ]:
111
+ setattr(
112
+ self, attr_name, [deepcopy(attr_value) for _ in range(list_size)]
113
+ )
114
+
115
+ def count_parameters(self, model: nn.Module) -> int:
116
+ """
117
+ Counts the number of trainable parameters in a given neural network architecture.
118
+
119
+ Parameters:
120
+ - model (nn.Module): The neural network architecture for which the parameters are to be counted.
121
+
122
+ Returns:
123
+ - int: The total number of trainable parameters in the architecture.
124
+ """
125
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
126
+
127
+ def count_information(self, model: nn.Module) -> int:
128
+ """
129
+ Calculate and return the count of information in bit.
130
+
131
+ Returns:
132
+ int: The number of bit needed to code the list of all trainable parameters.
133
+ """
134
+ total_bits = 0
135
+
136
+ for p in model.parameters():
137
+ if p.requires_grad:
138
+ element_size_in_bytes = p.element_size()
139
+ element_size_in_bits = element_size_in_bytes * 8
140
+ total_bits += p.numel() * element_size_in_bits
141
+
142
+ return total_bits
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.4
2
+ Name: expressivity
3
+ Version: 0.1.0
4
+ Summary: A package made to objectively compare the predicting power of neural network architectures implented with torch.
5
+ Author-email: Clustery <bigarnaque@gmail.com>
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: matplotlib>=3.10.0
9
+ Requires-Dist: torch>=2.5.1
10
+ Provides-Extra: dev
11
+ Requires-Dist: black>=24.10.0; extra == "dev"
12
+ Requires-Dist: ruff>=0.9.0; extra == "dev"
13
+
14
+ Pour comparer 2 architectures, il faut définir une distribution de probabilité sur les paramtres de chaque réseau.
15
+ Cette distribution doit être définie directement lors de la création du réseau dans la méthode init.
16
+ Afin de comparer 2 architectures de manière équitable, il faut s'assurer qu'il y a ait autant de paramètres pour chaque paire prise à indice égale dans les listes space.architecture.
17
+ Il n'y a pas ce problème lorsque l'on compare avec un réseau tier.
18
+ L'argument 'mesure' dans un espace sert à quantifier la 'taille' d'un réseau. Le mode 'parameter' pour l'argument 'automatic_mesurement_mode' permet de comptabiliser automatiquement le nombre paramètres apprenables dans le modèle. Le mode par défaut 'information' permet lui de prendre en compte également la précision des paramètres pour comptabiliser l'intégralité des bits néccessaires pour encoder l'information des poids apprenables. L'utilisateur pourra lui-même définir ses propres métriques en définissant manuellement l'argument 'mesure' pour l'intégralité des modèles de l'espace.
19
+
20
+
21
+ Tests :
22
+ - Le fichier split_transformers_test.py permet de comparer un transformers classique à un transformer appliquant une fonction d'activation juste avant le produit entre Q et K. Un effectue un test A/B entre les deux modèles.
23
+ - Le fichier n_diagonal_test.py compare l'usage d'une matrice n_diagonale, comparée à une matrice écrite sous la forme d'une LoRA. Le nombre de paramètre n'étant pas régoureusement identique dans les 2 architectures, on se propose de comparer ces dernière à une architecture tier qui englobe les 2 architectures. On compare donc à un réseau constitué de matrices de poids pleines.
24
+
25
+ TODO:
26
+ - In the next version it would be great to let the user define its training routine directly in the ArchitecturalSpace class
27
+ - Lors de l'entrainement avec une architecture tier, on recalcule 2 fois des passes avant pour le modèle tier. On pourrait opitmiser le temps de calcul en s'assurant que l'on génère une target, puis les 2 modèles concurrents doivent l'approximer (au prix d'un code plus long, à moins de repenser la structure logique)
@@ -0,0 +1,21 @@
1
+ README.md
2
+ pyproject.toml
3
+ expressivity/__init__.py
4
+ expressivity/probabilistic_density.py
5
+ expressivity/space.py
6
+ expressivity.egg-info/PKG-INFO
7
+ expressivity.egg-info/SOURCES.txt
8
+ expressivity.egg-info/dependency_links.txt
9
+ expressivity.egg-info/requires.txt
10
+ expressivity.egg-info/top_level.txt
11
+ tests/cubic_transformer/attention.py
12
+ tests/cubic_transformer/cubic_transformer_test.py
13
+ tests/cubic_transformer/transformer.py
14
+ tests/n_diagonal/deep_nn.py
15
+ tests/n_diagonal/linear.py
16
+ tests/n_diagonal/lora.py
17
+ tests/n_diagonal/n_diagonal.py
18
+ tests/n_diagonal/n_diagonal_test.py
19
+ tests/split_transformer/attention.py
20
+ tests/split_transformer/split_transformer_test.py
21
+ tests/split_transformer/transformer.py
@@ -0,0 +1,6 @@
1
+ matplotlib>=3.10.0
2
+ torch>=2.5.1
3
+
4
+ [dev]
5
+ black>=24.10.0
6
+ ruff>=0.9.0
@@ -0,0 +1,2 @@
1
+ dist
2
+ expressivity
@@ -0,0 +1,22 @@
1
+ [project]
2
+ name = "expressivity"
3
+ version = "0.1.0"
4
+ authors = [
5
+ {name = "Clustery", email = "bigarnaque@gmail.com"}
6
+ ]
7
+ description = "A package made to objectively compare the predicting power of neural network architectures implemented with torch."
8
+ readme = "README.md"
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "matplotlib>=3.10.0",
12
+ "torch>=2.5.1",
13
+ ]
14
+
15
+ [project.optional-dependencies]
16
+ dev = [
17
+ "black>=24.10.0",
18
+ "ruff>=0.9.0",
19
+ ]
20
+
21
+ [tool.setuptools.packages.find]
22
+ exclude = ["notebook", "notebook.*", "tests", "tests.*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,73 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import math
5
+
6
+ """
7
+ Cette classe implémente l'attention multi-tête d'OpenAI (et non celle du papier originel 'Attention is all you need').
8
+ La différence principale est que les têtes d'attention sont sommées au lieu d'être concaténées. Cela n'implique donc pas de respecter la contrainte d_model % n_heads == 0.
9
+ L'autre différence est qu'il n'y a pas de couche linéaire traditionnellement appelée 'O' (pour 'output') appliquée à la fin du mécanisme d'attention.
10
+ """
11
+
12
+
13
class MultiHeadAttention(nn.Module):
    """OpenAI-style multi-head attention.

    Head outputs are summed instead of concatenated (so d_model need not be
    divisible by n_heads) and there is no final 'O' output projection.
    When `cubic` is True a softmax is applied to the attention scores;
    otherwise the raw scaled scores are used directly.
    """

    def __init__(
        self,
        d_model,
        n_heads,
        cubic=False,
    ):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.cubic = cubic

        # One bias-free projection per role, producing n_heads stacked
        # d_model-wide projections each.
        self.Q = nn.Linear(d_model, d_model * n_heads, False)
        self.K = nn.Linear(d_model, d_model * n_heads, False)
        self.V = nn.Linear(d_model, d_model * n_heads, False)

    def forward(
        self,
        x: torch.Tensor,
    ):
        """
        x: tensor of shape (batch_size, seq_len, d_model); returns a tensor
        of the same shape (heads are summed back together).
        """
        n_batches, n_tokens, _ = x.size()

        # Project into Q, K, V and fold the head axis into the batch axis.
        query, key, value = (
            self._reshape_to_batches(proj(x)) for proj in (self.Q, self.K, self.V)
        )

        # Scaled dot-product scores: (batch*heads, seq, seq).
        head_dim = query.size()[-1]
        scores = query.matmul(key.transpose(-2, -1)) / math.sqrt(head_dim)

        attention = F.softmax(scores, dim=-1) if self.cubic else scores

        weighted = attention.matmul(value)

        # Unfold the heads and sum them (instead of the usual concatenation).
        weighted = weighted.reshape(n_batches, self.n_heads, n_tokens, self.d_model)
        return weighted.sum(dim=1)

    def _reshape_to_batches(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """
        x: input tensor with shape (batch_size, seq_len, d_model*n_heads)

        Returns:
            Reshaped tensor with shape (batch_size*n_heads, seq_len, d_model)
        """
        n_batches, n_tokens, _ = x.size()
        split = x.reshape(n_batches, n_tokens, self.n_heads, self.d_model)
        return split.transpose(1, 2).reshape(
            n_batches * self.n_heads, n_tokens, self.d_model
        )
@@ -0,0 +1,64 @@
1
from transformer import Transformer
from expressivity.space import ArchitecturalSpace
from expressivity.probabilistic_density import ArchitectureComparator
from torch import optim

"""
In this example we compare the OpenAI-style Transformer architecture with the
mathematically simplest network allowing attention. One fully connected layer
was purposely removed from the original architecture so that both
architectures hold the same number of parameters.
"""

# Shared hyperparameters for both architectures.
d_model = 6
seq_length = 5
n_heads = 1
d_ff = 6
max_depth = 4


def _build_params(use_cubic):
    """One parameter dict per depth (4 .. 4 + max_depth - 1); `cubic` is only
    set for the cubic variant so both spaces differ by that flag alone."""
    params = []
    for i in range(max_depth):
        config = {
            "d_model": d_model,
            "n_heads": n_heads,
            "d_ff": d_ff,
            "depth": i + 4,
        }
        if use_cubic:
            config["cubic"] = True
        params.append(config)
    return params


# Create competing architectures
cubic_transformer_params = _build_params(True)
transformer_params = _build_params(False)

# Create architectural spaces; deeper models get more training epochs.
epoch = [i + 3 for i in range(max_depth)]

cubic_transformer_space = ArchitecturalSpace(
    (seq_length, d_model),
    "Cubic Transformer",
    Transformer,
    cubic_transformer_params,
    epoch=epoch,
    optimizer=optim.AdamW,
)

transformer_space = ArchitecturalSpace(
    (seq_length, d_model),
    "Transformer",
    Transformer,
    transformer_params,
    epoch=epoch,
)

# Create comparator and run the A/B comparison.
comparator = ArchitectureComparator(cubic_transformer_space, transformer_space)

res = comparator.compare()
print(res)
comparator.plot("min")
@@ -0,0 +1,107 @@
1
+ import sys
2
+ import os
3
+
4
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
5
+
6
+ from tests.cubic_transformer.attention import MultiHeadAttention
7
+ import torch.nn as nn
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+ class Transformer(nn.Module):
12
+ def __init__(
13
+ self,
14
+ d_model,
15
+ n_heads,
16
+ d_ff,
17
+ depth,
18
+ dropout=0.1,
19
+ cubic=False,
20
+ ):
21
+ super(Transformer, self).__init__()
22
+ self.d_model = d_model
23
+
24
+ # Liste des couches de l'encodeur
25
+ self.encoder_layers = nn.ModuleList(
26
+ [
27
+ TransformerEncoderLayer(
28
+ d_model,
29
+ n_heads,
30
+ d_ff,
31
+ dropout,
32
+ cubic,
33
+ )
34
+ for _ in range(depth)
35
+ ]
36
+ )
37
+
38
+ def forward(self, x):
39
+ """
40
+ x : Tensor de taille (batch_size, seq_len, d_model)
41
+ """
42
+ for layer in self.encoder_layers:
43
+ x = layer(x)
44
+ return x
45
+
46
+
47
+ class TransformerEncoderLayer(nn.Module):
48
+ def __init__(
49
+ self,
50
+ d_model,
51
+ n_heads,
52
+ d_ff,
53
+ dropout=0.1,
54
+ cubic=False,
55
+ ):
56
+ super(TransformerEncoderLayer, self).__init__()
57
+ self.self_attention = MultiHeadAttention(
58
+ d_model, n_heads, cubic
59
+ )
60
+
61
+ self.cubic = cubic
62
+
63
+ self.fc = nn.Linear(d_model, d_model)
64
+ self.fc_1 = nn.Linear(d_model, d_ff)
65
+ self.fc_2 = nn.Linear(d_ff, d_model)
66
+
67
+ self.activation = nn.ReLU()
68
+ self.layer_norm1 = nn.LayerNorm(d_model)
69
+ self.layer_norm2 = nn.LayerNorm(d_model)
70
+ self.dropout = nn.Dropout(dropout)
71
+
72
+ self.previous_weights = None
73
+
74
+ def forward(self, x):
75
+ """
76
+ x : Tensor de taille (batch_size, seq_len, d_model)
77
+ """
78
+ # Attention multi-têtes
79
+ attn_output = self.self_attention(x)
80
+ if self.cubic:
81
+ x = attn_output + self.fc(x)
82
+ else:
83
+ # x = self.layer_norm1(x + self.dropout(attn_output))
84
+ x = F.normalize(x + self.dropout(attn_output), p=2, dim=-1)
85
+ # x = F.normalize(x + attn_output, p=2, dim=-1)
86
+ x = self.fc(x)
87
+
88
+ # Réseau feed-forward
89
+ # ff_output = self.fc_2(self.activation(self.fc_1(x)))
90
+ ff_output = x
91
+ # x = self.layer_norm2(x + self.dropout(ff_output))
92
+ x = F.normalize(x + self.dropout(ff_output), p=2, dim=-1)
93
+ # x = F.normalize(x + ff_output, p=2, dim=-1)
94
+ # self.check_weights()
95
+ return x
96
+
97
+ def check_weights(self):
98
+ current_weights = {name: param.clone() for name, param in self.named_parameters()}
99
+
100
+ if self.previous_weights is not None:
101
+ for name, param in current_weights.items():
102
+ param = torch.round(torch.clamp(param, -1, 2))
103
+ self.previous_weights[name] = torch.round(torch.clamp(self.previous_weights[name], -1, 2))
104
+ pass
105
+ # assert torch.equal(param, self.previous_weights[name]), f"Les poids pour {name} ont changé."
106
+
107
+ self.previous_weights = current_weights
@@ -0,0 +1,35 @@
1
+ import torch.nn as nn
2
+ from n_diagonal import NDiagonalLayer
3
+ from lora import LoRALayer
4
+ from linear import LinearLayer
5
+
6
+
7
+ class DeepNetwork(nn.Module):
8
+ def __init__(
9
+ self, layer_type="fully_connected", dim=10, depth=1, rank=1, bias=True
10
+ ):
11
+ super(DeepNetwork, self).__init__()
12
+ self.layers = nn.ModuleList()
13
+ self.norms = nn.ModuleList()
14
+ self.activation = nn.ReLU()
15
+
16
+ for _ in range(depth):
17
+ if layer_type == "n_diagonal":
18
+ self.layers.append(NDiagonalLayer(dim, rank, bias))
19
+ elif layer_type == "LoRA":
20
+ self.layers.append(LoRALayer(dim, rank, bias))
21
+ elif layer_type == "fully_connected":
22
+ self.layers.append(LinearLayer(dim, bias))
23
+ else:
24
+ raise ValueError(f"Type de couche non supporté : {layer_type}")
25
+
26
+ self.norms.append(nn.LayerNorm(dim))
27
+
28
+ def forward(self, x):
29
+ for layer, norm in zip(self.layers, self.norms):
30
+ skip = x
31
+ x = layer(x)
32
+ x = self.activation(x)
33
+ x = x + skip
34
+ x = norm(x)
35
+ return x
@@ -0,0 +1,24 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from math import sqrt
4
+
5
+
6
+ class LinearLayer(nn.Module):
7
+ def __init__(self, dim, bias=True):
8
+ super(LinearLayer, self).__init__()
9
+ self.dim = dim
10
+
11
+ std = sqrt(2 / dim)
12
+ # std = 0.0001
13
+
14
+ # Matrix
15
+ self.weight = nn.Parameter(torch.randn(dim, dim) * std)
16
+
17
+ # Optionnel : biais
18
+ self.bias = nn.Parameter(torch.randn(dim)) if bias else None
19
+
20
+ def forward(self, x):
21
+ out = x @ self.weight
22
+ if self.bias is not None:
23
+ out = out + self.bias
24
+ return out
@@ -0,0 +1,27 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from math import sqrt
4
+
5
+
6
+ class LoRALayer(nn.Module):
7
+ def __init__(self, dim, rank, bias=True):
8
+ super(LoRALayer, self).__init__()
9
+ self.dim = dim
10
+ self.rank = rank
11
+
12
+ # std = sqrt(2/rank)
13
+ std = 0.0001
14
+
15
+ # Matrices LoRA
16
+ self.down_proj = nn.Parameter(torch.randn(dim, rank) * std)
17
+ self.up_proj = nn.Parameter(torch.randn(rank, dim) * std)
18
+
19
+ # Optionnel : biais
20
+ self.bias = nn.Parameter(torch.randn(dim)) if bias else None
21
+
22
+ def forward(self, x):
23
+ # Application de la réduction puis de la projection
24
+ lora_out = x @ self.down_proj @ self.up_proj
25
+ if self.bias is not None:
26
+ lora_out += self.bias
27
+ return lora_out
@@ -0,0 +1,41 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from math import sqrt
4
+
5
+
6
+ class NDiagonalLayer(nn.Module):
7
+ def __init__(self, dim, rank, bias=True):
8
+ super(NDiagonalLayer, self).__init__()
9
+ assert rank > 0, "Le nombre de diagonales doit être positif."
10
+
11
+ self.dim = dim
12
+ self.rank = rank
13
+
14
+ # std = sqrt(2/rank)
15
+ std = 0.0001
16
+
17
+ # Matrice de poids limitée à n diagonales
18
+ self.diagonal_weights = nn.Parameter(torch.randn(dim) * std)
19
+ self.lower_weights = [
20
+ nn.Parameter(torch.randn(dim - (i + 1)) * std) for i in range(rank - 1)
21
+ ]
22
+ self.upper_weights = [
23
+ nn.Parameter(torch.randn(dim - (i + 1)) * std) for i in range(rank - 1)
24
+ ]
25
+
26
+ # Optionnel : biais
27
+ self.bias = nn.Parameter(torch.randn(dim)) if bias else None
28
+
29
+ def forward(self, x):
30
+ weigths = torch.zeros((self.dim, self.dim))
31
+ weigths = weigths + torch.diag(self.diagonal_weights)
32
+ for i, (lower_weights, upper_weights) in enumerate(
33
+ zip(self.lower_weights, self.upper_weights)
34
+ ):
35
+ weigths = weigths + torch.diag(lower_weights, diagonal=-(i + 1))
36
+ weigths = weigths + torch.diag(upper_weights, diagonal=i + 1)
37
+
38
+ out = x @ weigths
39
+ if self.bias is not None:
40
+ out += self.bias
41
+ return out
@@ -0,0 +1,90 @@
1
+ import sys
2
+ import os
3
+
4
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
5
+
6
+ from expressivity.probabilistic_density import ArchitectureComparator
7
+ from expressivity.space import ArchitecturalSpace
8
+ from tests.n_diagonal.deep_nn import DeepNetwork
9
+
10
+ """
11
+ In this example we are comparing the reduction of parameter count when using the usual LoRA and an n-diagonal matrix instead
12
+ As the parameter count does not scale the same way, we compare both architectures to an architecture with fully connected layers
13
+ """
14
+
15
+ # Create competing architectures
16
+ n_diagonal_params = [
17
+ {
18
+ "layer_type": "n_diagonal",
19
+ "dim": 5,
20
+ "depth": 4,
21
+ "rank": i + 1,
22
+ "bias": True,
23
+ }
24
+ for i in range(5)
25
+ ]
26
+
27
+ lora_params = [
28
+ {
29
+ "layer_type": "LoRA",
30
+ "dim": 5,
31
+ "depth": 4,
32
+ "rank": i + 1,
33
+ "bias": True,
34
+ }
35
+ for i in range(5)
36
+ ]
37
+
38
+ fully_connected_params = [
39
+ {
40
+ "layer_type": "fully_connected",
41
+ "dim": 5,
42
+ "depth": 4,
43
+ "bias": True,
44
+ }
45
+ for _ in range(5)
46
+ ]
47
+
48
+
49
+ def compute_params(dim, depth, rank, bias):
50
+ return depth * (dim + 2 * dim * rank - rank * (rank + 1) + dim * bias)
51
+
52
+
53
+ # Create architectural spaces
54
+ n_diagonal_space = ArchitecturalSpace(
55
+ (5,),
56
+ "N-Diagonal",
57
+ DeepNetwork,
58
+ n_diagonal_params,
59
+ epoch=10,
60
+ lr=0.01,
61
+ automatic_mesurement_mode=None,
62
+ mesurement=[compute_params(5, 4, i, True) for i in range(5)],
63
+ )
64
+
65
+ lora_space = ArchitecturalSpace(
66
+ (5,),
67
+ "LoRA",
68
+ DeepNetwork,
69
+ lora_params,
70
+ epoch=10,
71
+ lr=0.01,
72
+ automatic_mesurement_mode="parameters",
73
+ )
74
+
75
+ fully_connected_space = ArchitecturalSpace(
76
+ (5,),
77
+ "Fully Connected",
78
+ DeepNetwork,
79
+ fully_connected_params,
80
+ epoch=10,
81
+ lr=0.01,
82
+ automatic_mesurement_mode="parameters",
83
+ )
84
+
85
+ # Create comparator
86
+ comparator = ArchitectureComparator(n_diagonal_space, lora_space, fully_connected_space)
87
+ # comparator = ArchitectureComparator(lora_space, n_diagonal_space, fully_connected_space)
88
+ res = comparator.compare(100, 5)
89
+ print(res)
90
+ comparator.plot("min")
@@ -0,0 +1,57 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import math
5
+
6
+
7
+ class MultiHeadAttention(nn.Module):
8
+ def __init__(self, d_model, n_heads, split=False):
9
+ super(MultiHeadAttention, self).__init__()
10
+ assert d_model % n_heads == 0, "d_model doit être divisible par n_heads"
11
+ self.d_model = d_model
12
+ self.n_heads = n_heads
13
+ self.head_dim = d_model // n_heads
14
+
15
+ # Projections linéaires pour Q, K, V
16
+ self.qkv_proj = nn.Linear(d_model, 3 * d_model, False)
17
+ self.fc_out = nn.Linear(d_model, d_model)
18
+ self.scale = math.sqrt(self.head_dim)
19
+ self.split = split
20
+
21
+ if split:
22
+ # self.activation = nn.ReLU()
23
+ self.activation = nn.Tanh()
24
+
25
+ def forward(self, query, key, value):
26
+ """
27
+ query, key, value : Tensor de taille (batch_size, seq_len, d_model)
28
+ """
29
+ batch_size, seq_len, _ = query.size()
30
+
31
+ # Calculer Q, K, V et les diviser en têtes
32
+ qkv = self.qkv_proj(query).chunk(3, dim=-1)
33
+ query, key, value = [
34
+ x.view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
35
+ for x in qkv
36
+ ]
37
+
38
+ if self.split:
39
+ query = self.activation(query)
40
+ key = self.activation(key)
41
+
42
+ # Produit scalaire pour l'attention
43
+ scores = torch.matmul(query, key.transpose(-2, -1)) / self.scale
44
+ attention = F.softmax(scores, dim=-1)
45
+
46
+ # Appliquer les poids d'attention sur V
47
+ attn_output = torch.matmul(attention, value)
48
+
49
+ # Réassembler les têtes
50
+ attn_output = (
51
+ attn_output.transpose(1, 2)
52
+ .contiguous()
53
+ .view(batch_size, seq_len, self.d_model)
54
+ )
55
+
56
+ # Projection finale
57
+ return self.fc_out(attn_output)
@@ -0,0 +1,50 @@
1
+ import sys
2
+ import os
3
+
4
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
5
+
6
+ from expressivity.probabilistic_density import ArchitectureComparator
7
+ from expressivity.space import ArchitecturalSpace
8
+ from tests.split_transformer.transformer import Transformer
9
+
10
+ """
11
+ In this example we are comparing 2 transformers with the same general architecture
12
+ The only difference will be that one will apply an activation function before multiplying Q and K together
13
+ """
14
+
15
+ # Create competing architectures
16
+ transformer_params = [
17
+ {
18
+ "d_model": 6,
19
+ "n_heads": 3,
20
+ "d_ff": 6,
21
+ "num_layers": i + 1,
22
+ }
23
+ for i in range(4)
24
+ ]
25
+
26
+ split_transformer_params = [
27
+ {
28
+ "d_model": 6,
29
+ "n_heads": 3,
30
+ "d_ff": 6,
31
+ "num_layers": i + 1,
32
+ "split": True,
33
+ }
34
+ for i in range(4)
35
+ ]
36
+
37
+ # Create architectural spaces
38
+ transformer_space = ArchitecturalSpace(
39
+ (5, 6), "transformers", Transformer, transformer_params
40
+ )
41
+
42
+ split_transformer_space = ArchitecturalSpace(
43
+ (5, 6), "split_transformers", Transformer, split_transformer_params
44
+ )
45
+
46
+ # Create comparator
47
+ comparator = ArchitectureComparator(transformer_space, split_transformer_space)
48
+ res = comparator.compare()
49
+ print(res)
50
+ comparator.plot("min")
@@ -0,0 +1,50 @@
1
+ import torch.nn as nn
2
+ from tests.split_transformer.attention import MultiHeadAttention
3
+
4
+
5
+ class Transformer(nn.Module):
6
+ def __init__(self, d_model, n_heads, d_ff, num_layers, dropout=0.1, split=False):
7
+ super(Transformer, self).__init__()
8
+ self.d_model = d_model
9
+
10
+ # Liste des couches de l'encodeur
11
+ self.encoder_layers = nn.ModuleList(
12
+ [
13
+ TransformerEncoderLayer(d_model, n_heads, d_ff, dropout, split)
14
+ for _ in range(num_layers)
15
+ ]
16
+ )
17
+
18
+ def forward(self, x):
19
+ """
20
+ x : Tensor de taille (batch_size, seq_len, d_model)
21
+ """
22
+ for layer in self.encoder_layers:
23
+ x = layer(x)
24
+ return x
25
+
26
+
27
+ class TransformerEncoderLayer(nn.Module):
28
+ def __init__(self, d_model, n_heads, d_ff, dropout=0.1, split=False):
29
+ super(TransformerEncoderLayer, self).__init__()
30
+ self.self_attention = MultiHeadAttention(d_model, n_heads, split)
31
+ self.feed_forward = nn.Sequential(
32
+ nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model)
33
+ )
34
+ self.layer_norm1 = nn.LayerNorm(d_model)
35
+ self.layer_norm2 = nn.LayerNorm(d_model)
36
+ self.dropout = nn.Dropout(dropout)
37
+
38
+ def forward(self, x):
39
+ """
40
+ x : Tensor de taille (batch_size, seq_len, d_model)
41
+ """
42
+ # Attention multi-têtes
43
+ attn_output = self.self_attention(x, x, x)
44
+ x = self.layer_norm1(x + self.dropout(attn_output))
45
+
46
+ # Réseau feed-forward
47
+ ff_output = self.feed_forward(x)
48
+ x = self.layer_norm2(x + self.dropout(ff_output))
49
+
50
+ return x