@epfml/discojs 3.0.1-p20250402090722.0 → 3.0.1-p20250429140233.0

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
@@ -1,5 +1,72 @@
  import * as tf from '@tensorflow/tfjs';
  import type { GPTConfig } from './config.js';
+ import type { ModelSize } from './config.js';
+ /**
+  * Defines a range, from 0 to T, that is used to create positional embeddings
+  */
+ export declare class Range extends tf.layers.Layer {
+     static readonly className = "Range";
+     computeOutputShape(inputShape: tf.Shape | tf.Shape[]): tf.Shape | tf.Shape[];
+     call(input: tf.Tensor | tf.Tensor[], kwargs: Record<string, unknown>): tf.Tensor | tf.Tensor[];
+ }
+ export type CausalSelfAttentionConfig = ConstructorParameters<typeof tf.layers.Layer>[0] & Record<'contextLength' | 'nHead' | 'nEmbd' | 'dropout' | 'nLayer' | 'seed', number>;
+ export declare class CausalSelfAttention extends tf.layers.Layer {
+     private readonly config;
+     static readonly className = "CausalSelfAttention";
+     private readonly nHead;
+     private readonly nEmbd;
+     private readonly nLayer;
+     private readonly dropout;
+     private readonly seed;
+     private readonly mask;
+     cAttnKernel?: tf.LayerVariable;
+     cAttnBias?: tf.LayerVariable;
+     cProjKernel?: tf.LayerVariable;
+     cProjBias?: tf.LayerVariable;
+     constructor(config: CausalSelfAttentionConfig);
+     build(): void;
+     computeOutputShape(inputShape: tf.Shape | tf.Shape[]): tf.Shape | tf.Shape[];
+     getConfig(): tf.serialization.ConfigDict;
+     call(input: tf.Tensor | tf.Tensor[], kwargs: Record<string, unknown>): tf.Tensor;
+     dense(x: tf.Tensor, kernel: tf.LayerVariable, bias: tf.LayerVariable): tf.Tensor;
+     splitHeads(x: tf.Tensor, B: number, T: number, nHead: number): tf.Tensor;
+     applyCausalMask(att: tf.Tensor, T: number): tf.Tensor;
+     computeAttention(q: tf.Tensor, k: tf.Tensor, training: boolean, T: number): tf.Tensor;
+ }
+ /**
+  * GELU with tanh approximation
+  * GELU(x) = x * 0.5 * (1 + Tanh[sqrt(2/π) * (x + 0.044715 * x^3)])
+  *
+  * https://pytorch.org/docs/stable/generated/torch.nn.GELU.html
+  */
+ export declare class GELU extends tf.layers.Layer {
+     static readonly className = "GELU";
+     constructor();
+     computeOutputShape(inputShape: tf.Shape | tf.Shape[]): tf.Shape | tf.Shape[];
+     call(input: tf.Tensor | tf.Tensor[], kwargs: Record<string, unknown>): tf.Tensor | tf.Tensor[];
+ }
+ export type MLPConfig = ConstructorParameters<typeof tf.layers.Layer>[0] & Required<ModelSize> & Record<'contextLength' | 'residDrop' | 'nLayer' | 'seed', number>;
+ export declare function MLP(config: MLPConfig): tf.LayersModel;
+ /**
+  * LanguageModelEmbedding is a layer that combines the token embeddings and the language modeling head,
+  * i.e., LMEmbedding is used to translate token indices into token embeddings
+  * as well as to project embeddings back into token indices.
+  * The GPT2 model uses the same embedding matrix for both the token embeddings and the language modeling head.
+  * Because TensorFlow.js doesn't offer an easy weight sharing mechanism, we need to define a custom layer
+  * that can be used for both the token embeddings and the language modeling head.
+  * In the GPT2 model definition, this layer corresponds to wte and lm_head (which reuses wte)
+  */
+ export declare class LMEmbedding extends tf.layers.Layer {
+     private readonly vocabSize;
+     private readonly nEmbd;
+     private readonly seed;
+     static readonly className = "LMEmbedding";
+     embeddings?: tf.LayerVariable;
+     constructor(vocabSize: number, nEmbd: number, seed: number);
+     build(): void;
+     computeOutputShape(inputShape: tf.Shape | tf.Shape[]): tf.Shape | tf.Shape[];
+     call(input: tf.Tensor | tf.Tensor[], kwargs: Record<string, unknown>): tf.Tensor | tf.Tensor[];
+ }
  /**
   * The GPTArchitecture specifically defines a GPT forward pass, i.e.,
   * what are the inputs, the successive transformer blocks and the outputs. It is then
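
Note: the GELU declaration above documents the tanh approximation used by the package. As a minimal standalone sketch of that formula in TensorFlow.js (illustrative only; `geluTanh` is a hypothetical helper, not the package's compiled implementation):

    import * as tf from '@tensorflow/tfjs';

    // GELU(x) = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    function geluTanh(x: tf.Tensor): tf.Tensor {
        return tf.tidy(() => {
            const inner = tf.mul(Math.sqrt(2 / Math.PI), tf.add(x, tf.mul(0.044715, tf.pow(x, 3))));
            return tf.mul(tf.mul(x, 0.5), tf.add(1, tf.tanh(inner)));
        });
    }

    geluTanh(tf.tensor1d([-1, 0, 1])).print(); // ≈ [-0.1588, 0, 0.8412]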
@@ -4,7 +4,7 @@ const debug = createDebug("discojs:models:gpt:layers");
  /**
   * Defines a range, from 0 to T, that is used to create positional embeddings
   */
- class Range extends tf.layers.Layer {
+ export class Range extends tf.layers.Layer {
      static className = 'Range';
      computeOutputShape(inputShape) {
          return inputShape;
@@ -56,7 +56,7 @@ class LogLayer extends tf.layers.Layer {
      }
  }
  tf.serialization.registerClass(LogLayer);
- class CausalSelfAttention extends tf.layers.Layer {
+ export class CausalSelfAttention extends tf.layers.Layer {
      config;
      static className = 'CausalSelfAttention';
      nHead;
@@ -86,8 +86,7 @@ class CausalSelfAttention extends tf.layers.Layer {
      }
      build() {
          // key, query, value projections for all heads, but in a batch
-         this.cAttnKernel = this.addWeight('c_attn.weight', [this.nEmbd, 3 * this.nEmbd], 'float32', tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: this.seed }) // use same init as GPT2
-         );
+         this.cAttnKernel = this.addWeight('c_attn.weight', [this.nEmbd, 3 * this.nEmbd], 'float32', tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: this.seed }));
          this.cAttnBias = this.addWeight('c_attn.bias', [3 * this.nEmbd], 'float32', tf.initializers.zeros());
          // output projection
          this.cProjKernel = this.addWeight('c_proj.kernel', [this.nEmbd, this.nEmbd], 'float32',
@@ -97,7 +96,9 @@ class CausalSelfAttention extends tf.layers.Layer {
          // https://github.com/karpathy/build-nanogpt/blob/6104ab1b53920f6e2159749676073ff7d815c1fa/train_gpt2.py#L103
          // https://youtu.be/l8pRSuU81PU?si=5GcKfi_kPgLgvtg2&t=4640
          tf.initializers.randomNormal({
-             mean: 0, stddev: 0.02 * Math.sqrt(2 * this.nLayer), seed: this.seed
+             mean: 0,
+             stddev: 0.02 * Math.sqrt(2 * this.nLayer),
+             seed: this.seed
          }));
          this.cProjBias = this.addWeight('c_proj.bias', [this.nEmbd], 'float32', tf.initializers.zeros());
      }
@@ -122,59 +123,72 @@ class CausalSelfAttention extends tf.layers.Layer {
              input = input[0];
          }
          this.invokeCallHook(input, kwargs);
-         const dense = (x, kernel, bias) => {
-             // TODO: use broadcasting when tfjs will support backpropagating through broadcasting
-             const k = kernel.read().expandDims(0).tile([x.shape[0], 1, 1]);
-             const m = x.matMul(k);
-             return tf.add(m, bias.read());
-         };
+         // --- Use helper methods below to build the computation ---
          // Apply attention weights to inputs as one big matrix which is then split into the
          // query, key and value submatrices
          // nHead is "number of heads", hs is "head size", and C (number of channels) = n_embd = nHead * hs
-         // e.g. in GPT-2 (124M), nHead = 12, hs = 64, so nHead * hs = C = 768 channels in the Transformer
-         const cAttn = dense(input, this.cAttnKernel, this.cAttnBias);
+         // e.g. in GPT-2 (124M), nHead = 12, hs = 64, so nHead * hs = C = 768 channels in the Transformer
+         const cAttn = this.dense(input, this.cAttnKernel, this.cAttnBias);
          let [q, k, v] = tf.split(cAttn, 3, -1);
          // Follow naming conventions in https://github.com/karpathy/build-nanogpt/
          const [B, T, C] = k.shape; // batch size, sequence length, embedding dimensionality (number of channels)
-         const splitHeads = (x) => tf.transpose(tf.reshape(x, [B, T, this.nHead, C / this.nHead]), // (B, T, nHead, head size)
-         [0, 2, 1, 3] // (B, nHead, T, hs)
-         );
-         q = splitHeads(q); // (B, nHead, T, hs)
-         k = splitHeads(k); // (B, nHead, T, hs)
-         v = splitHeads(v); // (B, nHead, T, hs)
+         // Split into attention heads.
+         q = this.splitHeads(q, B, T, this.nHead);
+         k = this.splitHeads(k, B, T, this.nHead);
+         v = this.splitHeads(v, B, T, this.nHead);
          // Scaled self attention: query @ key / sqrt(hs)
          // Matrix representing the token-to-token attention (B, nHead, T, T)
-         let att = tf.mul(tf.matMul(q, k, false, true), // (B, nHead, T, hs) x (B, nHead, hs, T) -> (B, nHead, T, T)
-         tf.div(1, tf.sqrt(tf.cast(k.shape[k.shape.length - 1], 'float32'))) // 1 / sqrt(hs)
-         );
-         /**
-          * The next operations apply attention only on the past tokens, which is
-          * essentially a weighted average of the past tokens with complicated weights,
-          * it relies on a mask to not "pay any attention" to future tokens
-          */
-         // mask is lower triangular matrix filled with 1
-         const mask = this.mask.slice([0, 0], [T, T]); // (T, T)
-         // 1 - mask => upper triangular matrix filled with 1
-         // (1 - mask) * -10^9 => upper triangular matrix filled with -inf
-         // att + ((1 - mask) * -10^9) => lower triangular part is the same as the `att` matrix
-         // upper triangular part is -inf
-         att = tf.add(att, tf.mul(tf.sub(1, mask), -1e9)); // (B, nHead, T, T)
-         // applying softmax zeroes out the upper triangular part (softmax(-inf) = 0)
-         // i.e., zeroes out future tokens's attention weights
-         // and creates a probability distribution for the lower triangular
-         // (attention weights of past tokens). The probability distribution ensures
-         // that the attention weights of past tokens for a particular token sum to one
-         att = tf.softmax(att, -1);
-         att = kwargs.training === true ? tf.dropout(att, this.dropout, undefined, this.seed) : att;
+         const att = this.computeAttention(q, k, kwargs.training === true, T);
          // This is where the (attention-)weighted sum of past values is performed
          let y = tf.matMul(att, v); // (B, nHead, T, T) x (B, nHead, T, hs) -> (B, nHead, T, hs)
          y = tf.transpose(y, [0, 2, 1, 3]); // (B, T, nHead, hs)
          y = tf.reshape(y, [B, T, C]); // (B, T, C = nHead * hs)
-         y = dense(y, this.cProjKernel, this.cProjBias); // output projection (B, T, C)
+         y = this.dense(y, this.cProjKernel, this.cProjBias); // output projection (B, T, C)
          y = kwargs.training === true ? tf.dropout(y, this.dropout, undefined, this.seed) : y;
          return y;
      });
  }
+ // --- Helper Methods ---
+ dense(x, kernel, bias) {
+     const k = kernel.read().expandDims(0).tile([x.shape[0], 1, 1]);
+     const m = x.matMul(k);
+     return tf.add(m, bias.read());
+ }
+ splitHeads(x, B, T, nHead) {
+     return tf.transpose(tf.reshape(x, [B, T, nHead, (x.shape[2] ?? 0) / nHead]), [0, 2, 1, 3]);
+ }
+ applyCausalMask(att, T) {
+     // mask is lower triangular matrix filled with 1
+     const mask = this.mask.slice([0, 0], [T, T]);
+     // 1 - mask => upper triangular matrix filled with 1
+     // (1 - mask) * -10^9 => upper triangular matrix filled with -inf
+     // att + ((1 - mask) * -10^9) => lower triangular part is the same as the `att` matrix
+     // upper triangular part is -inf
+     return tf.add(att, tf.mul(tf.sub(1, mask), -1e9)); // (B, nHead, T, T)
+ }
+ computeAttention(q, k, training, T) {
+     /**
+      * The next operations apply attention only on the past tokens, which is
+      * essentially a weighted average of the past tokens with complicated weights,
+      * it relies on a mask to not "pay any attention" to future tokens
+      */
+     const headSize = k.shape[k.shape.length - 1];
+     // Scaled self attention: query @ key / sqrt(hs)
+     // Matrix representing the token-to-token attention (B, nHead, T, T)
+     let att = tf.matMul(q, k, false, true); // (B, nHead, T, hs) x (B, nHead, hs, T) -> (B, nHead, T, T)
+     att = tf.mul(att, tf.div(1, tf.sqrt(tf.cast(headSize, 'float32')))); // 1 / sqrt(hs)
+     att = this.applyCausalMask(att, T);
+     // applying softmax zeroes out the upper triangular part (softmax(-inf) = 0)
+     // i.e., zeroes out future tokens' attention weights
+     // and creates a probability distribution for the lower triangular
+     // (attention weights of past tokens). The probability distribution ensures
+     // that the attention weights of past tokens for a particular token sum to one
+     att = tf.softmax(att, -1);
+     if (training) {
+         att = tf.dropout(att, this.dropout, undefined, this.seed);
+     }
+     return att;
+ }
  }
  tf.serialization.registerClass(CausalSelfAttention);
  /**
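
Note: the new applyCausalMask helper implements the standard additive causal mask described in its comments. A small standalone sketch of its effect (assuming plain tfjs; `T`, `att`, and `mask` here are illustrative values, not the layer's internals):

    import * as tf from '@tensorflow/tfjs';

    const T = 3;
    const mask = tf.linalg.bandPart(tf.ones([T, T]), -1, 0); // lower triangular ones
    const att = tf.zeros([T, T]); // stand-in for attention logits
    // (1 - mask) * -1e9 sends future positions toward -inf, so softmax zeroes them
    const masked = tf.add(att, tf.mul(tf.sub(1, mask), -1e9));
    tf.softmax(masked, -1).print();
    // row i is uniform over positions 0..i and zero afterwards:
    // [[1, 0, 0], [0.5, 0.5, 0], [0.33, 0.33, 0.33]]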
@@ -183,7 +197,7 @@ tf.serialization.registerClass(CausalSelfAttention);
   *
   * https://pytorch.org/docs/stable/generated/torch.nn.GELU.html
   */
- class GELU extends tf.layers.Layer {
+ export class GELU extends tf.layers.Layer {
      static className = 'GELU';
      constructor() {
          super({});
@@ -210,7 +224,7 @@ class GELU extends tf.layers.Layer {
      }
  }
  tf.serialization.registerClass(GELU);
- function MLP(config) {
+ export function MLP(config) {
      return tf.sequential({ layers: [
          tf.layers.dense({
              name: config.name + `.mlp.c_fc`,
@@ -298,7 +312,7 @@ function TransformerBlock(conf) {
   * that can be used for both the token embeddings and the language modeling head.
   * In the GPT2 model definition, this layer corresponds to wte and lm_head (which reuses wte)
   */
- class LMEmbedding extends tf.layers.Layer {
+ export class LMEmbedding extends tf.layers.Layer {
      vocabSize;
      nEmbd;
      seed;
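
Note: the doc comment above motivates LMEmbedding: TensorFlow.js has no built-in weight tying, so one custom layer owns the single embedding matrix and serves both directions. A hedged sketch of the underlying idea in plain tfjs (the variable and token ids below are illustrative, not the layer's actual fields; sizes are GPT-2's):

    import * as tf from '@tensorflow/tfjs';

    const vocabSize = 50257, nEmbd = 768;
    const embeddings = tf.variable(tf.randomNormal([vocabSize, nEmbd], 0, 0.02));

    // wte direction: token indices -> embeddings, via a gather
    const tokens = tf.tensor1d([42, 7], 'int32');
    const wte = tf.gather(embeddings, tokens); // shape (2, nEmbd)

    // lm_head direction: hidden states -> logits over the vocabulary,
    // reusing the same matrix transposed
    const hidden = tf.randomNormal([2, nEmbd]);
    const logits = tf.matMul(hidden, embeddings, false, true); // shape (2, vocabSize)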
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@epfml/discojs",
-   "version": "3.0.1-p20250402090722.0",
+   "version": "3.0.1-p20250429140233.0",
    "type": "module",
    "main": "dist/index.js",
    "types": "dist/index.d.ts",