@epfml/discojs 3.0.1-p20250402090722.0 → 3.0.1-p20250429140233.0

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
@@ -1,5 +1,72 @@
  import * as tf from '@tensorflow/tfjs';
  import type { GPTConfig } from './config.js';
+ import type { ModelSize } from './config.js';
+ /**
+  * Defines a range, from 0 to T, that is used to create positional embeddings
+  */
+ export declare class Range extends tf.layers.Layer {
+     static readonly className = "Range";
+     computeOutputShape(inputShape: tf.Shape | tf.Shape[]): tf.Shape | tf.Shape[];
+     call(input: tf.Tensor | tf.Tensor[], kwargs: Record<string, unknown>): tf.Tensor | tf.Tensor[];
+ }
+ export type CausalSelfAttentionConfig = ConstructorParameters<typeof tf.layers.Layer>[0] & Record<'contextLength' | 'nHead' | 'nEmbd' | 'dropout' | 'nLayer' | 'seed', number>;
+ export declare class CausalSelfAttention extends tf.layers.Layer {
+     private readonly config;
+     static readonly className = "CausalSelfAttention";
+     private readonly nHead;
+     private readonly nEmbd;
+     private readonly nLayer;
+     private readonly dropout;
+     private readonly seed;
+     private readonly mask;
+     cAttnKernel?: tf.LayerVariable;
+     cAttnBias?: tf.LayerVariable;
+     cProjKernel?: tf.LayerVariable;
+     cProjBias?: tf.LayerVariable;
+     constructor(config: CausalSelfAttentionConfig);
+     build(): void;
+     computeOutputShape(inputShape: tf.Shape | tf.Shape[]): tf.Shape | tf.Shape[];
+     getConfig(): tf.serialization.ConfigDict;
+     call(input: tf.Tensor | tf.Tensor[], kwargs: Record<string, unknown>): tf.Tensor;
+     dense(x: tf.Tensor, kernel: tf.LayerVariable, bias: tf.LayerVariable): tf.Tensor;
+     splitHeads(x: tf.Tensor, B: number, T: number, nHead: number): tf.Tensor;
+     applyCausalMask(att: tf.Tensor, T: number): tf.Tensor;
+     computeAttention(q: tf.Tensor, k: tf.Tensor, training: boolean, T: number): tf.Tensor;
+ }
+ /**
+  * GELU with tanh approximation
+  * GELU(x) = x * 0.5 * (1 + Tanh[sqrt(2/π) * (x + 0.044715 * x^3)])
+  *
+  * https://pytorch.org/docs/stable/generated/torch.nn.GELU.html
+  */
+ export declare class GELU extends tf.layers.Layer {
+     static readonly className = "GELU";
+     constructor();
+     computeOutputShape(inputShape: tf.Shape | tf.Shape[]): tf.Shape | tf.Shape[];
+     call(input: tf.Tensor | tf.Tensor[], kwargs: Record<string, unknown>): tf.Tensor | tf.Tensor[];
+ }
+ export type MLPConfig = ConstructorParameters<typeof tf.layers.Layer>[0] & Required<ModelSize> & Record<'contextLength' | 'residDrop' | 'nLayer' | 'seed', number>;
+ export declare function MLP(config: MLPConfig): tf.LayersModel;
+ /**
+  * LanguageModelEmbedding is a layer that combines the token embeddings and the language modeling head,
+  * i.e., LMEmbedding is used to translate token indices into token embeddings
+  * as well as to project embeddings back into token indices.
+  * The GPT2 model uses the same embedding matrix for both the token embeddings and the language modeling head.
+  * Because TensorFlow.js doesn't offer an easy weight sharing mechanism, we need to define a custom layer
+  * that can be used for both the token embeddings and the language modeling head.
+  * In the GPT2 model definition, this layer corresponds to wte and lm_head (which reuses wte)
+  */
+ export declare class LMEmbedding extends tf.layers.Layer {
+     private readonly vocabSize;
+     private readonly nEmbd;
+     private readonly seed;
+     static readonly className = "LMEmbedding";
+     embeddings?: tf.LayerVariable;
+     constructor(vocabSize: number, nEmbd: number, seed: number);
+     build(): void;
+     computeOutputShape(inputShape: tf.Shape | tf.Shape[]): tf.Shape | tf.Shape[];
+     call(input: tf.Tensor | tf.Tensor[], kwargs: Record<string, unknown>): tf.Tensor | tf.Tensor[];
+ }
  /**
   * The GPTArchitecture specifically defines a GPT forward pass, i.e.,
   * what are the inputs, the successive transformer blocks and the outputs. It is then
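
Note: the GELU declaration above documents the tanh approximation used by the package. As a minimal standalone sketch of that formula in TensorFlow.js (illustrative only; `geluTanh` is a hypothetical helper, not the package's compiled implementation):

    import * as tf from '@tensorflow/tfjs';

    // GELU(x) = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    function geluTanh(x: tf.Tensor): tf.Tensor {
        return tf.tidy(() => {
            const inner = tf.mul(Math.sqrt(2 / Math.PI), tf.add(x, tf.mul(0.044715, tf.pow(x, 3))));
            return tf.mul(tf.mul(x, 0.5), tf.add(1, tf.tanh(inner)));
        });
    }

    geluTanh(tf.tensor1d([-1, 0, 1])).print(); // ≈ [-0.1588, 0, 0.8412]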
@@ -4,7 +4,7 @@ const debug = createDebug("discojs:models:gpt:layers");
  /**
   * Defines a range, from 0 to T, that is used to create positional embeddings
   */
- class Range extends tf.layers.Layer {
+ export class Range extends tf.layers.Layer {
      static className = 'Range';
      computeOutputShape(inputShape) {
          return inputShape;
@@ -56,7 +56,7 @@ class LogLayer extends tf.layers.Layer {
      }
  }
  tf.serialization.registerClass(LogLayer);
- class CausalSelfAttention extends tf.layers.Layer {
+ export class CausalSelfAttention extends tf.layers.Layer {
      config;
      static className = 'CausalSelfAttention';
      nHead;
@@ -86,8 +86,7 @@ class CausalSelfAttention extends tf.layers.Layer {
      }
      build() {
          // key, query, value projections for all heads, but in a batch
-         this.cAttnKernel = this.addWeight('c_attn.weight', [this.nEmbd, 3 * this.nEmbd], 'float32', tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: this.seed }) // use same init as GPT2
-         );
+         this.cAttnKernel = this.addWeight('c_attn.weight', [this.nEmbd, 3 * this.nEmbd], 'float32', tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: this.seed }));
          this.cAttnBias = this.addWeight('c_attn.bias', [3 * this.nEmbd], 'float32', tf.initializers.zeros());
          // output projection
          this.cProjKernel = this.addWeight('c_proj.kernel', [this.nEmbd, this.nEmbd], 'float32',
@@ -97,7 +96,9 @@ class CausalSelfAttention extends tf.layers.Layer {
          // https://github.com/karpathy/build-nanogpt/blob/6104ab1b53920f6e2159749676073ff7d815c1fa/train_gpt2.py#L103
          // https://youtu.be/l8pRSuU81PU?si=5GcKfi_kPgLgvtg2&t=4640
          tf.initializers.randomNormal({
-             mean: 0, stddev: 0.02 * Math.sqrt(2 * this.nLayer), seed: this.seed
+             mean: 0,
+             stddev: 0.02 * Math.sqrt(2 * this.nLayer),
+             seed: this.seed
          }));
          this.cProjBias = this.addWeight('c_proj.bias', [this.nEmbd], 'float32', tf.initializers.zeros());
      }
@@ -122,59 +123,72 @@ class CausalSelfAttention extends tf.layers.Layer {
              input = input[0];
          }
          this.invokeCallHook(input, kwargs);
-         const dense = (x, kernel, bias) => {
-             // TODO: use broadcasting when tfjs will support backpropagating through broadcasting
-             const k = kernel.read().expandDims(0).tile([x.shape[0], 1, 1]);
-             const m = x.matMul(k);
-             return tf.add(m, bias.read());
-         };
+         // --- Use helper methods below to build the computation ---
          // Apply attention weights to inputs as one big matrix which is then split into the
          // query, key and value submatrices
          // nHead is "number of heads", hs is "head size", and C (number of channels) = n_embd = nHead * hs
-         // e.g. in GPT-2 (124M), nHead = 12, hs = 64, so nHead * hs = C = 768 channels in the Transformer
-         const cAttn = dense(input, this.cAttnKernel, this.cAttnBias);
+         // e.g. in GPT-2 (124M), nHead = 12, hs = 64, so nHead * hs = C = 768 channels in the Transformer
+         const cAttn = this.dense(input, this.cAttnKernel, this.cAttnBias);
          let [q, k, v] = tf.split(cAttn, 3, -1);
          // Follow naming conventions in https://github.com/karpathy/build-nanogpt/
          const [B, T, C] = k.shape; // batch size, sequence length, embedding dimensionality (number of channels)
-         const splitHeads = (x) => tf.transpose(tf.reshape(x, [B, T, this.nHead, C / this.nHead]), // (B, T, nHead, head size)
-         [0, 2, 1, 3] // (B, nHead, T, hs)
-         );
-         q = splitHeads(q); // (B, nHead, T, hs)
-         k = splitHeads(k); // (B, nHead, T, hs)
-         v = splitHeads(v); // (B, nHead, T, hs)
+         // Split into attention heads.
+         q = this.splitHeads(q, B, T, this.nHead);
+         k = this.splitHeads(k, B, T, this.nHead);
+         v = this.splitHeads(v, B, T, this.nHead);
          // Scaled self attention: query @ key / sqrt(hs)
          // Matrix representing the token-to-token attention (B, nHead, T, T)
-         let att = tf.mul(tf.matMul(q, k, false, true), // (B, nHead, T, hs) x (B, nHead, hs, T) -> (B, nHead, T, T)
-         tf.div(1, tf.sqrt(tf.cast(k.shape[k.shape.length - 1], 'float32'))) // 1 / sqrt(hs)
-         );
-         /**
-          * The next operations apply attention only on the past tokens, which is
-          * essentially a weighted average of the past tokens with complicated weights,
-          * it relies on a mask to not "pay any attention" to future tokens
-          */
-         // mask is lower triangular matrix filled with 1
-         const mask = this.mask.slice([0, 0], [T, T]); // (T, T)
-         // 1 - mask => upper triangular matrix filled with 1
-         // (1 - mask) * -10^9 => upper triangular matrix filled with -inf
-         // att + ((1 - mask) * -10^9) => lower triangular part is the same as the `att` matrix
-         // upper triangular part is -inf
-         att = tf.add(att, tf.mul(tf.sub(1, mask), -1e9)); // (B, nHead, T, T)
-         // applying softmax zeroes out the upper triangular part (softmax(-inf) = 0)
-         // i.e., zeroes out future tokens's attention weights
-         // and creates a probability distribution for the lower triangular
-         // (attention weights of past tokens). The probability distribution ensures
-         // that the attention weights of past tokens for a particular token sum to one
-         att = tf.softmax(att, -1);
-         att = kwargs.training === true ? tf.dropout(att, this.dropout, undefined, this.seed) : att;
+         const att = this.computeAttention(q, k, kwargs.training === true, T);
          // This is where the (attention-)weighted sum of past values is performed
          let y = tf.matMul(att, v); // (B, nHead, T, T) x (B, nHead, T, hs) -> (B, nHead, T, hs)
          y = tf.transpose(y, [0, 2, 1, 3]); // (B, T, nHead, hs)
          y = tf.reshape(y, [B, T, C]); // (B, T, C = nHead * hs)
-         y = dense(y, this.cProjKernel, this.cProjBias); // output projection (B, T, C)
+         y = this.dense(y, this.cProjKernel, this.cProjBias); // output projection (B, T, C)
          y = kwargs.training === true ? tf.dropout(y, this.dropout, undefined, this.seed) : y;
          return y;
      });
  }
+ // --- Helper Methods ---
+ dense(x, kernel, bias) {
+     const k = kernel.read().expandDims(0).tile([x.shape[0], 1, 1]);
+     const m = x.matMul(k);
+     return tf.add(m, bias.read());
+ }
+ splitHeads(x, B, T, nHead) {
+     return tf.transpose(tf.reshape(x, [B, T, nHead, (x.shape[2] ?? 0) / nHead]), [0, 2, 1, 3]);
+ }
+ applyCausalMask(att, T) {
+     // mask is lower triangular matrix filled with 1
+     const mask = this.mask.slice([0, 0], [T, T]);
+     // 1 - mask => upper triangular matrix filled with 1
+     // (1 - mask) * -10^9 => upper triangular matrix filled with -inf
+     // att + ((1 - mask) * -10^9) => lower triangular part is the same as the `att` matrix
+     // upper triangular part is -inf
+     return tf.add(att, tf.mul(tf.sub(1, mask), -1e9)); // (B, nHead, T, T)
+ }
+ computeAttention(q, k, training, T) {
+     /**
+      * The next operations apply attention only on the past tokens, which is
+      * essentially a weighted average of the past tokens with complicated weights,
+      * it relies on a mask to not "pay any attention" to future tokens
+      */
+     const headSize = k.shape[k.shape.length - 1];
+     // Scaled self attention: query @ key / sqrt(hs)
+     // Matrix representing the token-to-token attention (B, nHead, T, T)
+     let att = tf.matMul(q, k, false, true); // (B, nHead, T, hs) x (B, nHead, hs, T) -> (B, nHead, T, T)
+     att = tf.mul(att, tf.div(1, tf.sqrt(tf.cast(headSize, 'float32')))); // 1 / sqrt(hs)
+     att = this.applyCausalMask(att, T);
+     // applying softmax zeroes out the upper triangular part (softmax(-inf) = 0)
+     // i.e., zeroes out future tokens' attention weights
+     // and creates a probability distribution for the lower triangular
+     // (attention weights of past tokens). The probability distribution ensures
+     // that the attention weights of past tokens for a particular token sum to one
+     att = tf.softmax(att, -1);
+     if (training) {
+         att = tf.dropout(att, this.dropout, undefined, this.seed);
+     }
+     return att;
+ }
  }
  tf.serialization.registerClass(CausalSelfAttention);
  /**
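
Note: the new applyCausalMask helper implements the standard additive causal mask described in its comments. A small standalone sketch of its effect (assuming plain tfjs; `T`, `att`, and `mask` here are illustrative values, not the layer's internals):

    import * as tf from '@tensorflow/tfjs';

    const T = 3;
    const mask = tf.linalg.bandPart(tf.ones([T, T]), -1, 0); // lower triangular ones
    const att = tf.zeros([T, T]); // stand-in for attention logits
    // (1 - mask) * -1e9 sends future positions toward -inf, so softmax zeroes them
    const masked = tf.add(att, tf.mul(tf.sub(1, mask), -1e9));
    tf.softmax(masked, -1).print();
    // row i is uniform over positions 0..i and zero afterwards:
    // [[1, 0, 0], [0.5, 0.5, 0], [0.33, 0.33, 0.33]]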
@@ -183,7 +197,7 @@ tf.serialization.registerClass(CausalSelfAttention);
   *
   * https://pytorch.org/docs/stable/generated/torch.nn.GELU.html
   */
- class GELU extends tf.layers.Layer {
+ export class GELU extends tf.layers.Layer {
      static className = 'GELU';
      constructor() {
          super({});
@@ -210,7 +224,7 @@ class GELU extends tf.layers.Layer {
      }
  }
  tf.serialization.registerClass(GELU);
- function MLP(config) {
+ export function MLP(config) {
      return tf.sequential({ layers: [
          tf.layers.dense({
              name: config.name + `.mlp.c_fc`,
@@ -298,7 +312,7 @@ function TransformerBlock(conf) {
   * that can be used for both the token embeddings and the language modeling head.
   * In the GPT2 model definition, this layer corresponds to wte and lm_head (which reuses wte)
   */
- class LMEmbedding extends tf.layers.Layer {
+ export class LMEmbedding extends tf.layers.Layer {
      vocabSize;
      nEmbd;
      seed;
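
Note: the doc comment above motivates LMEmbedding: TensorFlow.js has no built-in weight tying, so one custom layer owns the single embedding matrix and serves both directions. A hedged sketch of the underlying idea in plain tfjs (the variable and token ids below are illustrative, not the layer's actual fields; sizes are GPT-2's):

    import * as tf from '@tensorflow/tfjs';

    const vocabSize = 50257, nEmbd = 768;
    const embeddings = tf.variable(tf.randomNormal([vocabSize, nEmbd], 0, 0.02));

    // wte direction: token indices -> embeddings, via a gather
    const tokens = tf.tensor1d([42, 7], 'int32');
    const wte = tf.gather(embeddings, tokens); // shape (2, nEmbd)

    // lm_head direction: hidden states -> logits over the vocabulary,
    // reusing the same matrix transposed
    const hidden = tf.randomNormal([2, nEmbd]);
    const logits = tf.matMul(hidden, embeddings, false, true); // shape (2, vocabSize)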
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@epfml/discojs",
-   "version": "3.0.1-p20250402090722.0",
+   "version": "3.0.1-p20250429140233.0",
    "type": "module",
    "main": "dist/index.js",
    "types": "dist/index.d.ts",