@epfml/discojs 3.0.1-p20241119093954.0 → 3.0.1-p20241206133538.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/client/client.js +2 -0
- package/dist/client/federated/federated_client.js +2 -2
- package/dist/dataset/dataset.d.ts +18 -5
- package/dist/dataset/dataset.js +58 -23
- package/dist/dataset/types.d.ts +1 -0
- package/dist/default_tasks/index.d.ts +1 -0
- package/dist/default_tasks/index.js +1 -0
- package/dist/default_tasks/tinder_dog.d.ts +2 -0
- package/dist/default_tasks/tinder_dog.js +72 -0
- package/dist/default_tasks/wikitext.js +5 -3
- package/dist/models/gpt/config.d.ts +11 -6
- package/dist/models/gpt/config.js +11 -7
- package/dist/models/gpt/index.d.ts +5 -9
- package/dist/models/gpt/index.js +36 -15
- package/dist/models/gpt/layers.js +260 -82
- package/dist/models/gpt/model.d.ts +1 -1
- package/dist/models/gpt/model.js +4 -4
- package/dist/processing/index.js +8 -9
- package/dist/processing/text.d.ts +16 -6
- package/dist/processing/text.js +29 -26
- package/dist/task/task_handler.js +5 -1
- package/dist/task/training_information.d.ts +1 -1
- package/dist/task/training_information.js +3 -4
- package/dist/training/disco.js +6 -3
- package/dist/types/data_format.d.ts +2 -2
- package/dist/validator.js +2 -2
- package/package.json +1 -1
package/dist/models/gpt/layers.js
CHANGED

@@ -1,4 +1,6 @@
+import createDebug from "debug";
 import * as tf from '@tensorflow/tfjs';
+const debug = createDebug("discojs:models:gpt:layers");
 /**
  * Defines a range, from 0 to T, that is used to create positional embeddings
  */
@@ -10,7 +12,8 @@ class Range extends tf.layers.Layer {
     call(input, kwargs) {
         return tf.tidy(() => {
             if (Array.isArray(input)) {
-
+                if (input.length !== 1)
+                    throw new Error('expected exactly one tensor');
                 input = input[0];
             }
             this.invokeCallHook(input, kwargs);
@@ -22,6 +25,11 @@ class Range extends tf.layers.Layer {
     }
 }
 tf.serialization.registerClass(Range);
+/**
+ * LogLayer is a layer that allows debugging the input that is fed to this layer
+ * This layer allows to inspect the input tensor at a specific point
+ * in the model by adding a log layer in the model definition
+ */
 class LogLayer extends tf.layers.Layer {
     static className = 'LogLayer';
     computeOutputShape(inputShape) {
@@ -30,9 +38,19 @@ class LogLayer extends tf.layers.Layer {
     call(input, kwargs) {
         return tf.tidy(() => {
             if (Array.isArray(input)) {
+                if (input.length !== 1)
+                    throw new Error('expected exactly one tensor');
                 input = input[0];
             }
             this.invokeCallHook(input, kwargs);
+            const logs = {
+                'shape': input.shape,
+                'is_only_zero': !!input.equal(tf.tensor(0)).all().dataSync()[0],
+                'has_some_NaN': !!input.isNaN().any().dataSync()[0],
+                'min': +input.min().dataSync()[0].toPrecision(3),
+                'max': +input.max().dataSync()[0].toPrecision(3),
+            };
+            debug("%s logged: %o", this.name, logs);
             return input;
         });
     }
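Note: LogLayer reports through the `debug` package rather than console.log, so its output is off by default. A minimal sketch of enabling it, assuming a Node entry point:

import createDebug from "debug";

// Enable the namespace used by layers.js before the model runs;
// equivalent to launching with DEBUG="discojs:models:gpt:layers".
createDebug.enable("discojs:models:gpt:layers");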
@@ -43,8 +61,9 @@ class CausalSelfAttention extends tf.layers.Layer {
     static className = 'CausalSelfAttention';
     nHead;
     nEmbd;
+    nLayer;
     dropout;
-
+    seed;
     mask;
     cAttnKernel;
     cAttnBias;
@@ -53,20 +72,34 @@ class CausalSelfAttention extends tf.layers.Layer {
     constructor(config) {
         super(config);
         this.config = config;
+        if (config.nEmbd % config.nHead !== 0)
+            throw new Error('The embedding dimension `nEmbd` must be divisible by the number of attention heads `nHead`');
         this.nEmbd = config.nEmbd;
         this.nHead = config.nHead;
+        this.nLayer = config.nLayer;
         this.dropout = config.dropout;
-        this.
+        this.seed = config.seed;
         // mask is a lower triangular matrix filled with 1
         // calling bandPart zero out the upper triangular part of the all-ones matrix
         // from the doc: tf.linalg.band_part(input, -1, 0) ==> Lower triangular part
-        this.mask = tf.linalg.bandPart(tf.ones([config.
+        this.mask = tf.linalg.bandPart(tf.ones([config.contextLength, config.contextLength]), -1, 0);
     }
     build() {
-
-        this.
-
-        this.
+        // key, query, value projections for all heads, but in a batch
+        this.cAttnKernel = this.addWeight('c_attn.weight', [this.nEmbd, 3 * this.nEmbd], 'float32', tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: this.seed }) // use same init as GPT2
+        );
+        this.cAttnBias = this.addWeight('c_attn.bias', [3 * this.nEmbd], 'float32', tf.initializers.zeros());
+        // output projection
+        this.cProjKernel = this.addWeight('c_proj.kernel', [this.nEmbd, this.nEmbd], 'float32',
+        // the input keeps accumulating through the residual stream so we
+        // scale the initialization with the nb of layers to keep a unit std
+        // Sources:
+        // https://github.com/karpathy/build-nanogpt/blob/6104ab1b53920f6e2159749676073ff7d815c1fa/train_gpt2.py#L103
+        // https://youtu.be/l8pRSuU81PU?si=5GcKfi_kPgLgvtg2&t=4640
+        tf.initializers.randomNormal({
+            mean: 0, stddev: 0.02 * Math.sqrt(2 * this.nLayer), seed: this.seed
+        }));
+        this.cProjBias = this.addWeight('c_proj.bias', [this.nEmbd], 'float32', tf.initializers.zeros());
     }
     computeOutputShape(inputShape) {
         return inputShape;
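Aside: a small standalone sketch (toy size T = 4, not from the package) of what the bandPart call above produces, and of how the additive mask used later in call() interacts with softmax:

import * as tf from '@tensorflow/tfjs';

const T = 4;
// Lower triangular matrix of ones: row t has 1s only for positions <= t.
const mask = tf.linalg.bandPart(tf.ones([T, T]), -1, 0);
mask.print();
// [[1, 0, 0, 0],
//  [1, 1, 0, 0],
//  [1, 1, 1, 0],
//  [1, 1, 1, 1]]

// Adding (1 - mask) * -1e9 drives future positions toward -inf, so softmax
// assigns them ~0 weight while each row still sums to 1.
const scores = tf.randomNormal([T, T]);
const masked = tf.add(scores, tf.mul(tf.sub(1, mask), -1e9));
tf.softmax(masked).print();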
@@ -84,58 +117,72 @@ class CausalSelfAttention extends tf.layers.Layer {
                 throw new Error('not built');
             }
             if (Array.isArray(input)) {
+                if (input.length !== 1)
+                    throw new Error('expected exactly one tensor');
                 input = input[0];
             }
             this.invokeCallHook(input, kwargs);
             const dense = (x, kernel, bias) => {
+                // TODO: use broadcasting when tfjs will support backpropagating through broadcasting
                 const k = kernel.read().expandDims(0).tile([x.shape[0], 1, 1]);
                 const m = x.matMul(k);
-
-                    return tf.add(m, bias.read());
-                }
-                else {
-                    return m;
-                }
+                return tf.add(m, bias.read());
             };
             // Apply attention weights to inputs as one big matrix which is then split into the
             // query, key and value submatrices
+            // nHead is "number of heads", hs is "head size", and C (number of channels) = n_embd = nHead * hs
+            // e.g. in GPT-2 (124M), nHead = 12, hs = 64, so nHead * hs = C = 768 channels in the Transformer
             const cAttn = dense(input, this.cAttnKernel, this.cAttnBias);
             let [q, k, v] = tf.split(cAttn, 3, -1);
-
-            const
-
-
-
-
-
-            //
-            //
+            // Follow naming conventions in https://github.com/karpathy/build-nanogpt/
+            const [B, T, C] = k.shape; // batch size, sequence length, embedding dimensionality (number of channels)
+            const splitHeads = (x) => tf.transpose(tf.reshape(x, [B, T, this.nHead, C / this.nHead]), // (B, T, nHead, head size)
+            [0, 2, 1, 3] // (B, nHead, T, hs)
+            );
+            q = splitHeads(q); // (B, nHead, T, hs)
+            k = splitHeads(k); // (B, nHead, T, hs)
+            v = splitHeads(v); // (B, nHead, T, hs)
+            // Scaled self attention: query @ key / sqrt(hs)
+            // Matrix representing the token-to-token attention (B, nHead, T, T)
+            let att = tf.mul(tf.matMul(q, k, false, true), // (B, nHead, T, hs) x (B, nHead, hs, T) -> (B, nHead, T, T)
+            tf.div(1, tf.sqrt(tf.cast(k.shape[k.shape.length - 1], 'float32'))) // 1 / sqrt(hs)
+            );
+            /**
+             * The next operations apply attention only on the past tokens, which is
+             * essentially a weighted average of the past tokens with complicated weights,
+             * it relies on a mask to not "pay any attention" to future tokens
+             */
             // mask is lower triangular matrix filled with 1
-            const mask = this.mask.slice([0, 0], [T, T]);
+            const mask = this.mask.slice([0, 0], [T, T]); // (T, T)
             // 1 - mask => upper triangular matrix filled with 1
             // (1 - mask) * -10^9 => upper triangular matrix filled with -inf
             // att + ((1 - mask) * -10^9) => lower triangular part is the same as the `att` matrix
             // upper triangular part is -inf
-            att = tf.add(att, tf.mul(tf.sub(1, mask), -1e9));
-            // applying softmax
-            //
+            att = tf.add(att, tf.mul(tf.sub(1, mask), -1e9)); // (B, nHead, T, T)
+            // applying softmax zeroes out the upper triangular part (softmax(-inf) = 0)
+            // i.e., zeroes out future tokens's attention weights
             // and creates a probability distribution for the lower triangular
             // (attention weights of past tokens). The probability distribution ensures
             // that the attention weights of past tokens for a particular token sum to one
             att = tf.softmax(att, -1);
-            att = kwargs.training === true ? tf.dropout(att, this.dropout) : att;
+            att = kwargs.training === true ? tf.dropout(att, this.dropout, undefined, this.seed) : att;
             // This is where the (attention-)weighted sum of past values is performed
-            let y = tf.matMul(att, v);
-            y = tf.transpose(y, [0, 2, 1, 3]);
-            y = tf.reshape(y, [B, T, C]);
-            y = dense(y, this.cProjKernel, this.cProjBias);
-            y = kwargs.training === true ? tf.dropout(y, this.dropout) : y;
+            let y = tf.matMul(att, v); // (B, nHead, T, T) x (B, nHead, T, hs) -> (B, nHead, T, hs)
+            y = tf.transpose(y, [0, 2, 1, 3]); // (B, T, nHead, hs)
+            y = tf.reshape(y, [B, T, C]); // (B, T, C = nHead * hs)
+            y = dense(y, this.cProjKernel, this.cProjBias); // output projection (B, T, C)
+            y = kwargs.training === true ? tf.dropout(y, this.dropout, undefined, this.seed) : y;
            return y;
         });
     }
 }
 tf.serialization.registerClass(CausalSelfAttention);
+/**
+ * GELU with tanh approximate
+ * GELU(x) = x * 0.5 * (1 + Tanh[sqrt(2/π) * (x + 0.044715 * x^3)])
+ *
+ * https://pytorch.org/docs/stable/generated/torch.nn.GELU.html
+ */
 class GELU extends tf.layers.Layer {
     static className = 'GELU';
     constructor() {
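Aside: the splitHeads reshape/transpose above and its inverse at the end of call() are easy to check on toy sizes (B=2, T=3, nHead=4, hs=8, so C=32; sizes are illustrative, not the package defaults):

import * as tf from '@tensorflow/tfjs';

const [B, T, nHead, hs] = [2, 3, 4, 8];
const C = nHead * hs;
const x = tf.randomNormal([B, T, C]);

// (B, T, C) -> (B, nHead, T, hs): each head gets its own hs-wide slice.
const split = tf.transpose(tf.reshape(x, [B, T, nHead, C / nHead]), [0, 2, 1, 3]);
console.log(split.shape); // [2, 4, 3, 8]

// The inverse transpose + reshape restores (B, T, C).
const merged = tf.reshape(tf.transpose(split, [0, 2, 1, 3]), [B, T, C]);
console.log(merged.shape); // [2, 3, 32]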
@@ -148,11 +195,17 @@ class GELU extends tf.layers.Layer {
         return tf.tidy(() => {
             if (Array.isArray(input)) {
                 // TODO support multitensor
+                if (input.length !== 1)
+                    throw new Error('expected exactly one tensor');
                 input = input[0];
             }
             this.invokeCallHook(input, kwargs);
-            const cdf = tf.mul(0.5
-
+            const cdf = tf.mul(// 0.5 * (1 + Tanh[sqrt(2/π) * (x + 0.044715 * x^3)])
+            0.5, tf.add(1, tf.tanh(// Tanh[sqrt(2/π) * (x + 0.044715 * x^3)]
+            tf.mul(tf.sqrt(tf.div(2, Math.PI)), // (sqrt(2/π)
+            tf.add(input, tf.mul(0.044715, tf.pow(input, 3))) // (x + 0.044715 * x^3)
+            ))));
+            return tf.mul(input, cdf); // x * 0.5 * (1 + Tanh[sqrt(2/π) * (x + 0.044715 * x^3)])
         });
     }
 }
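As a quick numeric cross-check of the formula quoted above, evaluated on plain numbers outside tfjs:

// GELU(x) = x * 0.5 * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3)))
function geluTanh(x: number): number {
    return x * 0.5 * (1 + Math.tanh(Math.sqrt(2 / Math.PI) * (x + 0.044715 * x ** 3)));
}
console.log(geluTanh(0));  // 0
console.log(geluTanh(1));  // ≈ 0.841
console.log(geluTanh(-1)); // ≈ -0.159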
@@ -160,48 +213,173 @@ tf.serialization.registerClass(GELU);
 function MLP(config) {
     return tf.sequential({ layers: [
             tf.layers.dense({
-                name: config.name +
+                name: config.name + `.mlp.c_fc`,
                 units: 4 * config.nEmbd,
                 inputDim: config.nEmbd,
-                inputShape: [config.
+                inputShape: [config.contextLength, config.nEmbd],
+                kernelInitializer: tf.initializers.randomNormal({
+                    mean: 0, stddev: 0.02, seed: config.seed
+                }),
             }),
             new GELU(),
             tf.layers.dense({
-                name: config.name + '
+                name: config.name + '.mlp.c_proj',
                 units: config.nEmbd,
                 inputDim: 4 * config.nEmbd,
-                inputShape: [config.
+                inputShape: [config.contextLength, 4 * config.nEmbd],
+                kernelInitializer: tf.initializers.randomNormal({
+                    mean: 0, stddev: 0.02 * Math.sqrt(2 * config.nLayer), seed: config.seed
+                }),
             }),
             tf.layers.dropout({
-                name: config.name + '
-                rate: config.residDrop
+                name: config.name + '.mlp.drop',
+                rate: config.residDrop,
+                seed: config.seed
             }),
         ] });
 }
+/**
+ * Performs the following operations:
+ * x1 = input + mlp(layernorm_1(input))
+ * output = x1 + mlp(layernorm_2(x1))
+ */
 function TransformerBlock(conf) {
-    const config = Object.assign({ name: 'h' }, conf);
-    const inputs = tf.input({ shape: [config.
+    const config = Object.assign({ name: '.h' }, conf);
+    const inputs = tf.input({ shape: [config.contextLength, config.nEmbd] });
     let x1, x2;
     // input normalization
-    x1 = tf.layers.layerNormalization({
-    .
+    x1 = tf.layers.layerNormalization({
+        name: config.name + '.ln_1',
+        epsilon: 1e-5,
+        gammaInitializer: 'ones', // already the default but make it explicit
+        betaInitializer: 'zeros',
+    }).apply(inputs);
     if (config.debug) {
-        x1 = new LogLayer({ name: config.name + '
+        x1 = new LogLayer({ name: config.name + '.ln_1_log' }).apply(x1);
     }
     // self attention layer
-    x1 = new CausalSelfAttention(Object.assign({}, config, { name: config.name + '
+    x1 = new CausalSelfAttention(Object.assign({}, config, { name: config.name + '.attn' })).apply(x1);
+    if (config.debug) {
+        x1 = new LogLayer({ name: config.name + '.attn_log' }).apply(x1);
+    }
     // Residual connection
     x1 = tf.layers.add().apply([inputs, x1]);
+    if (config.debug) {
+        x1 = new LogLayer({ name: config.name + '.residual_log' }).apply(x1);
+    }
     // normalization
-    x2 = tf.layers
-
-
+    x2 = tf.layers.layerNormalization({
+        name: config.name + '.ln_2',
+        epsilon: 1e-5,
+        gammaInitializer: 'ones',
+        betaInitializer: 'zeros',
+    }).apply(x1);
+    if (config.debug) {
+        x2 = new LogLayer({ name: config.name + '.ln_2_log' }).apply(x2);
+    }
     // MLP
-    x2 = MLP(Object.assign({}, config, { name: config.name })).apply(x2);
+    x2 = MLP(Object.assign({}, config, { name: config.name + '.mlp' })).apply(x2);
+    if (config.debug) {
+        x2 = new LogLayer({ name: config.name + '.mlp_log' }).apply(x2);
+    }
     // add attention output to mlp output
     x2 = tf.layers.add().apply([x1, x2]);
+    if (config.debug) {
+        x2 = new LogLayer({ name: config.name + '.add_log' }).apply(x2);
+    }
     return tf.model({ name: config.name, inputs, outputs: x2 });
 }
+/**
+ * LanguageModelEmbedding is a layer that combines the token embeddings and the language modeling head
+ * I.e. LMEmbedding is used to translate token indices into token embeddings
+ * as well as to project embeddings back into token indices
+ * The GPT2 model uses the same embedding matrix for both the token embeddings and the language modeling head
+ * Because Tensorflow.js doesn't offer an easy weight sharing mechanism, we need to define a custom layer
+ * that can be used for both the token embeddings and the language modeling head.
+ * In the GPT2 model definition, this layers corresponds to wte and lm_head (which reuses wte)
+ */
+class LMEmbedding extends tf.layers.Layer {
+    vocabSize;
+    nEmbd;
+    seed;
+    static className = 'LMEmbedding';
+    embeddings;
+    constructor(vocabSize, nEmbd, seed) {
+        super({});
+        this.vocabSize = vocabSize;
+        this.nEmbd = nEmbd;
+        this.seed = seed;
+    }
+    build() {
+        this.embeddings = this.addWeight('wte', //use same name as GPT2
+        [this.vocabSize, this.nEmbd], 'float32', tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: this.seed }));
+    }
+    computeOutputShape(inputShape) {
+        let shape;
+        if (Array.isArray(inputShape) && Array.isArray(inputShape[0])) {
+            if (inputShape.length !== 1)
+                throw new Error('Expected exactly one Shape');
+            shape = inputShape[0];
+        }
+        else
+            shape = inputShape;
+        // input shape for the token embedding
+        if (shape.length === 2) {
+            // https://github.com/tensorflow/tfjs/blob/3daf152cb794f4da58fce5e21e09e8a4f89c8f80/tfjs-layers/src/layers/embeddings.ts#L155
+            // batch size and sequence length are undetermined
+            // so the output shape is [null, null, nEmbd]
+            if (shape[0] !== null || shape[1] !== null)
+                throw new Error('expected shape [null, null, ...]');
+            return [null, null, this.nEmbd];
+        }
+        // input shape for the language modeling head
+        // https://github.com/tensorflow/tfjs/blob/3daf152cb794f4da58fce5e21e09e8a4f89c8f80/tfjs-layers/src/layers/core.ts#L258
+        else if (shape.length === 3) {
+            // batch size and sequence length are undetermined
+            // so the output shape is [null, null, nEmbd]
+            if (shape[0] !== null || shape[1] !== null)
+                throw new Error('expected shape [null, null, ...]');
+            return [null, null, this.vocabSize];
+        }
+        else
+            throw new Error('unexpected input shape');
+    }
+    call(input, kwargs) {
+        return tf.tidy(() => {
+            if (this.embeddings === undefined)
+                throw new Error('not built');
+            if (Array.isArray(input)) {
+                if (input.length !== 1)
+                    throw new Error('expected exactly one tensor');
+                input = input[0];
+            }
+            this.invokeCallHook(input, kwargs);
+            // If the input is a 2D tensor, it is a batch of sequences of tokens
+            // so we translate the tokens into embeddings
+            // using `this.embeddings` as a lookup table
+            if (input.shape.length === 2) {
+                // (batch_size, sequence_length) => (batch_size, sequence_length, nEmbd)
+                return tf.gather(this.embeddings.read(), tf.cast(input, 'int32'), 0);
+            }
+            // If the input is a 3D tensor, it is a sequence of embeddings
+            // so we apply a dense layer to project the embeddings back into the vocabulary space
+            else if (input.shape.length === 3 && input.shape[2] === this.nEmbd) {
+                // Replicate the kernel for each batch element
+                const kernel = this.embeddings.read().expandDims(0).tile([input.shape[0], 1, 1]);
+                // TODO: rely on broadcasting when tfjs will support backpropagating through broadcasting
+                // Remove the tile, or use tf.einsum('BTE,VE->BTV', input, this.embeddings.read())
+                // to prevent tensor duplication but tensorflow.js fails to backpropagate einsum
+                // https://github.com/tensorflow/tfjs/issues/5690
+                // (batch_size, sequence_length, nEmbd) x (vocabSize, nEmbd)^T -> (batch_size, sequence_length, vocabSize)
+                return tf.matMul(input, kernel, false, true);
+            }
+            else {
+                throw new Error('unexpected input shape for token embeddings');
+            }
+        });
+    }
+}
+tf.serialization.registerClass(LMEmbedding);
 /**
  * The GPTArchitecture specifically defines a GPT forward pass, i.e.,
  * what are the inputs, the successive transformer blocks and the outputs. It is then
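Aside: the two directions of LMEmbedding's weight tying can be sketched standalone with toy sizes (vocabSize=5, nEmbd=3, B=2, T=3; sizes are illustrative):

import * as tf from '@tensorflow/tfjs';

const wte = tf.randomNormal([5, 3]); // (vocabSize, nEmbd)

// Embedding direction: token ids index rows of wte.
const tokens = tf.tensor2d([[0, 4, 2], [1, 1, 3]], [2, 3], 'int32');
const embedded = tf.gather(wte, tokens, 0);
console.log(embedded.shape); // [2, 3, 3] = (B, T, nEmbd)

// LM-head direction: multiply by wte^T, tiled per batch element as in call().
const kernel = wte.expandDims(0).tile([2, 1, 1]);
const logits = tf.matMul(embedded, kernel, false, true);
console.log(logits.shape); // [2, 3, 5] = (B, T, vocabSize)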
@@ -212,54 +390,54 @@ function TransformerBlock(conf) {
  */
 export function GPTArchitecture(config) {
     const inputs = tf.input({ shape: [null] });
-    //
-    const
-
-
-
-
-        embeddingsInitializer: 'zeros',
-        embeddingsRegularizer: undefined,
-        activityRegularizer: undefined
-    }).apply(inputs)
-    : inputs;
+    // token embedding
+    const wte = new LMEmbedding(config.vocabSize, config.nEmbd, config.seed);
+    let tokEmb = wte.apply(inputs); // (batch_size, input length T, nEmbd)
+    if (config.debug) {
+        tokEmb = new LogLayer({ name: 'tokEmb_log' }).apply(tokEmb);
+    }
     // Positional embedding
     const range = new Range({}).apply(inputs);
     let posEmb = tf.layers.embedding({
-        name: config.name + '
-        inputDim: config.
+        name: config.name + '.wpe',
+        inputDim: config.contextLength,
         outputDim: config.nEmbd,
-        embeddingsInitializer:
+        embeddingsInitializer: tf.initializers.randomNormal({
+            mean: 0, stddev: 0.02, seed: config.seed
+        }),
     }).apply(range);
     if (config.debug) {
-        posEmb = new LogLayer({ name: '
+        posEmb = new LogLayer({ name: 'posEmb_log' }).apply(posEmb);
     }
     // token and positional embeddings are added together
     let x = tf.layers.add().apply([tokEmb, posEmb]);
     // dropout
-    x = tf.layers.dropout({
+    x = tf.layers.dropout({
+        name: 'drop', rate: config.embdDrop, seed: config.seed
+    }).apply(x);
     if (config.debug) {
-        x = new LogLayer({ name: '
+        x = new LogLayer({ name: 'drop_log' }).apply(x);
     }
-    //
+    // apply successively transformer blocks, attention and dense layers
     for (let i = 0; i < config.nLayer; i++) {
-        x = TransformerBlock(Object.assign({}, config, { name: config.name + '
+        x = TransformerBlock(Object.assign({}, config, { name: config.name + '.h' + i })).apply(x);
     }
     // Normalization
-    x = tf.layers.layerNormalization({
+    x = tf.layers.layerNormalization({
+        name: config.name + '.ln_f',
+        epsilon: 1e-5,
+        gammaInitializer: 'ones',
+        betaInitializer: 'zeros',
+    })
         .apply(x);
     if (config.debug) {
-        x = new LogLayer({ name: '
+        x = new LogLayer({ name: 'ln_f_log' }).apply(x);
     }
-    //
-
-
-
-        inputDim: config.nEmbd,
-        inputShape: [config.blockSize, config.nEmbd],
-        useBias: false
-    }).apply(x);
+    // language modeling head
+    // GPT2 uses the same matrix for the token embedding and the modeling head
+    x = wte.apply(x);
+    if (config.debug) {
+        x = new LogLayer({ name: 'lm_head_log' }).apply(x);
     }
     return tf.model({ inputs, outputs: x });
 }
package/dist/models/gpt/model.d.ts
CHANGED

@@ -16,7 +16,7 @@ export declare abstract class Dataset<T> {
  */
 export declare class GPTModel extends tf.LayersModel {
     protected readonly config: Required<GPTConfig>;
-    constructor(partialConfig?: GPTConfig
+    constructor(partialConfig?: Partial<GPTConfig>, layersModel?: tf.LayersModel);
     get getGPTConfig(): Required<GPTConfig>;
     compile(): void;
     fitDataset<T>(dataset: Dataset<T>, trainingArgs: tf.ModelFitDatasetArgs<T>): Promise<tf.History>;
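With the widened signature, callers can pass any subset of the GPT config. A hypothetical sketch (the `models` import surface is an assumption, not confirmed by this diff):

import { models } from "@epfml/discojs"; // assumed export surface

// Fields left out fall back to DefaultGPTConfig (see model.js below).
const model = new models.GPTModel({ contextLength: 128, nLayer: 4 });
model.compile();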
package/dist/models/gpt/model.js
CHANGED
@@ -1,10 +1,10 @@
 import createDebug from "debug";
 import * as tf from '@tensorflow/tfjs';
-import { getModelSizes,
+import { getModelSizes, DefaultGPTConfig } from './config.js';
 import { getCustomAdam, clipByGlobalNormObj } from './optimizers.js';
 import evaluate from './evaluate.js';
 import { GPTArchitecture } from './layers.js';
-const debug = createDebug("discojs:models:gpt");
+const debug = createDebug("discojs:models:gpt:model");
 /**
  * GPTModel extends tf.LayersModel and overrides tfjs' default training loop
  *
@@ -13,7 +13,7 @@ export class GPTModel extends tf.LayersModel {
     config;
     constructor(partialConfig, layersModel) {
         // Fill missing config parameters with default values
-        let completeConfig = { ...
+        let completeConfig = { ...DefaultGPTConfig, ...partialConfig };
         // Add layer sizes depending on which model has been specified
         completeConfig = { ...completeConfig, ...getModelSizes(completeConfig.modelType) };
         if (layersModel !== undefined) {
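The merge relies on spread precedence: later spreads win, so user-provided fields override the defaults, and the model sizes merged afterwards override both. A sketch with illustrative values (not the package's actual defaults):

const DefaultGPTConfig = { modelType: 'gpt-nano', contextLength: 128, dropout: 0.2 }; // illustrative
const partialConfig = { dropout: 0.0 };
const completeConfig = { ...DefaultGPTConfig, ...partialConfig };
console.log(completeConfig); // { modelType: 'gpt-nano', contextLength: 128, dropout: 0 }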
@@ -112,7 +112,7 @@ export class GPTModel extends tf.LayersModel {
             tf.dispose([xs, ys]);
         }
         let logs = {
-            'loss': averageLoss / iteration,
+            'loss': averageLoss / (iteration - 1), // -1 because iteration got incremented at the end of the loop
             'acc': accuracyFraction[0] / accuracyFraction[1],
         };
         if (evalDataset !== undefined) {
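The added comment explains the fix: if `iteration` starts at 1 and is incremented after each batch, it ends one past the batch count. A minimal sketch of the arithmetic, with the loop structure assumed from that comment and hypothetical per-batch losses:

let averageLoss = 0;
let iteration = 1;
for (const loss of [0.9, 0.7, 0.5]) { // hypothetical per-batch losses
    averageLoss += loss;
    iteration++;
}
console.log(averageLoss / iteration);       // 0.525: divides by 4, off by one
console.log(averageLoss / (iteration - 1)); // 0.7: the intended mean over 3 batches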
package/dist/processing/index.js
CHANGED
@@ -33,11 +33,11 @@ export async function preprocess(task, dataset) {
         // cast as typescript doesn't reduce generic type
         const d = dataset;
         const t = task;
+        const contextLength = task.trainingInformation.contextLength;
         const tokenizer = await models.getTaskTokenizer(t);
-
-
-
-            .map((line) => processing.tokenizeAndLeftPad(line, tokenizer, totalTokenCount))
+        return d.map(text => processing.tokenize(tokenizer, text))
+            .flatten()
+            .batch(contextLength + 1, 1)
             .map((tokens) => [tokens.pop(), tokens.last()]);
     }
 }
@@ -60,12 +60,11 @@ export async function preprocessWithoutLabel(task, dataset) {
         // cast as typescript doesn't reduce generic type
         const d = dataset;
         const t = task;
+        const contextLength = task.trainingInformation.contextLength;
         const tokenizer = await models.getTaskTokenizer(t);
-
-
-
-            .map((line) => processing.tokenizeAndLeftPad(line, tokenizer, totalTokenCount))
-            .map((tokens) => tokens.pop());
+        return d.map(text => processing.tokenize(tokenizer, text))
+            .flatten()
+            .batch(contextLength);
     }
 }
 }
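The new pipeline tokenizes each text, flattens everything into one token stream, and slides a window of contextLength + 1 tokens so that each sample's label is the token following its input. A plain-array sketch of that windowing, under my reading of `.batch(contextLength + 1, 1)` as a size-(contextLength + 1), stride-1 window:

const tokens = [10, 11, 12, 13, 14]; // hypothetical flattened token stream
const contextLength = 3;

const samples: Array<[number[], number]> = [];
for (let i = 0; i + contextLength < tokens.length; i++) {
    const window = tokens.slice(i, i + contextLength + 1);
    // mirrors [tokens.pop(), tokens.last()]: drop the last element for the
    // input, keep it as the label
    samples.push([window.slice(0, -1), window[window.length - 1]]);
}
console.log(samples); // [[[10, 11, 12], 13], [[11, 12, 13], 14]]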
package/dist/processing/text.d.ts
CHANGED

@@ -1,11 +1,21 @@
-import { List } from "immutable";
 import { PreTrainedTokenizer } from "@xenova/transformers";
-type
+import type { Text, TokenizedText } from '../index.js';
+interface TokenizingConfig {
+    padding?: boolean;
+    padding_side?: 'left' | 'right';
+    truncation?: boolean;
+    max_length?: number;
+}
 /**
- * Tokenize
+ * Tokenize one line of text.
+ * Wrapper around Transformers.js tokenizer to handle type checking and format the output.
+ * Note that Transformers.js's tokenizer can tokenize multiple lines of text at once
+ * but we are currently not making use of it. Can be useful when padding a batch
  *
- * @param
- * @
+ * @param tokenizer the tokenizer object
+ * @param text the text to tokenize
+ * @param config TokenizingConfig, the tokenizing parameters when using `tokenizer`
+ * @returns List<number> the tokenized text
  */
-export declare function
+export declare function tokenize(tokenizer: PreTrainedTokenizer, text: Text, config?: TokenizingConfig): TokenizedText;
 export {};