@epfml/discojs 3.0.1-p20241203151748.0 → 3.0.1-p20241206154707.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dataset/dataset.d.ts +18 -5
- package/dist/dataset/dataset.js +58 -23
- package/dist/dataset/types.d.ts +1 -0
- package/dist/default_tasks/wikitext.js +5 -3
- package/dist/models/gpt/config.d.ts +11 -6
- package/dist/models/gpt/config.js +11 -7
- package/dist/models/gpt/index.d.ts +5 -9
- package/dist/models/gpt/index.js +36 -15
- package/dist/models/gpt/layers.js +260 -82
- package/dist/models/gpt/model.d.ts +1 -1
- package/dist/models/gpt/model.js +4 -4
- package/dist/processing/index.js +8 -9
- package/dist/processing/text.d.ts +16 -6
- package/dist/processing/text.js +29 -26
- package/dist/task/training_information.d.ts +1 -1
- package/dist/task/training_information.js +3 -4
- package/dist/types/data_format.d.ts +2 -2
- package/dist/validator.js +2 -2
- package/package.json +1 -1
package/dist/dataset/dataset.d.ts
CHANGED
@@ -25,15 +25,22 @@ export declare class Dataset<T> implements AsyncIterable<T> {
      * @param ratio between 0 (all on left) and 1 (all on right)
      */
     split(ratio: number): [Dataset<T>, Dataset<T>];
-    /**
+    /** Create batches of `size` elements with potential overlap.
+     * Last batch is smaller if dataset isn't perfectly divisible
      *
-     *
+     * If overlap is set to a positive integer, the last `overlap` elements of a batch
+     * are the first `overlap` elements of the next batch.
+     *
+     * This method is tailored to create text sequences where each token's label is the following token.
+     * In order to have a label for the last token of the input sequence, we include the first token
+     * of the next sequence (i.e. with an overlap of 1).
      *
      * @param size count of element per chunk
+     * @param overlap number of elements overlapping between two consecutive batches
      */
-    batch(size: number): Dataset<Batched<T>>;
-    /** Flatten
-
+    batch(size: number, overlap?: number): Dataset<Batched<T>>;
+    /** Flatten batches/arrays of elements */
+    flatten<U>(this: Dataset<DatasetLike<U>>): Dataset<U>;
     /** Join side-by-side
      *
      * Stops as soon as one runs out
@@ -41,6 +48,12 @@ export declare class Dataset<T> implements AsyncIterable<T> {
      * @param other right side
      **/
     zip<U>(other: Dataset<U> | DatasetLike<U>): Dataset<[T, U]>;
+    /**
+     * Repeat the dataset `times` times
+     * @param times number of times to repeat the dataset, if undefined, the dataset is repeated indefinitely
+     * @returns a dataset repeated `times` times
+     */
+    repeat(times?: number): Dataset<T>;
     /** Compute size
      *
      * This is a costly operation as we need to go through the whole Dataset.
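For orientation, a minimal usage sketch of the new `batch` overlap and `repeat` declarations above — assuming `Dataset` is re-exported from the package root; the numeric values are made up:

import { Dataset } from "@epfml/discojs";

const tokens = new Dataset([1, 2, 3, 4, 5]);

// With overlap 1, each batch repeats the last element of the previous one:
// [1, 2, 3] then [3, 4, 5], so every element still has a successor in its batch.
for await (const b of tokens.batch(3, 1))
  console.log(b.toArray());

// repeat(2) iterates the underlying content twice; repeat() with no
// argument would loop indefinitely.
console.log(await tokens.repeat(2).size()); // 10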
package/dist/dataset/dataset.js
CHANGED
@@ -1,6 +1,22 @@
 import createDebug from "debug";
 import { List, Range } from "immutable";
 const debug = createDebug("discojs:dataset");
+/** Convert a DatasetLike object to an async generator */
+async function* datasetLikeToGenerator(content) {
+    let iter;
+    if (typeof content === "function")
+        iter = content();
+    else if (Symbol.asyncIterator in content)
+        iter = content[Symbol.asyncIterator]();
+    else
+        iter = content[Symbol.iterator]();
+    while (true) {
+        const result = await iter.next();
+        if (result.done === true)
+            break;
+        yield result.value;
+    }
+}
 /** Immutable series of data */
 export class Dataset {
     #content;
@@ -11,19 +27,7 @@ export class Dataset {
      */
     constructor(content) {
        this.#content = async function* () {
-
-            if (typeof content === "function")
-                iter = content();
-            else if (Symbol.asyncIterator in content)
-                iter = content[Symbol.asyncIterator]();
-            else
-                iter = content[Symbol.iterator]();
-            while (true) {
-                const result = await iter.next();
-                if (result.done === true)
-                    break;
-                yield result.value;
-            }
+            yield* datasetLikeToGenerator(content);
        };
     }
     [Symbol.asyncIterator]() {
@@ -87,19 +91,31 @@ export class Dataset {
            }.bind(this)),
        ];
    }
-    /**
+    /** Create batches of `size` elements with potential overlap.
+     * Last batch is smaller if dataset isn't perfectly divisible
+     *
+     * If overlap is set to a positive integer, the last `overlap` elements of a batch
+     * are the first `overlap` elements of the next batch.
     *
-     *
+     * This method is tailored to create text sequences where each token's label is the following token.
+     * In order to have a label for the last token of the input sequence, we include the first token
+     * of the next sequence (i.e. with an overlap of 1).
     *
     * @param size count of element per chunk
+     * @param overlap number of elements overlapping between two consecutive batches
     */
-    batch(size) {
+    batch(size, overlap = 0) {
        if (size <= 0 || !Number.isInteger(size))
            throw new Error("invalid size");
+        if (overlap >= size || !Number.isInteger(overlap))
+            throw new Error("invalid overlap");
        return new Dataset(async function* () {
            const iter = this[Symbol.asyncIterator]();
+            let overlapped = List();
            for (;;) {
-                const batch = List(
+                const batch = List(
+                // get the first elements of the next batch
+                await Promise.all(Range(overlapped.size, size).map(() => iter.next()))).flatMap((res) => {
                    if (res.done)
                        return [];
                    else
@@ -107,18 +123,21 @@ export class Dataset {
                });
                if (batch.isEmpty())
                    break;
-                yield batch
+                // yield the current batch with the first elements of the next batch
+                yield overlapped.concat(batch);
+                overlapped = batch.takeLast(overlap);
                // iterator couldn't generate more
-                if (batch.size < size)
+                if (batch.size < size - overlap)
                    break;
            }
        }.bind(this));
    }
-    /** Flatten
-
+    /** Flatten batches/arrays of elements */
+    flatten() {
        return new Dataset(async function* () {
-            for await (const batch of this)
-                yield* batch;
+            for await (const batch of this) {
+                yield* datasetLikeToGenerator(batch);
+            }
        }.bind(this));
    }
    /** Join side-by-side
@@ -141,6 +160,22 @@ export class Dataset {
            }
        }.bind(this));
    }
+    /**
+     * Repeat the dataset `times` times
+     * @param times number of times to repeat the dataset, if undefined, the dataset is repeated indefinitely
+     * @returns a dataset repeated `times` times
+     */
+    repeat(times) {
+        if (times !== undefined && (!Number.isInteger(times) || times < 1))
+            throw new Error("times needs to be a positive integer or undefined");
+        return new Dataset(async function* () {
+            let loop = 0;
+            do {
+                yield* this;
+                loop++;
+            } while (times === undefined || loop < times);
+        }.bind(this));
+    }
    /** Compute size
     *
     * This is a costly operation as we need to go through the whole Dataset.
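A short sketch of what the extracted `datasetLikeToGenerator` helper accepts — the same `DatasetLike` shapes the constructor already took, now also usable by `flatten` (values are illustrative):

// Any of these are DatasetLike: a sync iterable, an async iterable,
// or a zero-argument (async) generator function.
const fromArray = new Dataset([1, 2, 3]);
const fromGenerator = new Dataset(async function* () { yield 4; yield 5; });

// flatten() now unwraps any nested DatasetLike, not only batched Lists:
const flat = new Dataset([[1, 2], [3]]).flatten(); // yields 1, 2, 3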
package/dist/dataset/types.d.ts
CHANGED
package/dist/default_tasks/wikitext.js
CHANGED
@@ -31,14 +31,16 @@ export const wikitext = {
                // But if set to 0 then the webapp doesn't display the validation metrics
                validationSplit: 0.1,
                roundDuration: 2,
-                batchSize:
+                batchSize: 8, // If set too high firefox raises a WebGL error
                tokenizer: 'Xenova/gpt2',
-
+                contextLength: 64,
                tensorBackend: 'gpt'
            }
        };
    },
    getModel() {
-        return Promise.resolve(new models.GPT(
+        return Promise.resolve(new models.GPT({
+            contextLength: this.getTask().trainingInformation.contextLength,
+        }));
    }
};
package/dist/models/gpt/config.d.ts
CHANGED
@@ -1,8 +1,8 @@
 type GPTModelType = 'gpt2' | 'gpt2-medium' | 'gpt2-large' | 'gpt2-xl' | 'gpt-mini' | 'gpt-micro' | 'gpt-nano';
 export interface GPTConfig {
     lr: number;
-
-    vocabSize
+    contextLength: number;
+    vocabSize?: number;
     modelType: GPTModelType;
     name?: string;
     evaluate?: boolean;
@@ -11,22 +11,27 @@ export interface GPTConfig {
     maxIter?: number;
     weightDecay?: number;
     verbose?: 0 | 1;
-    bias?: boolean;
     debug?: boolean;
     dropout?: number;
     residDrop?: number;
     embdDrop?: number;
-    tokEmb?: boolean;
-    lmHead?: boolean;
     nLayer?: number;
     nHead?: number;
     nEmbd?: number;
+    seed?: number;
 }
-export declare const
+export declare const DefaultGPTConfig: Required<GPTConfig>;
 export type ModelSize = {
     nLayer: number;
     nHead: number;
     nEmbd: number;
 };
 export declare function getModelSizes(modelType: GPTModelType): Required<ModelSize>;
+export interface GenerationConfig {
+    doSample: boolean;
+    temperature: number;
+    topk: number;
+    seed: number;
+}
+export declare const DefaultGenerationConfig: Required<GenerationConfig>;
 export {};
package/dist/models/gpt/config.js
CHANGED
@@ -1,6 +1,6 @@
 // for a benchmark of performance, see https://github.com/epfml/disco/pull/659
-export const
-    name: 'transformer',
+export const DefaultGPTConfig = {
+    name: 'transformer', // prefix for the model layer names
     lr: 0.001,
     weightDecay: 0,
     maxIter: 10,
@@ -9,18 +9,16 @@ export const DEFAULT_CONFIG = {
     evaluate: true,
     maxEvalBatches: 12,
     evaluateEvery: 100,
-
-    vocabSize:
-    bias: true,
+    contextLength: 128,
+    vocabSize: 50257,
     debug: false,
     dropout: 0.2,
     residDrop: 0.2,
     embdDrop: 0.2,
-    tokEmb: true,
-    lmHead: true,
     nLayer: 3,
     nHead: 3,
     nEmbd: 48,
+    seed: Math.random(),
 };
 export function getModelSizes(modelType) {
     switch (modelType) {
@@ -40,3 +38,9 @@ export function getModelSizes(modelType) {
             return { nLayer: 3, nHead: 3, nEmbd: 48 };
     }
 }
+export const DefaultGenerationConfig = {
+    temperature: 1.0,
+    doSample: false,
+    seed: Math.random(),
+    topk: 50
+};
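A hedged sketch of how these defaults combine with a user-supplied partial config, mirroring the `{ ...DefaultGPTConfig, ...partialConfig }` merge in model.js further down; it assumes the package root re-exports `models`:

import { models } from "@epfml/discojs";

// Unset fields fall back to DefaultGPTConfig; modelType selects layer sizes.
const gpt = new models.GPT({ modelType: "gpt-nano", contextLength: 64 });
console.log(gpt.config.contextLength); // 64 (user override)
console.log(gpt.config.vocabSize);     // 50257 (from DefaultGPTConfig)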
package/dist/models/gpt/index.d.ts
CHANGED
@@ -1,23 +1,20 @@
 /**
- *
+ * Source: https://github.com/zemlyansky/gpt-tfjs and https://github.com/karpathy/build-nanogpt
+ * With modifications from @peacefulotter, @lukemovement and the Disco team
 **/
 import * as tf from "@tensorflow/tfjs";
 import type { Batched, Dataset, DataFormat } from "../../index.js";
 import { WeightsContainer } from "../../index.js";
 import { BatchLogs, Model, EpochLogs } from "../index.js";
-import {
+import type { GPTConfig, GenerationConfig } from './config.js';
 export type GPTSerialization = {
     weights: WeightsContainer;
     config?: GPTConfig;
 };
-interface PredictConfig {
-    temperature: number;
-    doSample: boolean;
-}
 export declare class GPT extends Model<"text"> {
     #private;
     private readonly model;
-    constructor(partialConfig?: GPTConfig
+    constructor(partialConfig?: Partial<GPTConfig>, layersModel?: tf.LayersModel);
     /**
      * The GPT train methods wraps the model.fitDataset call in a for loop to act as a generator (of logs)
      * This allows for getting logs and stopping training without callbacks.
@@ -28,7 +25,7 @@ export declare class GPT extends Model<"text"> {
      * @param tracker
      */
     train(trainingDataset: Dataset<Batched<DataFormat.ModelEncoded["text"]>>, validationDataset?: Dataset<Batched<DataFormat.ModelEncoded["text"]>>): AsyncGenerator<BatchLogs, EpochLogs>;
-    predict(batch: Batched<DataFormat.ModelEncoded["text"][0]>, options?: Partial<
+    predict(batch: Batched<DataFormat.ModelEncoded["text"][0]>, options?: Partial<GenerationConfig>): Promise<Batched<DataFormat.ModelEncoded["text"][1]>>;
     get config(): Required<GPTConfig>;
     get weights(): WeightsContainer;
     set weights(ws: WeightsContainer);
@@ -37,4 +34,3 @@ export declare class GPT extends Model<"text"> {
     extract(): tf.LayersModel;
     [Symbol.dispose](): void;
 }
-export {};
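A usage sketch for the new `predict` signature; `gpt` and `prompts` are hypothetical stand-ins (a constructed GPT instance and a batched list of token sequences):

// Omitted GenerationConfig fields fall back to DefaultGenerationConfig,
// so predict(prompts) or predict(prompts, {}) gives greedy decoding.
const nextTokens = await gpt.predict(prompts, {
  doSample: true,   // sample among the topk most likely tokens
  topk: 10,
  temperature: 0.8, // >1 flattens, <1 sharpens the distribution
});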
package/dist/models/gpt/index.js
CHANGED
@@ -1,5 +1,6 @@
 /**
- *
+ * Source: https://github.com/zemlyansky/gpt-tfjs and https://github.com/karpathy/build-nanogpt
+ * With modifications from @peacefulotter, @lukemovement and the Disco team
 **/
 import createDebug from "debug";
 import { List, Range } from "immutable";
@@ -7,12 +8,12 @@ import * as tf from "@tensorflow/tfjs";
 import { WeightsContainer } from "../../index.js";
 import { Model, EpochLogs } from "../index.js";
 import { GPTModel } from "./model.js";
-import { DEFAULT_CONFIG } from "./config.js";
 import evaluate from "./evaluate.js";
+import { DefaultGPTConfig, DefaultGenerationConfig } from './config.js';
 const debug = createDebug("discojs:models:gpt");
 export class GPT extends Model {
     model;
-    #
+    #contextLength;
     #maxBatchCount;
     #vocabSize;
     constructor(partialConfig, layersModel) {
@@ -20,9 +21,9 @@ export class GPT extends Model {
         const model = new GPTModel(partialConfig, layersModel);
         model.compile();
         this.model = model;
-        this.#
-        this.#maxBatchCount = partialConfig?.maxIter ??
-        this.#vocabSize = partialConfig?.vocabSize ??
+        this.#contextLength = partialConfig?.contextLength ?? DefaultGPTConfig.contextLength;
+        this.#maxBatchCount = partialConfig?.maxIter ?? DefaultGPTConfig.maxIter;
+        this.#vocabSize = partialConfig?.vocabSize ?? DefaultGPTConfig.vocabSize;
     }
     /**
      * The GPT train methods wraps the model.fitDataset call in a for loop to act as a generator (of logs)
@@ -85,16 +86,21 @@ export class GPT extends Model {
        }));
    }
    async predict(batch, options) {
-
-
-            doSample: false,
-            ...options,
-        };
+        // overwrite default with user config
+        const config = Object.assign({}, DefaultGenerationConfig, options);
        return List(await Promise.all(batch.map((tokens) => this.#predictSingle(tokens, config))));
    }
+    /**
+     * Generate the next token after the input sequence.
+     * In other words, takes an input tensor of shape (prompt length T) and returns a tensor of shape (T+1)
+     *
+     * @param token input tokens of shape (T,). T is truncated to the model's context length
+     * @param config generation config: temperature, doSample, topk
+     * @returns the next token predicted by the model
+     */
    async #predictSingle(tokens, config) {
        // slice input tokens if longer than context length
-        tokens = tokens.slice(-this.#
+        tokens = tokens.slice(-this.#contextLength);
        const input = tf.tidy(() => tf.tensor1d(tokens.toArray(), "int32").expandDims(0));
        const logits = tf.tidy(() => {
            const output = this.model.predict(input);
@@ -111,9 +117,24 @@ export class GPT extends Model {
            .div(config.temperature)
            .softmax());
        logits.dispose();
-        const next = tf.tidy(() =>
-
-
+        const next = tf.tidy(() => {
+            if (config.doSample) {
+                // returns topk biggest values among the `vocab_size` probabilities and the corresponding tokens indices
+                // both shapes are (config.topk,)
+                const { values: topkProbs, indices: topkTokens } = tf.topk(probs, config.topk);
+                // sample an index from the top-k probabilities
+                // e.g. [[0.1, 0.4, 0.3], [0.1, 0.2, 0.5]] -> [[1], [2]]
+                // note: multinomial does not need the input to sum to 1
+                const selectedIndices = tf.multinomial(topkProbs, 1, config.seed, false); // (B, )
+                // return the corresponding token from the sampled indices (one per sequence in the batch).
+                // if for some reason the probabilities are NaN, selectedIndices will be out of bounds
+                return topkTokens.gather(selectedIndices).squeeze([0]); // (1)
+            }
+            else {
+                // greedy decoding: return the token with the highest probability
+                return probs.argMax();
+            }
+        });
        probs.dispose();
        const ret = await next.array();
        next.dispose();
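The sampling step of `#predictSingle` in isolation — a self-contained sketch using a toy probability vector instead of model logits (the seed is arbitrary):

import * as tf from "@tensorflow/tfjs";

const probs = tf.tensor1d([0.05, 0.4, 0.3, 0.25]);
// Keep the 2 most likely entries and their token indices, both of shape (2,)
const { values: topkProbs, indices: topkTokens } = tf.topk(probs, 2);
// Sample one index into the top-k values; normalized=true treats the
// values as (unnormalized) probabilities rather than logits.
const sampled = tf.multinomial(topkProbs, 1, 42, true);
topkTokens.gather(sampled).print(); // token id 1 or 2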
package/dist/models/gpt/layers.js
CHANGED
@@ -1,4 +1,6 @@
+import createDebug from "debug";
 import * as tf from '@tensorflow/tfjs';
+const debug = createDebug("discojs:models:gpt:layers");
 /**
  * Defines a range, from 0 to T, that is used to create positional embeddings
  */
@@ -10,7 +12,8 @@ class Range extends tf.layers.Layer {
     call(input, kwargs) {
         return tf.tidy(() => {
             if (Array.isArray(input)) {
-
+                if (input.length !== 1)
+                    throw new Error('expected exactly one tensor');
                 input = input[0];
             }
             this.invokeCallHook(input, kwargs);
@@ -22,6 +25,11 @@ class Range extends tf.layers.Layer {
     }
 }
 tf.serialization.registerClass(Range);
+/**
+ * LogLayer is a layer that allows debugging the input that is fed to this layer
+ * This layer allows to inspect the input tensor at a specific point
+ * in the model by adding a log layer in the model definition
+ */
 class LogLayer extends tf.layers.Layer {
     static className = 'LogLayer';
     computeOutputShape(inputShape) {
@@ -30,9 +38,19 @@ class LogLayer extends tf.layers.Layer {
     call(input, kwargs) {
         return tf.tidy(() => {
             if (Array.isArray(input)) {
+                if (input.length !== 1)
+                    throw new Error('expected exactly one tensor');
                 input = input[0];
             }
             this.invokeCallHook(input, kwargs);
+            const logs = {
+                'shape': input.shape,
+                'is_only_zero': !!input.equal(tf.tensor(0)).all().dataSync()[0],
+                'has_some_NaN': !!input.isNaN().any().dataSync()[0],
+                'min': +input.min().dataSync()[0].toPrecision(3),
+                'max': +input.max().dataSync()[0].toPrecision(3),
+            };
+            debug("%s logged: %o", this.name, logs);
             return input;
         });
     }
@@ -43,8 +61,9 @@ class CausalSelfAttention extends tf.layers.Layer {
     static className = 'CausalSelfAttention';
     nHead;
     nEmbd;
+    nLayer;
     dropout;
-
+    seed;
     mask;
     cAttnKernel;
     cAttnBias;
@@ -53,20 +72,34 @@ class CausalSelfAttention extends tf.layers.Layer {
     constructor(config) {
         super(config);
         this.config = config;
+        if (config.nEmbd % config.nHead !== 0)
+            throw new Error('The embedding dimension `nEmbd` must be divisible by the number of attention heads `nHead`');
         this.nEmbd = config.nEmbd;
         this.nHead = config.nHead;
+        this.nLayer = config.nLayer;
         this.dropout = config.dropout;
-        this.
+        this.seed = config.seed;
         // mask is a lower triangular matrix filled with 1
         // calling bandPart zero out the upper triangular part of the all-ones matrix
         // from the doc: tf.linalg.band_part(input, -1, 0) ==> Lower triangular part
-        this.mask = tf.linalg.bandPart(tf.ones([config.
+        this.mask = tf.linalg.bandPart(tf.ones([config.contextLength, config.contextLength]), -1, 0);
     }
     build() {
-
-        this.
-
-        this.
+        // key, query, value projections for all heads, but in a batch
+        this.cAttnKernel = this.addWeight('c_attn.weight', [this.nEmbd, 3 * this.nEmbd], 'float32', tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: this.seed }) // use same init as GPT2
+        );
+        this.cAttnBias = this.addWeight('c_attn.bias', [3 * this.nEmbd], 'float32', tf.initializers.zeros());
+        // output projection
+        this.cProjKernel = this.addWeight('c_proj.kernel', [this.nEmbd, this.nEmbd], 'float32',
+        // the input keeps accumulating through the residual stream so we
+        // scale the initialization with the nb of layers to keep a unit std
+        // Sources:
+        // https://github.com/karpathy/build-nanogpt/blob/6104ab1b53920f6e2159749676073ff7d815c1fa/train_gpt2.py#L103
+        // https://youtu.be/l8pRSuU81PU?si=5GcKfi_kPgLgvtg2&t=4640
+        tf.initializers.randomNormal({
+            mean: 0, stddev: 0.02 * Math.sqrt(2 * this.nLayer), seed: this.seed
+        }));
+        this.cProjBias = this.addWeight('c_proj.bias', [this.nEmbd], 'float32', tf.initializers.zeros());
     }
     computeOutputShape(inputShape) {
         return inputShape;
@@ -84,58 +117,72 @@ class CausalSelfAttention extends tf.layers.Layer {
                 throw new Error('not built');
             }
             if (Array.isArray(input)) {
+                if (input.length !== 1)
+                    throw new Error('expected exactly one tensor');
                 input = input[0];
             }
             this.invokeCallHook(input, kwargs);
             const dense = (x, kernel, bias) => {
+                // TODO: use broadcasting when tfjs will support backpropagating through broadcasting
                 const k = kernel.read().expandDims(0).tile([x.shape[0], 1, 1]);
                 const m = x.matMul(k);
-
-                return tf.add(m, bias.read());
-                }
-                else {
-                    return m;
-                }
+                return tf.add(m, bias.read());
             };
             // Apply attention weights to inputs as one big matrix which is then split into the
             // query, key and value submatrices
+            // nHead is "number of heads", hs is "head size", and C (number of channels) = n_embd = nHead * hs
+            // e.g. in GPT-2 (124M), nHead = 12, hs = 64, so nHead * hs = C = 768 channels in the Transformer
             const cAttn = dense(input, this.cAttnKernel, this.cAttnBias);
             let [q, k, v] = tf.split(cAttn, 3, -1);
-
-            const
-
-
-
-
-
-
-            //
-            //
+            // Follow naming conventions in https://github.com/karpathy/build-nanogpt/
+            const [B, T, C] = k.shape; // batch size, sequence length, embedding dimensionality (number of channels)
+            const splitHeads = (x) => tf.transpose(tf.reshape(x, [B, T, this.nHead, C / this.nHead]), // (B, T, nHead, head size)
+            [0, 2, 1, 3] // (B, nHead, T, hs)
+            );
+            q = splitHeads(q); // (B, nHead, T, hs)
+            k = splitHeads(k); // (B, nHead, T, hs)
+            v = splitHeads(v); // (B, nHead, T, hs)
+            // Scaled self attention: query @ key / sqrt(hs)
+            // Matrix representing the token-to-token attention (B, nHead, T, T)
+            let att = tf.mul(tf.matMul(q, k, false, true), // (B, nHead, T, hs) x (B, nHead, hs, T) -> (B, nHead, T, T)
+            tf.div(1, tf.sqrt(tf.cast(k.shape[k.shape.length - 1], 'float32'))) // 1 / sqrt(hs)
+            );
+            /**
+             * The next operations apply attention only on the past tokens, which is
+             * essentially a weighted average of the past tokens with complicated weights,
+             * it relies on a mask to not "pay any attention" to future tokens
+             */
             // mask is lower triangular matrix filled with 1
-            const mask = this.mask.slice([0, 0], [T, T]);
+            const mask = this.mask.slice([0, 0], [T, T]); // (T, T)
             // 1 - mask => upper triangular matrix filled with 1
             // (1 - mask) * -10^9 => upper triangular matrix filled with -inf
             // att + ((1 - mask) * -10^9) => lower triangular part is the same as the `att` matrix
             // upper triangular part is -inf
-            att = tf.add(att, tf.mul(tf.sub(1, mask), -1e9));
-            // applying softmax
-            //
+            att = tf.add(att, tf.mul(tf.sub(1, mask), -1e9)); // (B, nHead, T, T)
+            // applying softmax zeroes out the upper triangular part (softmax(-inf) = 0)
+            // i.e., zeroes out future tokens's attention weights
             // and creates a probability distribution for the lower triangular
             // (attention weights of past tokens). The probability distribution ensures
             // that the attention weights of past tokens for a particular token sum to one
             att = tf.softmax(att, -1);
-            att = kwargs.training === true ? tf.dropout(att, this.dropout) : att;
+            att = kwargs.training === true ? tf.dropout(att, this.dropout, undefined, this.seed) : att;
             // This is where the (attention-)weighted sum of past values is performed
-            let y = tf.matMul(att, v);
-            y = tf.transpose(y, [0, 2, 1, 3]);
-            y = tf.reshape(y, [B, T, C]);
-            y = dense(y, this.cProjKernel, this.cProjBias);
-            y = kwargs.training === true ? tf.dropout(y, this.dropout) : y;
+            let y = tf.matMul(att, v); // (B, nHead, T, T) x (B, nHead, T, hs) -> (B, nHead, T, hs)
+            y = tf.transpose(y, [0, 2, 1, 3]); // (B, T, nHead, hs)
+            y = tf.reshape(y, [B, T, C]); // (B, T, C = nHead * hs)
+            y = dense(y, this.cProjKernel, this.cProjBias); // output projection (B, T, C)
+            y = kwargs.training === true ? tf.dropout(y, this.dropout, undefined, this.seed) : y;
             return y;
         });
     }
 }
 tf.serialization.registerClass(CausalSelfAttention);
+/**
+ * GELU with tanh approximate
+ * GELU(x) = x * 0.5 * (1 + Tanh[sqrt(2/π) * (x + 0.044715 * x^3)])
+ *
+ * https://pytorch.org/docs/stable/generated/torch.nn.GELU.html
+ */
 class GELU extends tf.layers.Layer {
     static className = 'GELU';
     constructor() {
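The masking trick in the hunk above, shown standalone (a sketch; T = 4 is arbitrary):

import * as tf from "@tensorflow/tfjs";

const T = 4;
// bandPart(x, -1, 0) keeps the lower triangle: token i may attend to j <= i
const mask = tf.linalg.bandPart(tf.ones([T, T]), -1, 0);
// (1 - mask) * -1e9 puts -inf on the upper triangle; after softmax those
// positions become 0, so future tokens receive no attention weight.
tf.mul(tf.sub(1, mask), -1e9).print();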
@@ -148,11 +195,17 @@ class GELU extends tf.layers.Layer {
         return tf.tidy(() => {
             if (Array.isArray(input)) {
                 // TODO support multitensor
+                if (input.length !== 1)
+                    throw new Error('expected exactly one tensor');
                 input = input[0];
             }
             this.invokeCallHook(input, kwargs);
-            const cdf = tf.mul(0.5
-
+            const cdf = tf.mul(// 0.5 * (1 + Tanh[sqrt(2/π) * (x + 0.044715 * x^3)])
+            0.5, tf.add(1, tf.tanh(// Tanh[sqrt(2/π) * (x + 0.044715 * x^3)]
+            tf.mul(tf.sqrt(tf.div(2, Math.PI)), // (sqrt(2/π)
+            tf.add(input, tf.mul(0.044715, tf.pow(input, 3))) // (x + 0.044715 * x^3)
+            ))));
+            return tf.mul(input, cdf); // x * 0.5 * (1 + Tanh[sqrt(2/π) * (x + 0.044715 * x^3)])
         });
     }
 }
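The same tanh-approximate GELU as a plain function, for checking against the formula in the comments (output values rounded):

import * as tf from "@tensorflow/tfjs";

function gelu(x: tf.Tensor): tf.Tensor {
  // x * 0.5 * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x³)))
  const inner = tf.mul(Math.sqrt(2 / Math.PI),
    tf.add(x, tf.mul(0.044715, tf.pow(x, 3))));
  return tf.mul(x, tf.mul(0.5, tf.add(1, tf.tanh(inner))));
}

gelu(tf.tensor1d([-1, 0, 1])).print(); // ≈ [-0.159, 0, 0.841]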
@@ -160,48 +213,173 @@ tf.serialization.registerClass(GELU);
 function MLP(config) {
     return tf.sequential({ layers: [
             tf.layers.dense({
-                name: config.name +
+                name: config.name + `.mlp.c_fc`,
                 units: 4 * config.nEmbd,
                 inputDim: config.nEmbd,
-                inputShape: [config.
+                inputShape: [config.contextLength, config.nEmbd],
+                kernelInitializer: tf.initializers.randomNormal({
+                    mean: 0, stddev: 0.02, seed: config.seed
+                }),
             }),
             new GELU(),
             tf.layers.dense({
-                name: config.name + '
+                name: config.name + '.mlp.c_proj',
                 units: config.nEmbd,
                 inputDim: 4 * config.nEmbd,
-                inputShape: [config.
+                inputShape: [config.contextLength, 4 * config.nEmbd],
+                kernelInitializer: tf.initializers.randomNormal({
+                    mean: 0, stddev: 0.02 * Math.sqrt(2 * config.nLayer), seed: config.seed
+                }),
             }),
             tf.layers.dropout({
-                name: config.name + '
-                rate: config.residDrop
+                name: config.name + '.mlp.drop',
+                rate: config.residDrop,
+                seed: config.seed
             }),
         ] });
 }
+/**
+ * Performs the following operations:
+ * x1 = input + mlp(layernorm_1(input))
+ * output = x1 + mlp(layernorm_2(x1))
+ */
 function TransformerBlock(conf) {
-    const config = Object.assign({ name: 'h' }, conf);
-    const inputs = tf.input({ shape: [config.
+    const config = Object.assign({ name: '.h' }, conf);
+    const inputs = tf.input({ shape: [config.contextLength, config.nEmbd] });
     let x1, x2;
     // input normalization
-    x1 = tf.layers.layerNormalization({
-    .
+    x1 = tf.layers.layerNormalization({
+        name: config.name + '.ln_1',
+        epsilon: 1e-5,
+        gammaInitializer: 'ones', // already the default but make it explicit
+        betaInitializer: 'zeros',
+    }).apply(inputs);
     if (config.debug) {
-        x1 = new LogLayer({ name: config.name + '
+        x1 = new LogLayer({ name: config.name + '.ln_1_log' }).apply(x1);
     }
     // self attention layer
-    x1 = new CausalSelfAttention(Object.assign({}, config, { name: config.name + '
+    x1 = new CausalSelfAttention(Object.assign({}, config, { name: config.name + '.attn' })).apply(x1);
+    if (config.debug) {
+        x1 = new LogLayer({ name: config.name + '.attn_log' }).apply(x1);
+    }
     // Residual connection
     x1 = tf.layers.add().apply([inputs, x1]);
+    if (config.debug) {
+        x1 = new LogLayer({ name: config.name + '.residual_log' }).apply(x1);
+    }
     // normalization
-    x2 = tf.layers
-
-
+    x2 = tf.layers.layerNormalization({
+        name: config.name + '.ln_2',
+        epsilon: 1e-5,
+        gammaInitializer: 'ones',
+        betaInitializer: 'zeros',
+    }).apply(x1);
+    if (config.debug) {
+        x2 = new LogLayer({ name: config.name + '.ln_2_log' }).apply(x2);
+    }
     // MLP
-    x2 = MLP(Object.assign({}, config, { name: config.name })).apply(x2);
+    x2 = MLP(Object.assign({}, config, { name: config.name + '.mlp' })).apply(x2);
+    if (config.debug) {
+        x2 = new LogLayer({ name: config.name + '.mlp_log' }).apply(x2);
+    }
     // add attention output to mlp output
     x2 = tf.layers.add().apply([x1, x2]);
+    if (config.debug) {
+        x2 = new LogLayer({ name: config.name + '.add_log' }).apply(x2);
+    }
     return tf.model({ name: config.name, inputs, outputs: x2 });
 }
+/**
+ * LanguageModelEmbedding is a layer that combines the token embeddings and the language modeling head
+ * I.e. LMEmbedding is used to translate token indices into token embeddings
+ * as well as to project embeddings back into token indices
+ * The GPT2 model uses the same embedding matrix for both the token embeddings and the language modeling head
+ * Because Tensorflow.js doesn't offer an easy weight sharing mechanism, we need to define a custom layer
+ * that can be used for both the token embeddings and the language modeling head.
+ * In the GPT2 model definition, this layers corresponds to wte and lm_head (which reuses wte)
+ */
+class LMEmbedding extends tf.layers.Layer {
+    vocabSize;
+    nEmbd;
+    seed;
+    static className = 'LMEmbedding';
+    embeddings;
+    constructor(vocabSize, nEmbd, seed) {
+        super({});
+        this.vocabSize = vocabSize;
+        this.nEmbd = nEmbd;
+        this.seed = seed;
+    }
+    build() {
+        this.embeddings = this.addWeight('wte', //use same name as GPT2
+        [this.vocabSize, this.nEmbd], 'float32', tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: this.seed }));
+    }
+    computeOutputShape(inputShape) {
+        let shape;
+        if (Array.isArray(inputShape) && Array.isArray(inputShape[0])) {
+            if (inputShape.length !== 1)
+                throw new Error('Expected exactly one Shape');
+            shape = inputShape[0];
+        }
+        else
+            shape = inputShape;
+        // input shape for the token embedding
+        if (shape.length === 2) {
+            // https://github.com/tensorflow/tfjs/blob/3daf152cb794f4da58fce5e21e09e8a4f89c8f80/tfjs-layers/src/layers/embeddings.ts#L155
+            // batch size and sequence length are undetermined
+            // so the output shape is [null, null, nEmbd]
+            if (shape[0] !== null || shape[1] !== null)
+                throw new Error('expected shape [null, null, ...]');
+            return [null, null, this.nEmbd];
+        }
+        // input shape for the language modeling head
+        // https://github.com/tensorflow/tfjs/blob/3daf152cb794f4da58fce5e21e09e8a4f89c8f80/tfjs-layers/src/layers/core.ts#L258
+        else if (shape.length === 3) {
+            // batch size and sequence length are undetermined
+            // so the output shape is [null, null, nEmbd]
+            if (shape[0] !== null || shape[1] !== null)
+                throw new Error('expected shape [null, null, ...]');
+            return [null, null, this.vocabSize];
+        }
+        else
+            throw new Error('unexpected input shape');
+    }
+    call(input, kwargs) {
+        return tf.tidy(() => {
+            if (this.embeddings === undefined)
+                throw new Error('not built');
+            if (Array.isArray(input)) {
+                if (input.length !== 1)
+                    throw new Error('expected exactly one tensor');
+                input = input[0];
+            }
+            this.invokeCallHook(input, kwargs);
+            // If the input is a 2D tensor, it is a batch of sequences of tokens
+            // so we translate the tokens into embeddings
+            // using `this.embeddings` as a lookup table
+            if (input.shape.length === 2) {
+                // (batch_size, sequence_length) => (batch_size, sequence_length, nEmbd)
+                return tf.gather(this.embeddings.read(), tf.cast(input, 'int32'), 0);
+            }
+            // If the input is a 3D tensor, it is a sequence of embeddings
+            // so we apply a dense layer to project the embeddings back into the vocabulary space
+            else if (input.shape.length === 3 && input.shape[2] === this.nEmbd) {
+                // Replicate the kernel for each batch element
+                const kernel = this.embeddings.read().expandDims(0).tile([input.shape[0], 1, 1]);
+                // TODO: rely on broadcasting when tfjs will support backpropagating through broadcasting
+                // Remove the tile, or use tf.einsum('BTE,VE->BTV', input, this.embeddings.read())
+                // to prevent tensor duplication but tensorflow.js fails to backpropagate einsum
+                // https://github.com/tensorflow/tfjs/issues/5690
+                // (batch_size, sequence_length, nEmbd) x (vocabSize, nEmbd)^T -> (batch_size, sequence_length, vocabSize)
+                return tf.matMul(input, kernel, false, true);
+            }
+            else {
+                throw new Error('unexpected input shape for token embeddings');
+            }
+        });
+    }
+}
+tf.serialization.registerClass(LMEmbedding);
 /**
  * The GPTArchitecture specifically defines a GPT forward pass, i.e.,
  * what are the inputs, the successive transformer blocks and the outputs. It is then
@@ -212,54 +390,54 @@ function TransformerBlock(conf) {
  */
 export function GPTArchitecture(config) {
     const inputs = tf.input({ shape: [null] });
-    //
-    const
-
-
-
-
-        embeddingsInitializer: 'zeros',
-        embeddingsRegularizer: undefined,
-        activityRegularizer: undefined
-    }).apply(inputs)
-        : inputs;
+    // token embedding
+    const wte = new LMEmbedding(config.vocabSize, config.nEmbd, config.seed);
+    let tokEmb = wte.apply(inputs); // (batch_size, input length T, nEmbd)
+    if (config.debug) {
+        tokEmb = new LogLayer({ name: 'tokEmb_log' }).apply(tokEmb);
+    }
     // Positional embedding
     const range = new Range({}).apply(inputs);
     let posEmb = tf.layers.embedding({
-        name: config.name + '
-        inputDim: config.
+        name: config.name + '.wpe',
+        inputDim: config.contextLength,
         outputDim: config.nEmbd,
-        embeddingsInitializer:
+        embeddingsInitializer: tf.initializers.randomNormal({
+            mean: 0, stddev: 0.02, seed: config.seed
+        }),
     }).apply(range);
     if (config.debug) {
-        posEmb = new LogLayer({ name: '
+        posEmb = new LogLayer({ name: 'posEmb_log' }).apply(posEmb);
     }
     // token and positional embeddings are added together
     let x = tf.layers.add().apply([tokEmb, posEmb]);
     // dropout
-    x = tf.layers.dropout({
+    x = tf.layers.dropout({
+        name: 'drop', rate: config.embdDrop, seed: config.seed
+    }).apply(x);
     if (config.debug) {
-        x = new LogLayer({ name: '
+        x = new LogLayer({ name: 'drop_log' }).apply(x);
     }
-    //
+    // apply successively transformer blocks, attention and dense layers
     for (let i = 0; i < config.nLayer; i++) {
-        x = TransformerBlock(Object.assign({}, config, { name: config.name + '
+        x = TransformerBlock(Object.assign({}, config, { name: config.name + '.h' + i })).apply(x);
     }
     // Normalization
-    x = tf.layers.layerNormalization({
+    x = tf.layers.layerNormalization({
+        name: config.name + '.ln_f',
+        epsilon: 1e-5,
+        gammaInitializer: 'ones',
+        betaInitializer: 'zeros',
+    })
         .apply(x);
     if (config.debug) {
-        x = new LogLayer({ name: '
+        x = new LogLayer({ name: 'ln_f_log' }).apply(x);
     }
-    //
-
-
-
-
-        inputDim: config.nEmbd,
-        inputShape: [config.blockSize, config.nEmbd],
-        useBias: false
-    }).apply(x);
+    // language modeling head
+    // GPT2 uses the same matrix for the token embedding and the modeling head
+    x = wte.apply(x);
+    if (config.debug) {
+        x = new LogLayer({ name: 'lm_head_log' }).apply(x);
     }
     return tf.model({ inputs, outputs: x });
 }
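The weight sharing that `LMEmbedding` implements, sketched with raw tensors: a single (vocabSize, nEmbd) matrix serves both directions (sizes are toy values):

import * as tf from "@tensorflow/tfjs";

const vocabSize = 6, nEmbd = 4;
const wte = tf.randomNormal([vocabSize, nEmbd]);

// Embedding direction: token ids -> embeddings, (B, T) -> (B, T, nEmbd)
const tokens = tf.tensor2d([[1, 3, 2]], [1, 3], "int32");
const emb = tf.gather(wte, tokens);

// LM head direction: embeddings -> logits over the vocabulary,
// (B, T, nEmbd) x (vocabSize, nEmbd)^T -> (B, T, vocabSize)
const logits = tf.matMul(emb, wte.expandDims(0), false, true);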
package/dist/models/gpt/model.d.ts
CHANGED
@@ -16,7 +16,7 @@ export declare abstract class Dataset<T> {
  */
 export declare class GPTModel extends tf.LayersModel {
     protected readonly config: Required<GPTConfig>;
-    constructor(partialConfig?: GPTConfig
+    constructor(partialConfig?: Partial<GPTConfig>, layersModel?: tf.LayersModel);
     get getGPTConfig(): Required<GPTConfig>;
     compile(): void;
     fitDataset<T>(dataset: Dataset<T>, trainingArgs: tf.ModelFitDatasetArgs<T>): Promise<tf.History>;
package/dist/models/gpt/model.js
CHANGED
@@ -1,10 +1,10 @@
 import createDebug from "debug";
 import * as tf from '@tensorflow/tfjs';
-import { getModelSizes,
+import { getModelSizes, DefaultGPTConfig } from './config.js';
 import { getCustomAdam, clipByGlobalNormObj } from './optimizers.js';
 import evaluate from './evaluate.js';
 import { GPTArchitecture } from './layers.js';
-const debug = createDebug("discojs:models:gpt");
+const debug = createDebug("discojs:models:gpt:model");
 /**
  * GPTModel extends tf.LayersModel and overrides tfjs' default training loop
  *
@@ -13,7 +13,7 @@ export class GPTModel extends tf.LayersModel {
     config;
     constructor(partialConfig, layersModel) {
         // Fill missing config parameters with default values
-        let completeConfig = { ...
+        let completeConfig = { ...DefaultGPTConfig, ...partialConfig };
         // Add layer sizes depending on which model has been specified
         completeConfig = { ...completeConfig, ...getModelSizes(completeConfig.modelType) };
         if (layersModel !== undefined) {
@@ -112,7 +112,7 @@ export class GPTModel extends tf.LayersModel {
             tf.dispose([xs, ys]);
         }
         let logs = {
-            'loss': averageLoss / iteration,
+            'loss': averageLoss / (iteration - 1), // -1 because iteration got incremented at the end of the loop
             'acc': accuracyFraction[0] / accuracyFraction[1],
         };
         if (evalDataset !== undefined) {
package/dist/processing/index.js
CHANGED
@@ -33,11 +33,11 @@ export async function preprocess(task, dataset) {
             // cast as typescript doesn't reduce generic type
             const d = dataset;
             const t = task;
+            const contextLength = task.trainingInformation.contextLength;
             const tokenizer = await models.getTaskTokenizer(t);
-
-
-
-                .map((line) => processing.tokenizeAndLeftPad(line, tokenizer, totalTokenCount))
+            return d.map(text => processing.tokenize(tokenizer, text))
+                .flatten()
+                .batch(contextLength + 1, 1)
                 .map((tokens) => [tokens.pop(), tokens.last()]);
         }
     }
@@ -60,12 +60,11 @@ export async function preprocessWithoutLabel(task, dataset) {
             // cast as typescript doesn't reduce generic type
             const d = dataset;
             const t = task;
+            const contextLength = task.trainingInformation.contextLength;
             const tokenizer = await models.getTaskTokenizer(t);
-
-
-
-                .map((line) => processing.tokenizeAndLeftPad(line, tokenizer, totalTokenCount))
-                .map((tokens) => tokens.pop());
+            return d.map(text => processing.tokenize(tokenizer, text))
+                .flatten()
+                .batch(contextLength);
         }
     }
 }
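The effect of the new preprocessing pipeline, sketched with toy token ids and contextLength = 4 (so windows of 5 tokens overlapping by 1), reusing the `Dataset` class from earlier in this diff:

const stream = new Dataset([10, 11, 12, 13, 14, 15, 16, 17, 18]);

// Windows: [10..14] and [14..18]; each yields (inputs, next-token label):
// ([10, 11, 12, 13], 14) and ([14, 15, 16, 17], 18)
const pairs = stream
  .batch(4 + 1, 1)
  .map((window) => [window.pop(), window.last()]);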
package/dist/processing/text.d.ts
CHANGED
@@ -1,11 +1,21 @@
-import { List } from "immutable";
 import { PreTrainedTokenizer } from "@xenova/transformers";
-type
+import type { Text, TokenizedText } from '../index.js';
+interface TokenizingConfig {
+    padding?: boolean;
+    padding_side?: 'left' | 'right';
+    truncation?: boolean;
+    max_length?: number;
+}
 /**
- * Tokenize
+ * Tokenize one line of text.
+ * Wrapper around Transformers.js tokenizer to handle type checking and format the output.
+ * Note that Transformers.js's tokenizer can tokenize multiple lines of text at once
+ * but we are currently not making use of it. Can be useful when padding a batch
 *
- * @param
- * @
+ * @param tokenizer the tokenizer object
+ * @param text the text to tokenize
+ * @param config TokenizingConfig, the tokenizing parameters when using `tokenizer`
+ * @returns List<number> the tokenized text
 */
-export declare function
+export declare function tokenize(tokenizer: PreTrainedTokenizer, text: Text, config?: TokenizingConfig): TokenizedText;
 export {};
package/dist/processing/text.js
CHANGED
@@ -1,33 +1,36 @@
-import {
+import { List } from "immutable";
 function isArrayOfNumber(raw) {
     return Array.isArray(raw) && raw.every((e) => typeof e === "number");
 }
 /**
- * Tokenize
+ * Tokenize one line of text.
+ * Wrapper around Transformers.js tokenizer to handle type checking and format the output.
+ * Note that Transformers.js's tokenizer can tokenize multiple lines of text at once
+ * but we are currently not making use of it. Can be useful when padding a batch
 *
- * @param
- * @
+ * @param tokenizer the tokenizer object
+ * @param text the text to tokenize
+ * @param config TokenizingConfig, the tokenizing parameters when using `tokenizer`
+ * @returns List<number> the tokenized text
 */
-export function
-if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    throw new Error("tokenized returned more token than expected");
-    return Repeat(tokenizer.pad_token_id, paddingSize).concat(tokens).toList();
+export function tokenize(tokenizer, text, config) {
+    config = { ...config }; // create a config if undefined
+    if (config.padding || config.truncation) {
+        if (config.max_length === undefined)
+            throw new Error("max_length needs to be specified to use padding or truncation");
+        if (!Number.isInteger(config.max_length))
+            throw new Error("max_length should be an integer");
+    }
+    if (config.padding) {
+        // The padding side is set as an attribute, not in the config
+        tokenizer.padding_side = config.padding_side ?? 'left';
+        config.truncation = true; // for a single sequence, padding implies truncation to max_length
+    }
+    const tokenizerResult = tokenizer(text, { ...config, return_tensor: false });
+    if (typeof tokenizerResult !== "object" ||
+        tokenizerResult === null ||
+        !("input_ids" in tokenizerResult) ||
+        !isArrayOfNumber(tokenizerResult.input_ids))
+        throw new Error("tokenizer returned unexpected type");
+    return List(tokenizerResult.input_ids);
 }
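A usage sketch for the new `tokenize` wrapper, assuming it is imported from the package's processing module and a GPT-2 tokenizer is loaded through Transformers.js:

import { AutoTokenizer } from "@xenova/transformers";

const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2");

// Plain call: returns an immutable List of token ids.
const ids = tokenize(tokenizer, "Hello world");

// With padding, max_length is required; padding implies truncation and
// the padding side defaults to 'left'.
const padded = tokenize(tokenizer, "Hello world", { padding: true, max_length: 8 });
console.log(padded.toArray()); // 8 ids, left-padded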
package/dist/task/training_information.d.ts
CHANGED
@@ -31,7 +31,7 @@ interface DataTypeToTrainingInformation {
     text: {
         dataType: "text";
         tokenizer: string | PreTrainedTokenizer;
-
+        contextLength: number;
     };
 }
 export declare function isTrainingInformation(raw: unknown): raw is TrainingInformation<DataType>;
package/dist/task/training_information.js
CHANGED
@@ -94,16 +94,15 @@ export function isTrainingInformation(raw) {
             return true;
         }
         case "text": {
-            const {
+            const { contextLength, tokenizer, } = raw;
             if ((typeof tokenizer !== "string" &&
                 !(tokenizer instanceof PreTrainedTokenizer)) ||
-                (
-                typeof maxSequenceLength !== "number"))
+                (typeof contextLength !== "number"))
                 return false;
             const _ = {
                 ...repack,
                 dataType,
-
+                contextLength,
                 tokenizer,
             };
             return true;
package/dist/types/data_format.d.ts
CHANGED
@@ -1,5 +1,5 @@
 import { List } from "immutable";
-import type { Image, processing, Tabular, Text } from "../index.js";
+import type { Image, processing, Tabular, Text, TokenizedText } from "../index.js";
 /**
  * The data & label format goes through various stages.
  * Raw* is preprocessed into ModelEncoded.
@@ -29,7 +29,7 @@ type Token = number;
 export interface ModelEncoded {
     image: [image: processing.NormalizedImage<3>, label: number];
     tabular: [row: List<number>, number];
-    text: [line:
+    text: [line: TokenizedText, next: Token];
 }
 /** what gets outputted by the Validator, for humans */
 export interface Inferred {
package/dist/validator.js
CHANGED
@@ -13,7 +13,7 @@ export class Validator {
             .map(async (batch) => (await this.#model.predict(batch.map(([inputs, _]) => inputs)))
             .zip(batch.map(([_, outputs]) => outputs))
             .map(([inferred, truth]) => inferred === truth))
-            .
+            .flatten();
         for await (const e of results)
             yield e;
     }
@@ -22,7 +22,7 @@ export class Validator {
         const modelPredictions = (await processing.preprocessWithoutLabel(this.task, dataset))
             .batch(this.task.trainingInformation.batchSize)
             .map((batch) => this.#model.predict(batch))
-            .
+            .flatten();
         const predictions = await processing.postprocess(this.task, modelPredictions);
         for await (const e of predictions)
             yield e;