@epfml/discojs 3.0.1-p20241119093954.0 → 3.0.1-p20241206133538.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/client/client.js +2 -0
- package/dist/client/federated/federated_client.js +2 -2
- package/dist/dataset/dataset.d.ts +18 -5
- package/dist/dataset/dataset.js +58 -23
- package/dist/dataset/types.d.ts +1 -0
- package/dist/default_tasks/index.d.ts +1 -0
- package/dist/default_tasks/index.js +1 -0
- package/dist/default_tasks/tinder_dog.d.ts +2 -0
- package/dist/default_tasks/tinder_dog.js +72 -0
- package/dist/default_tasks/wikitext.js +5 -3
- package/dist/models/gpt/config.d.ts +11 -6
- package/dist/models/gpt/config.js +11 -7
- package/dist/models/gpt/index.d.ts +5 -9
- package/dist/models/gpt/index.js +36 -15
- package/dist/models/gpt/layers.js +260 -82
- package/dist/models/gpt/model.d.ts +1 -1
- package/dist/models/gpt/model.js +4 -4
- package/dist/processing/index.js +8 -9
- package/dist/processing/text.d.ts +16 -6
- package/dist/processing/text.js +29 -26
- package/dist/task/task_handler.js +5 -1
- package/dist/task/training_information.d.ts +1 -1
- package/dist/task/training_information.js +3 -4
- package/dist/training/disco.js +6 -3
- package/dist/types/data_format.d.ts +2 -2
- package/dist/validator.js +2 -2
- package/package.json +1 -1
package/dist/client/client.js
CHANGED
@@ -149,6 +149,8 @@ export class Client extends EventEmitter {
         }
         url.pathname += `tasks/${this.task.id}/model.json`;
         const response = await fetch(url);
+        if (!response.ok)
+            throw new Error(`fetch: HTTP status ${response.status}`);
         const encoded = new Uint8Array(await response.arrayBuffer());
         return await serialization.model.decode(encoded);
     }
package/dist/client/federated/federated_client.js
CHANGED
@@ -2,7 +2,7 @@ import createDebug from "debug";
 import { serialization } from "../../index.js";
 import { Client, shortenId } from "../client.js";
 import { type } from "../messages.js";
-import { waitMessage,
+import { waitMessage, WebSocketServer, } from "../event_connection.js";
 import * as messages from "./messages.js";
 const debug = createDebug("discojs:client:federated");
 /**
@@ -53,7 +53,7 @@ export class FederatedClient extends Client {
             type: type.ClientConnected,
         };
         this.server.send(msg);
-        const { id, waitForMoreParticipants, payload, round, nbOfParticipants } = await
+        const { id, waitForMoreParticipants, payload, round, nbOfParticipants } = await waitMessage(this.server, type.NewFederatedNodeInfo);
         // This should come right after receiving the message to make sure
         // we don't miss a subsequent message from the server
         // We check if the server is telling us to wait for more participants
package/dist/dataset/dataset.d.ts
CHANGED
@@ -25,15 +25,22 @@ export declare class Dataset<T> implements AsyncIterable<T> {
     * @param ratio between 0 (all on left) and 1 (all on right)
     */
    split(ratio: number): [Dataset<T>, Dataset<T>];
-    /**
+    /** Create batches of `size` elements with potential overlap.
+     * Last batch is smaller if dataset isn't perfectly divisible
+     *
+     * If overlap is set to a positive integer, the last `overlap` elements of a batch
+     * are the first `overlap` elements of the next batch.
     *
-     *
+     * This method is tailored to create text sequences where each token's label is the following token.
+     * In order to have a label for the last token of the input sequence, we include the first token
+     * of the next sequence (i.e. with an overlap of 1).
     *
     * @param size count of element per chunk
+     * @param overlap number of elements overlapping between two consecutive batches
     */
-    batch(size: number): Dataset<Batched<T>>;
-    /** Flatten
-
+    batch(size: number, overlap?: number): Dataset<Batched<T>>;
+    /** Flatten batches/arrays of elements */
+    flatten<U>(this: Dataset<DatasetLike<U>>): Dataset<U>;
    /** Join side-by-side
     *
     * Stops as soon as one runs out
@@ -41,6 +48,12 @@ export declare class Dataset<T> implements AsyncIterable<T> {
     * @param other right side
     **/
    zip<U>(other: Dataset<U> | DatasetLike<U>): Dataset<[T, U]>;
+    /**
+     * Repeat the dataset `times` times
+     * @param times number of times to repeat the dataset, if undefined, the dataset is repeated indefinitely
+     * @returns a dataset repeated `times` times
+     */
+    repeat(times?: number): Dataset<T>;
    /** Compute size
     *
     * This is a costly operation as we need to go through the whole Dataset.
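Note: a minimal usage sketch of the new overlap parameter (assuming the public Dataset export; the token values are made up):

    import { Dataset } from "@epfml/discojs";

    const tokens = new Dataset([1, 2, 3, 4, 5, 6, 7]);
    // With size=4 and overlap=1, consecutive batches share one element,
    // so every token in a batch has a following token to use as its label.
    for await (const seq of tokens.batch(4, 1))
        console.log(seq.toArray()); // [1, 2, 3, 4] then [4, 5, 6, 7]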
package/dist/dataset/dataset.js
CHANGED
@@ -1,6 +1,22 @@
 import createDebug from "debug";
 import { List, Range } from "immutable";
 const debug = createDebug("discojs:dataset");
+/** Convert a DatasetLike object to an async generator */
+async function* datasetLikeToGenerator(content) {
+    let iter;
+    if (typeof content === "function")
+        iter = content();
+    else if (Symbol.asyncIterator in content)
+        iter = content[Symbol.asyncIterator]();
+    else
+        iter = content[Symbol.iterator]();
+    while (true) {
+        const result = await iter.next();
+        if (result.done === true)
+            break;
+        yield result.value;
+    }
+}
 /** Immutable series of data */
 export class Dataset {
     #content;
@@ -11,19 +27,7 @@ export class Dataset {
     */
    constructor(content) {
        this.#content = async function* () {
-
-            if (typeof content === "function")
-                iter = content();
-            else if (Symbol.asyncIterator in content)
-                iter = content[Symbol.asyncIterator]();
-            else
-                iter = content[Symbol.iterator]();
-            while (true) {
-                const result = await iter.next();
-                if (result.done === true)
-                    break;
-                yield result.value;
-            }
+            yield* datasetLikeToGenerator(content);
        };
    }
    [Symbol.asyncIterator]() {
@@ -87,19 +91,31 @@ export class Dataset {
        }.bind(this)),
        ];
    }
-    /**
+    /** Create batches of `size` elements with potential overlap.
+     * Last batch is smaller if dataset isn't perfectly divisible
+     *
+     * If overlap is set to a positive integer, the last `overlap` elements of a batch
+     * are the first `overlap` elements of the next batch.
     *
-     *
+     * This method is tailored to create text sequences where each token's label is the following token.
+     * In order to have a label for the last token of the input sequence, we include the first token
+     * of the next sequence (i.e. with an overlap of 1).
     *
     * @param size count of element per chunk
+     * @param overlap number of elements overlapping between two consecutive batches
     */
-    batch(size) {
+    batch(size, overlap = 0) {
        if (size <= 0 || !Number.isInteger(size))
            throw new Error("invalid size");
+        if (overlap >= size || !Number.isInteger(overlap))
+            throw new Error("invalid overlap");
        return new Dataset(async function* () {
            const iter = this[Symbol.asyncIterator]();
+            let overlapped = List();
            for (;;) {
-                const batch = List(
+                const batch = List(
+                // get the first elements of the next batch
+                await Promise.all(Range(overlapped.size, size).map(() => iter.next()))).flatMap((res) => {
                    if (res.done)
                        return [];
                    else
@@ -107,18 +123,21 @@ export class Dataset {
                });
                if (batch.isEmpty())
                    break;
-                yield batch
+                // yield the current batch with the first elements of the next batch
+                yield overlapped.concat(batch);
+                overlapped = batch.takeLast(overlap);
                // iterator couldn't generate more
-                if (batch.size < size)
+                if (batch.size < size - overlap)
                    break;
            }
        }.bind(this));
    }
-    /** Flatten
-
+    /** Flatten batches/arrays of elements */
+    flatten() {
        return new Dataset(async function* () {
-            for await (const batch of this)
-                yield* batch;
+            for await (const batch of this) {
+                yield* datasetLikeToGenerator(batch);
+            }
        }.bind(this));
    }
    /** Join side-by-side
@@ -141,6 +160,22 @@ export class Dataset {
            }
        }.bind(this));
    }
+    /**
+     * Repeat the dataset `times` times
+     * @param times number of times to repeat the dataset, if undefined, the dataset is repeated indefinitely
+     * @returns a dataset repeated `times` times
+     */
+    repeat(times) {
+        if (times !== undefined && (!Number.isInteger(times) || times < 1))
+            throw new Error("times needs to be a positive integer or undefined");
+        return new Dataset(async function* () {
+            let loop = 0;
+            do {
+                yield* this;
+                loop++;
+            } while (times === undefined || loop < times);
+        }.bind(this));
+    }
    /** Compute size
     *
     * This is a costly operation as we need to go through the whole Dataset.
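Note: a short sketch of the new repeat and flatten behaviors (same assumed Dataset export; values are made up):

    const pair = new Dataset([1, 2]);
    // repeat(2) re-runs the underlying generator twice: 1, 2, 1, 2.
    // Calling repeat() with no argument loops indefinitely, e.g. to
    // keep a training stream alive across many epochs.
    for await (const x of pair.repeat(2)) console.log(x);

    // flatten() now routes each element through datasetLikeToGenerator,
    // so batches can be any DatasetLike, e.g. plain arrays: yields 1, 2, 3.
    for await (const x of new Dataset([[1], [2, 3]]).flatten()) console.log(x);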
package/dist/default_tasks/tinder_dog.js
ADDED
@@ -0,0 +1,72 @@
+import * as tf from '@tensorflow/tfjs';
+import { models } from '../index.js';
+export const tinderDog = {
+    getTask() {
+        return {
+            id: 'tinder_dog',
+            displayInformation: {
+                taskTitle: 'GDHF 2024 | TinderDog',
+                summary: {
+                    preview: 'Which dog is the cutest....or not?',
+                    overview: "Binary classification model for dog cuteness."
+                },
+                model: 'The model is a simple Convolutional Neural Network composed of two convolutional layers with ReLU activations and max pooling layers, followed by a fully connected output layer. The data preprocessing reshapes images into 64x64 pixels and normalizes values between 0 and 1',
+                dataFormatInformation: 'Accepted image formats are .png .jpg and .jpeg.',
+                dataExampleText: '',
+                dataExampleImage: 'https://storage.googleapis.com/deai-313515.appspot.com/tinder_dog_preview.png',
+                sampleDatasetLink: 'https://storage.googleapis.com/deai-313515.appspot.com/tinder_dog.zip',
+                sampleDatasetInstructions: 'Opening the link should start downloading a zip file which you can unzip. To connect the data, pick one of the data splits (the folder 0 for example) and use the CSV option below to select the file named "labels.csv". You can now connect the images located in the same folder.'
+            },
+            trainingInformation: {
+                epochs: 10,
+                roundDuration: 2,
+                validationSplit: 0, // nicer plot for GDHF demo
+                batchSize: 10,
+                dataType: 'image',
+                IMAGE_H: 64,
+                IMAGE_W: 64,
+                LABEL_LIST: ['Cute dogs', 'Less cute dogs'],
+                scheme: 'federated',
+                aggregationStrategy: 'mean',
+                minNbOfParticipants: 3,
+                tensorBackend: 'tfjs'
+            }
+        };
+    },
+    async getModel() {
+        const seed = 42; // set a seed to ensure reproducibility during GDHF demo
+        const imageHeight = this.getTask().trainingInformation.IMAGE_H;
+        const imageWidth = this.getTask().trainingInformation.IMAGE_W;
+        const imageChannels = 3;
+        const model = tf.sequential();
+        model.add(tf.layers.conv2d({
+            inputShape: [imageHeight, imageWidth, imageChannels],
+            kernelSize: 5,
+            filters: 8,
+            activation: 'relu',
+            kernelInitializer: tf.initializers.heNormal({ seed })
+        }));
+        model.add(tf.layers.conv2d({
+            kernelSize: 5, filters: 16, activation: 'relu',
+            kernelInitializer: tf.initializers.heNormal({ seed })
+        }));
+        model.add(tf.layers.maxPooling2d({ poolSize: 2, strides: 2 }));
+        model.add(tf.layers.dropout({ rate: 0.25, seed }));
+        model.add(tf.layers.flatten());
+        model.add(tf.layers.dense({
+            units: 32, activation: 'relu',
+            kernelInitializer: tf.initializers.heNormal({ seed })
+        }));
+        model.add(tf.layers.dropout({ rate: 0.25, seed }));
+        model.add(tf.layers.dense({
+            units: 2, activation: 'softmax',
+            kernelInitializer: tf.initializers.heNormal({ seed })
+        }));
+        model.compile({
+            optimizer: tf.train.adam(0.0005),
+            loss: 'categoricalCrossentropy',
+            metrics: ['accuracy']
+        });
+        return Promise.resolve(new models.TFJS('image', model));
+    }
+};
package/dist/default_tasks/wikitext.js
CHANGED
@@ -31,14 +31,16 @@ export const wikitext = {
                // But if set to 0 then the webapp doesn't display the validation metrics
                validationSplit: 0.1,
                roundDuration: 2,
-                batchSize:
+                batchSize: 8, // If set too high firefox raises a WebGL error
                tokenizer: 'Xenova/gpt2',
-
+                contextLength: 64,
                tensorBackend: 'gpt'
            }
        };
    },
    getModel() {
-        return Promise.resolve(new models.GPT(
+        return Promise.resolve(new models.GPT({
+            contextLength: this.getTask().trainingInformation.contextLength,
+        }));
    }
};
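Note: with this change the context length flows from the task definition into the model instead of relying on the GPT default. A minimal sketch of the equivalent direct construction (assuming the public models export):

    import { models } from "@epfml/discojs";

    // Only contextLength is overridden; every other field falls back to
    // DefaultGPTConfig (e.g. nLayer: 3, vocabSize: 50257, see config.js below).
    const gpt = new models.GPT({ contextLength: 64 });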
package/dist/models/gpt/config.d.ts
CHANGED
@@ -1,8 +1,8 @@
 type GPTModelType = 'gpt2' | 'gpt2-medium' | 'gpt2-large' | 'gpt2-xl' | 'gpt-mini' | 'gpt-micro' | 'gpt-nano';
 export interface GPTConfig {
     lr: number;
-
-    vocabSize
+    contextLength: number;
+    vocabSize?: number;
     modelType: GPTModelType;
     name?: string;
     evaluate?: boolean;
@@ -11,22 +11,27 @@ export interface GPTConfig {
     maxIter?: number;
     weightDecay?: number;
     verbose?: 0 | 1;
-    bias?: boolean;
     debug?: boolean;
     dropout?: number;
     residDrop?: number;
     embdDrop?: number;
-    tokEmb?: boolean;
-    lmHead?: boolean;
     nLayer?: number;
     nHead?: number;
     nEmbd?: number;
+    seed?: number;
 }
-export declare const
+export declare const DefaultGPTConfig: Required<GPTConfig>;
 export type ModelSize = {
     nLayer: number;
     nHead: number;
     nEmbd: number;
 };
 export declare function getModelSizes(modelType: GPTModelType): Required<ModelSize>;
+export interface GenerationConfig {
+    doSample: boolean;
+    temperature: number;
+    topk: number;
+    seed: number;
+}
+export declare const DefaultGenerationConfig: Required<GenerationConfig>;
 export {};
package/dist/models/gpt/config.js
CHANGED
@@ -1,6 +1,6 @@
 // for a benchmark of performance, see https://github.com/epfml/disco/pull/659
-export const
-    name: 'transformer',
+export const DefaultGPTConfig = {
+    name: 'transformer', // prefix for the model layer names
     lr: 0.001,
     weightDecay: 0,
     maxIter: 10,
@@ -9,18 +9,16 @@ export const DEFAULT_CONFIG = {
     evaluate: true,
     maxEvalBatches: 12,
     evaluateEvery: 100,
-
-    vocabSize:
-    bias: true,
+    contextLength: 128,
+    vocabSize: 50257,
     debug: false,
     dropout: 0.2,
     residDrop: 0.2,
     embdDrop: 0.2,
-    tokEmb: true,
-    lmHead: true,
     nLayer: 3,
     nHead: 3,
     nEmbd: 48,
+    seed: Math.random(),
 };
 export function getModelSizes(modelType) {
     switch (modelType) {
@@ -40,3 +38,9 @@ export function getModelSizes(modelType) {
         return { nLayer: 3, nHead: 3, nEmbd: 48 };
     }
 }
+export const DefaultGenerationConfig = {
+    temperature: 1.0,
+    doSample: false,
+    seed: Math.random(),
+    topk: 50
+};
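Note: a sketch of how these generation defaults are meant to combine with user options, mirroring the Object.assign call in index.js below (the relative import assumes code living next to config.js; field names are from GenerationConfig):

    import { DefaultGenerationConfig } from "./config.js";

    // User options override defaults field by field; anything omitted
    // (topk: 50, seed) keeps its default. The default seed is drawn once
    // with Math.random() when the module loads.
    const config = { ...DefaultGenerationConfig, doSample: true, temperature: 0.8 };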
package/dist/models/gpt/index.d.ts
CHANGED
@@ -1,23 +1,20 @@
 /**
- *
+ * Source: https://github.com/zemlyansky/gpt-tfjs and https://github.com/karpathy/build-nanogpt
+ * With modifications from @peacefulotter, @lukemovement and the Disco team
 **/
 import * as tf from "@tensorflow/tfjs";
 import type { Batched, Dataset, DataFormat } from "../../index.js";
 import { WeightsContainer } from "../../index.js";
 import { BatchLogs, Model, EpochLogs } from "../index.js";
-import {
+import type { GPTConfig, GenerationConfig } from './config.js';
 export type GPTSerialization = {
     weights: WeightsContainer;
     config?: GPTConfig;
 };
-interface PredictConfig {
-    temperature: number;
-    doSample: boolean;
-}
 export declare class GPT extends Model<"text"> {
     #private;
     private readonly model;
-    constructor(partialConfig?: GPTConfig
+    constructor(partialConfig?: Partial<GPTConfig>, layersModel?: tf.LayersModel);
     /**
     * The GPT train methods wraps the model.fitDataset call in a for loop to act as a generator (of logs)
     * This allows for getting logs and stopping training without callbacks.
@@ -28,7 +25,7 @@ export declare class GPT extends Model<"text"> {
     * @param tracker
     */
    train(trainingDataset: Dataset<Batched<DataFormat.ModelEncoded["text"]>>, validationDataset?: Dataset<Batched<DataFormat.ModelEncoded["text"]>>): AsyncGenerator<BatchLogs, EpochLogs>;
-    predict(batch: Batched<DataFormat.ModelEncoded["text"][0]>, options?: Partial<
+    predict(batch: Batched<DataFormat.ModelEncoded["text"][0]>, options?: Partial<GenerationConfig>): Promise<Batched<DataFormat.ModelEncoded["text"][1]>>;
     get config(): Required<GPTConfig>;
     get weights(): WeightsContainer;
     set weights(ws: WeightsContainer);
@@ -37,4 +34,3 @@ export declare class GPT extends Model<"text"> {
     extract(): tf.LayersModel;
     [Symbol.dispose](): void;
 }
-export {};
package/dist/models/gpt/index.js
CHANGED
@@ -1,5 +1,6 @@
 /**
- *
+ * Source: https://github.com/zemlyansky/gpt-tfjs and https://github.com/karpathy/build-nanogpt
+ * With modifications from @peacefulotter, @lukemovement and the Disco team
 **/
 import createDebug from "debug";
 import { List, Range } from "immutable";
@@ -7,12 +8,12 @@ import * as tf from "@tensorflow/tfjs";
 import { WeightsContainer } from "../../index.js";
 import { Model, EpochLogs } from "../index.js";
 import { GPTModel } from "./model.js";
-import { DEFAULT_CONFIG } from "./config.js";
 import evaluate from "./evaluate.js";
+import { DefaultGPTConfig, DefaultGenerationConfig } from './config.js';
 const debug = createDebug("discojs:models:gpt");
 export class GPT extends Model {
     model;
-    #
+    #contextLength;
     #maxBatchCount;
     #vocabSize;
     constructor(partialConfig, layersModel) {
@@ -20,9 +21,9 @@ export class GPT extends Model {
         const model = new GPTModel(partialConfig, layersModel);
         model.compile();
         this.model = model;
-        this.#
-        this.#maxBatchCount = partialConfig?.maxIter ??
-        this.#vocabSize = partialConfig?.vocabSize ??
+        this.#contextLength = partialConfig?.contextLength ?? DefaultGPTConfig.contextLength;
+        this.#maxBatchCount = partialConfig?.maxIter ?? DefaultGPTConfig.maxIter;
+        this.#vocabSize = partialConfig?.vocabSize ?? DefaultGPTConfig.vocabSize;
     }
     /**
     * The GPT train methods wraps the model.fitDataset call in a for loop to act as a generator (of logs)
@@ -85,16 +86,21 @@ export class GPT extends Model {
         }));
     }
     async predict(batch, options) {
-
-
-            doSample: false,
-            ...options,
-        };
+        // overwrite default with user config
+        const config = Object.assign({}, DefaultGenerationConfig, options);
         return List(await Promise.all(batch.map((tokens) => this.#predictSingle(tokens, config))));
     }
+    /**
+     * Generate the next token after the input sequence.
+     * In other words, takes an input tensor of shape (prompt length T) and returns a tensor of shape (T+1)
+     *
+     * @param token input tokens of shape (T,). T is truncated to the model's context length
+     * @param config generation config: temperature, doSample, topk
+     * @returns the next token predicted by the model
+     */
     async #predictSingle(tokens, config) {
         // slice input tokens if longer than context length
-        tokens = tokens.slice(-this.#
+        tokens = tokens.slice(-this.#contextLength);
         const input = tf.tidy(() => tf.tensor1d(tokens.toArray(), "int32").expandDims(0));
         const logits = tf.tidy(() => {
             const output = this.model.predict(input);
@@ -111,9 +117,24 @@ export class GPT extends Model {
             .div(config.temperature)
             .softmax());
         logits.dispose();
-        const next = tf.tidy(() =>
-
-
+        const next = tf.tidy(() => {
+            if (config.doSample) {
+                // returns topk biggest values among the `vocab_size` probabilities and the corresponding tokens indices
+                // both shapes are (config.topk,)
+                const { values: topkProbs, indices: topkTokens } = tf.topk(probs, config.topk);
+                // sample an index from the top-k probabilities
+                // e.g. [[0.1, 0.4, 0.3], [0.1, 0.2, 0.5]] -> [[1], [2]]
+                // note: multinomial does not need the input to sum to 1
+                const selectedIndices = tf.multinomial(topkProbs, 1, config.seed, false); // (B, )
+                // return the corresponding token from the sampled indices (one per sequence in the batch).
+                // if for some reason the probabilities are NaN, selectedIndices will be out of bounds
+                return topkTokens.gather(selectedIndices).squeeze([0]); // (1)
+            }
+            else {
+                // greedy decoding: return the token with the highest probability
+                return probs.argMax();
+            }
+        });
         probs.dispose();
         const ret = await next.array();
         next.dispose();
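Note: a self-contained sketch of the top-k sampling step added above, using the same tf.topk / tf.multinomial / gather chain on made-up probabilities:

    import * as tf from "@tensorflow/tfjs";

    const probs = tf.tensor1d([0.05, 0.6, 0.3, 0.05]);
    // Keep the 2 most likely tokens and their vocabulary indices.
    const { values: topkProbs, indices: topkTokens } = tf.topk(probs, 2);
    // Sample one position among the top-k (the input need not sum to 1),
    // then map it back to a vocabulary token id.
    const sampled = tf.multinomial(topkProbs, 1, 42, false);
    const next = topkTokens.gather(sampled);
    next.array().then((t) => console.log(t)); // e.g. [1]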