@epfml/discojs 3.0.1-p20240820135253.0 → 3.0.1-p20240822103944.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/dist/dataset/data/data.d.ts +6 -7
  2. package/dist/dataset/data/data.js +12 -7
  3. package/dist/dataset/data/helpers.d.ts +10 -0
  4. package/dist/dataset/data/helpers.js +97 -0
  5. package/dist/dataset/data/image_data.d.ts +3 -3
  6. package/dist/dataset/data/image_data.js +7 -2
  7. package/dist/dataset/data/index.d.ts +0 -1
  8. package/dist/dataset/data/preprocessing/text_preprocessing.js +23 -9
  9. package/dist/dataset/data/tabular_data.d.ts +3 -3
  10. package/dist/dataset/data/text_data.d.ts +3 -3
  11. package/dist/dataset/dataset.d.ts +48 -5
  12. package/dist/dataset/dataset.js +155 -1
  13. package/dist/dataset/image.d.ts +14 -0
  14. package/dist/dataset/image.js +21 -0
  15. package/dist/dataset/index.d.ts +3 -5
  16. package/dist/dataset/index.js +3 -3
  17. package/dist/dataset/types.d.ts +4 -0
  18. package/dist/dataset/types.js +2 -0
  19. package/dist/index.d.ts +4 -0
  20. package/dist/index.js +4 -0
  21. package/dist/models/gpt/model.js +2 -0
  22. package/dist/models/model.d.ts +1 -2
  23. package/dist/models/tfjs.d.ts +4 -4
  24. package/dist/models/tfjs.js +2 -1
  25. package/dist/processing.d.ts +35 -0
  26. package/dist/processing.js +89 -0
  27. package/dist/training/disco.d.ts +7 -7
  28. package/dist/training/disco.js +21 -19
  29. package/dist/types.d.ts +3 -0
  30. package/dist/types.js +1 -0
  31. package/dist/validation/validator.d.ts +7 -23
  32. package/dist/validation/validator.js +99 -105
  33. package/package.json +1 -1
  34. package/dist/dataset/data_loader/data_loader.d.ts +0 -13
  35. package/dist/dataset/data_loader/data_loader.js +0 -2
  36. package/dist/dataset/data_loader/image_loader.d.ts +0 -21
  37. package/dist/dataset/data_loader/image_loader.js +0 -101
  38. package/dist/dataset/data_loader/index.d.ts +0 -5
  39. package/dist/dataset/data_loader/index.js +0 -4
  40. package/dist/dataset/data_loader/tabular_loader.d.ts +0 -35
  41. package/dist/dataset/data_loader/tabular_loader.js +0 -76
  42. package/dist/dataset/data_loader/text_loader.d.ts +0 -14
  43. package/dist/dataset/data_loader/text_loader.js +0 -25
  44. package/dist/dataset/dataset_builder.d.ts +0 -51
  45. package/dist/dataset/dataset_builder.js +0 -118
@@ -1,101 +0,0 @@
1
- import { Range } from 'immutable';
2
- import * as tf from '@tensorflow/tfjs';
3
- import { ImageData } from '../data/index.js';
4
- import { DataLoader } from '../data_loader/index.js';
5
- /**
6
- * Image data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
7
- * @epfml/discojs-web and @epfml/discojs-node.
8
- * Load labels and correctly match them with their respective images, with the following constraints:
9
- * 1. Images are given as 1 image/1 file;
10
- * 2. Labels are given as multiple labels/1 file, each label file can contain a different amount of labels.
11
- */
12
- export class ImageLoader extends DataLoader {
13
- task;
14
- constructor(task) {
15
- super();
16
- this.task = task;
17
- }
18
- async load(image, config) {
19
- let tensorContainer;
20
- if (config?.labels === undefined) {
21
- tensorContainer = await this.readImageFrom(image, config?.channels);
22
- }
23
- else {
24
- tensorContainer = {
25
- xs: await this.readImageFrom(image, config?.channels),
26
- ys: config.labels[0]
27
- };
28
- }
29
- return tf.data.array([tensorContainer]);
30
- }
31
- async buildDataset(images, labels, indices, config) {
32
- // Can't use arrow function for generator and need access to 'this'
33
- // eslint-disable-next-line
34
- const self = this;
35
- async function* dataGenerator() {
36
- const withLabels = config?.labels !== undefined;
37
- let index = 0;
38
- while (index < indices.length) {
39
- const sample = await self.readImageFrom(images[indices[index]], config?.channels);
40
- const label = withLabels ? labels[indices[index]] : undefined;
41
- const value = withLabels ? { xs: sample, ys: label } : sample;
42
- index++;
43
- yield value;
44
- }
45
- }
46
- // @ts-expect-error: For some reason TypeScript refuses async generators, but TensorFlow does work with them
47
- const dataset = tf.data.generator(dataGenerator);
48
- return await ImageData.init(dataset, this.task, indices.length);
49
- }
50
- async loadAll(images, config) {
51
- let labels = [];
52
- const indices = Range(0, images.length).toArray();
53
- if (config?.labels !== undefined) {
54
- const labelList = this.task.trainingInformation?.LABEL_LIST;
55
- if (labelList === undefined || !Array.isArray(labelList)) {
56
- throw new Error('LABEL_LIST should be specified in the task training information');
57
- }
58
- const numberOfClasses = labelList.length;
59
- // Map label strings to integer
60
- const label_to_int = new Map(labelList.map((label_name, idx) => [label_name, idx]));
61
- if (label_to_int.size !== numberOfClasses) {
62
- throw new Error("Input labels aren't matching the task LABEL_LIST");
63
- }
64
- labels = config.labels.map(label_name => {
65
- const label_int = label_to_int.get(label_name);
66
- if (label_int === undefined) {
67
- throw new Error(`Found input label ${label_name} not specified in task LABEL_LIST`);
68
- }
69
- return label_int;
70
- });
71
- labels = await tf.oneHot(tf.tensor1d(labels, 'int32'), numberOfClasses).array();
72
- }
73
- if (config?.shuffle === undefined || config?.shuffle) {
74
- this.shuffle(indices);
75
- }
76
- if (config?.validationSplit === undefined || config?.validationSplit === 0) {
77
- const dataset = await this.buildDataset(images, labels, indices, config);
78
- return {
79
- train: dataset,
80
- validation: undefined
81
- };
82
- }
83
- const trainSize = Math.floor(images.length * (1 - config.validationSplit));
84
- const trainIndices = indices.slice(0, trainSize);
85
- const valIndices = indices.slice(trainSize);
86
- const trainDataset = await this.buildDataset(images, labels, trainIndices, config);
87
- const valDataset = await this.buildDataset(images, labels, valIndices, config);
88
- return {
89
- train: trainDataset,
90
- validation: valDataset
91
- };
92
- }
93
- shuffle(array) {
94
- for (let i = 0; i < array.length; i++) {
95
- const j = Math.floor(Math.random() * i);
96
- const swap = array[i];
97
- array[i] = array[j];
98
- array[j] = swap;
99
- }
100
- }
101
- }
@@ -1,5 +0,0 @@
1
- export type { DataConfig } from './data_loader.js';
2
- export { DataLoader } from './data_loader.js';
3
- export { ImageLoader } from './image_loader.js';
4
- export { TabularLoader } from './tabular_loader.js';
5
- export { TextLoader } from './text_loader.js';
@@ -1,4 +0,0 @@
1
- export { DataLoader } from './data_loader.js';
2
- export { ImageLoader } from './image_loader.js';
3
- export { TabularLoader } from './tabular_loader.js';
4
- export { TextLoader } from './text_loader.js';
@@ -1,35 +0,0 @@
1
- import type { Task } from '../../index.js';
2
- import type { Dataset, DataSplit } from '../index.js';
3
- import type { DataConfig } from './index.js';
4
- import { DataLoader } from './index.js';
5
- /**
6
- * Tabular data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
7
- * @epfml/discojs-web and @epfml/discojs-node. Loads data from files whose entries are line-separated and consist of
8
- * character-separated features and label(s). Such files typically have the .csv extension.
9
- */
10
- export declare abstract class TabularLoader<Source> extends DataLoader<Source> {
11
- private readonly task;
12
- readonly delimiter: string;
13
- constructor(task: Task, delimiter?: string);
14
- /**
15
- * Creates a CSV dataset object based off the given source.
16
- * @param source File object, URL string or local file system path.
17
- * @param csvConfig Object expected by TF.js to create a CSVDataset.
18
- * @returns The CSVDataset object built upon the given source.
19
- */
20
- abstract loadDatasetFrom(source: Source, csvConfig: Record<string, unknown>): Promise<Dataset>;
21
- /**
22
- * Expects delimiter-separated tabular data made of N columns. The data may be
23
- * potentially split among several sources. Every source should contain N-1
24
- * feature columns and 1 single label column.
25
- * @param source List of File objects, URLs or file system paths.
26
- * @param config
27
- * @returns A TF.js dataset built upon read tabular data stored in the given sources.
28
- */
29
- load(source: Source, config?: DataConfig): Promise<Dataset>;
30
- /**
31
- * Creates the CSV datasets based off the given sources, then fuses them into a single CSV
32
- * dataset.
33
- */
34
- loadAll(sources: Source[], config: DataConfig): Promise<DataSplit>;
35
- }
@@ -1,76 +0,0 @@
1
- import { List, Map, Set } from 'immutable';
2
- import { TabularData } from '../index.js';
3
- import { DataLoader } from './index.js';
4
- // Window size from which the dataset shuffling will sample
5
- const BUFFER_SIZE = 1000;
6
- /**
7
- * Tabular data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
8
- * @epfml/discojs-web and @epfml/discojs-node. Loads data from files whose entries are line-separated and consist of
9
- * character-separated features and label(s). Such files typically have the .csv extension.
10
- */
11
- export class TabularLoader extends DataLoader {
12
- task;
13
- delimiter;
14
- constructor(task, delimiter = ',') {
15
- super();
16
- this.task = task;
17
- this.delimiter = delimiter;
18
- }
19
- /**
20
- * Expects delimiter-separated tabular data made of N columns. The data may be
21
- * potentially split among several sources. Every source should contain N-1
22
- * feature columns and 1 single label column.
23
- * @param source List of File objects, URLs or file system paths.
24
- * @param config
25
- * @returns A TF.js dataset built upon read tabular data stored in the given sources.
26
- */
27
- async load(source, config) {
28
- /**
29
- * Prepare the CSV config object based off the given features and labels.
30
- * If labels is empty, then the returned dataset is comprised of samples only.
31
- * Otherwise, each entry is of the form `{ xs, ys }` with `xs` as features and `ys`
32
- * as labels.
33
- */
34
- if (config?.features === undefined) {
35
- // TODO @s314cy
36
- throw new Error('Not implemented');
37
- }
38
- const columnConfigs = Map(Set(config.features).map((feature) => [feature, { required: false, isLabel: false }])).merge(Set(config.labels).map((label) => [label, { required: true, isLabel: true }]));
39
- const csvConfig = {
40
- hasHeader: true,
41
- columnConfigs: columnConfigs.toObject(),
42
- configuredColumnsOnly: true,
43
- delimiter: this.delimiter
44
- };
45
- const dataset = (await this.loadDatasetFrom(source, csvConfig)).map((t) => {
46
- if (typeof t === 'object') {
47
- if (('xs' in t) && ('ys' in t)) {
48
- const { xs, ys } = t;
49
- return {
50
- xs: Object.values(xs),
51
- ys: Object.values(ys)
52
- };
53
- }
54
- else {
55
- return t;
56
- }
57
- }
58
- throw new TypeError('Expected TensorContainerObject');
59
- });
60
- return (config?.shuffle === undefined || config?.shuffle) ? dataset.shuffle(BUFFER_SIZE) : dataset;
61
- }
62
- /**
63
- * Creates the CSV datasets based off the given sources, then fuses them into a single CSV
64
- * dataset.
65
- */
66
- async loadAll(sources, config) {
67
- const datasets = await Promise.all(sources.map(async (source) => await this.load(source, { ...config, shuffle: false })));
68
- let dataset = List(datasets).reduce((acc, dataset) => acc.concatenate(dataset));
69
- dataset = config?.shuffle === true ? dataset.shuffle(BUFFER_SIZE) : dataset;
70
- const data = await TabularData.init(dataset, this.task);
71
- // TODO: Implement validation split for tabular data (tricky due to streaming)
72
- return {
73
- train: data
74
- };
75
- }
76
- }
@@ -1,14 +0,0 @@
1
- import type { Task } from '../../index.js';
2
- import type { DataSplit, Dataset } from '../index.js';
3
- import { DataLoader, DataConfig } from './index.js';
4
- /**
5
- * Text data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
6
- * @epfml/discojs-web and @epfml/discojs-node.
7
- */
8
- export declare abstract class TextLoader<S> extends DataLoader<S> {
9
- private readonly task;
10
- constructor(task: Task);
11
- abstract loadDatasetFrom(source: S): Promise<Dataset>;
12
- load(source: S, config?: DataConfig): Promise<Dataset>;
13
- loadAll(sources: S[], config?: DataConfig): Promise<DataSplit>;
14
- }
@@ -1,25 +0,0 @@
1
- import { TextData } from '../index.js';
2
- import { DataLoader } from './index.js';
3
- /**
4
- * Text data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
5
- * @epfml/discojs-web and @epfml/discojs-node.
6
- */
7
- export class TextLoader extends DataLoader {
8
- task;
9
- constructor(task) {
10
- super();
11
- this.task = task;
12
- }
13
- async load(source, config) {
14
- const dataset = await this.loadDatasetFrom(source);
15
- // 1st arg: Stream shuffling buffer size
16
- return (config?.shuffle === undefined || config?.shuffle) ? dataset.shuffle(1000, undefined, true) : dataset;
17
- }
18
- async loadAll(sources, config) {
19
- const concatenated = (await Promise.all(sources.map(async (src) => await this.load(src, config))))
20
- .reduce((acc, dataset) => acc.concatenate(dataset));
21
- return {
22
- train: await TextData.init(concatenated, this.task)
23
- };
24
- }
25
- }
@@ -1,51 +0,0 @@
1
- import type { Task } from '../index.js';
2
- import type { DataSplit } from './data/index.js';
3
- import type { DataConfig, DataLoader } from './data_loader/data_loader.js';
4
- /**
5
- * Incrementally builds a dataset from the provided file sources. The sources may
6
- * either be file blobs or regular file system paths.
7
- */
8
- export declare class DatasetBuilder<Source> {
9
- /**
10
- * The data loader used to load the data contained in the provided files.
11
- */
12
- private readonly dataLoader;
13
- /**
14
- * The task for which the dataset should be built.
15
- */
16
- readonly task: Task;
17
- /**
18
- * The buffer of unlabelled file sources.
19
- */
20
- private _unlabeledSources;
21
- /**
22
- * The buffer of labelled file sources.
23
- */
24
- private _labeledSources;
25
- constructor(
26
- /**
27
- * The data loader used to load the data contained in the provided files.
28
- */
29
- dataLoader: DataLoader<Source>,
30
- /**
31
- * The task for which the dataset should be built.
32
- */
33
- task: Task);
34
- /**
35
- * Adds the given file sources to the builder's buffer. Sources may be provided a label in the case
36
- * of supervised learning.
37
- * @param sources The array of file sources
38
- * @param label The file sources label
39
- */
40
- addFiles(sources: Source[], label?: string): void;
41
- /**
42
- * Clears the file sources buffers. If a label is provided, only the file sources
43
- * corresponding to the given label will be removed.
44
- * @param label The file sources label
45
- */
46
- clearFiles(label?: string): void;
47
- private getLabels;
48
- build(config?: DataConfig): Promise<DataSplit>;
49
- get size(): number;
50
- get sources(): Source[];
51
- }
@@ -1,118 +0,0 @@
1
- import { Map } from 'immutable';
2
- /**
3
- * Incrementally builds a dataset from the provided file sources. The sources may
4
- * either be file blobs or regular file system paths.
5
- */
6
- export class DatasetBuilder {
7
- dataLoader;
8
- task;
9
- /**
10
- * The buffer of unlabelled file sources.
11
- */
12
- _unlabeledSources;
13
- /**
14
- * The buffer of labelled file sources.
15
- */
16
- _labeledSources;
17
- constructor(
18
- /**
19
- * The data loader used to load the data contained in the provided files.
20
- */
21
- dataLoader,
22
- /**
23
- * The task for which the dataset should be built.
24
- */
25
- task) {
26
- this.dataLoader = dataLoader;
27
- this.task = task;
28
- this._unlabeledSources = [];
29
- // Map from label to sources
30
- this._labeledSources = Map();
31
- }
32
- /**
33
- * Adds the given file sources to the builder's buffer. Sources may be provided a label in the case
34
- * of supervised learning.
35
- * @param sources The array of file sources
36
- * @param label The file sources label
37
- */
38
- addFiles(sources, label) {
39
- if (label === undefined) {
40
- this._unlabeledSources = this._unlabeledSources.concat(sources);
41
- }
42
- else {
43
- const currentSources = this._labeledSources.get(label);
44
- if (currentSources === undefined) {
45
- this._labeledSources = this._labeledSources.set(label, sources);
46
- }
47
- else {
48
- this._labeledSources = this._labeledSources.set(label, currentSources.concat(sources));
49
- }
50
- }
51
- }
52
- /**
53
- * Clears the file sources buffers. If a label is provided, only the file sources
54
- * corresponding to the given label will be removed.
55
- * @param label The file sources label
56
- */
57
- clearFiles(label) {
58
- if (label === undefined) {
59
- this._unlabeledSources = [];
60
- }
61
- else {
62
- this._labeledSources = this._labeledSources.delete(label);
63
- }
64
- }
65
- getLabels() {
66
- // We need to duplicate the labels as we need one for each source.
67
- // Say for label A we have sources [img1, img2, img3], then we
68
- // need labels [A, A, A].
69
- let labels = [];
70
- this._labeledSources.forEach((sources, label) => {
71
- const sourcesLabels = Array.from({ length: sources.length }, (_) => label);
72
- labels = labels.concat(sourcesLabels);
73
- });
74
- return labels.flat();
75
- }
76
- async build(config) {
77
- // Require that at least one source collection is non-empty, but not both
78
- if (this._unlabeledSources.length + this._labeledSources.size === 0) {
79
- throw new Error('No input files connected'); // This error message is parsed in Trainer.vue
80
- }
81
- let dataTuple;
82
- if (this._unlabeledSources.length > 0) {
83
- let defaultConfig = {};
84
- if (config?.inference === true) {
85
- // Inferring model, no labels needed
86
- defaultConfig = {
87
- features: this.task.trainingInformation.inputColumns,
88
- shuffle: true
89
- };
90
- }
91
- else {
92
- // Labels are contained in the given sources
93
- defaultConfig = {
94
- features: this.task.trainingInformation.inputColumns,
95
- labels: this.task.trainingInformation.outputColumns,
96
- shuffle: true
97
- };
98
- }
99
- dataTuple = await this.dataLoader.loadAll(this._unlabeledSources, { ...defaultConfig, ...config });
100
- }
101
- else {
102
- // Labels are inferred from the file selection boxes
103
- const defaultConfig = {
104
- labels: this.getLabels(),
105
- shuffle: true
106
- };
107
- const sources = this._labeledSources.valueSeq().toArray().flat();
108
- dataTuple = await this.dataLoader.loadAll(sources, { ...defaultConfig, ...config });
109
- }
110
- return dataTuple;
111
- }
112
- get size() {
113
- return Math.max(this._unlabeledSources.length, this._labeledSources.size);
114
- }
115
- get sources() {
116
- return this._unlabeledSources.length > 0 ? this._unlabeledSources : this._labeledSources.valueSeq().toArray().flat();
117
- }
118
- }