@epfml/discojs 3.0.1-p20240820135253.0 → 3.0.1-p20240822103944.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/dist/dataset/data/data.d.ts +6 -7
  2. package/dist/dataset/data/data.js +12 -7
  3. package/dist/dataset/data/helpers.d.ts +10 -0
  4. package/dist/dataset/data/helpers.js +97 -0
  5. package/dist/dataset/data/image_data.d.ts +3 -3
  6. package/dist/dataset/data/image_data.js +7 -2
  7. package/dist/dataset/data/index.d.ts +0 -1
  8. package/dist/dataset/data/preprocessing/text_preprocessing.js +23 -9
  9. package/dist/dataset/data/tabular_data.d.ts +3 -3
  10. package/dist/dataset/data/text_data.d.ts +3 -3
  11. package/dist/dataset/dataset.d.ts +48 -5
  12. package/dist/dataset/dataset.js +155 -1
  13. package/dist/dataset/image.d.ts +14 -0
  14. package/dist/dataset/image.js +21 -0
  15. package/dist/dataset/index.d.ts +3 -5
  16. package/dist/dataset/index.js +3 -3
  17. package/dist/dataset/types.d.ts +4 -0
  18. package/dist/dataset/types.js +2 -0
  19. package/dist/index.d.ts +4 -0
  20. package/dist/index.js +4 -0
  21. package/dist/models/gpt/model.js +2 -0
  22. package/dist/models/model.d.ts +1 -2
  23. package/dist/models/tfjs.d.ts +4 -4
  24. package/dist/models/tfjs.js +2 -1
  25. package/dist/processing.d.ts +35 -0
  26. package/dist/processing.js +89 -0
  27. package/dist/training/disco.d.ts +7 -7
  28. package/dist/training/disco.js +21 -19
  29. package/dist/types.d.ts +3 -0
  30. package/dist/types.js +1 -0
  31. package/dist/validation/validator.d.ts +7 -23
  32. package/dist/validation/validator.js +99 -105
  33. package/package.json +1 -1
  34. package/dist/dataset/data_loader/data_loader.d.ts +0 -13
  35. package/dist/dataset/data_loader/data_loader.js +0 -2
  36. package/dist/dataset/data_loader/image_loader.d.ts +0 -21
  37. package/dist/dataset/data_loader/image_loader.js +0 -101
  38. package/dist/dataset/data_loader/index.d.ts +0 -5
  39. package/dist/dataset/data_loader/index.js +0 -4
  40. package/dist/dataset/data_loader/tabular_loader.d.ts +0 -35
  41. package/dist/dataset/data_loader/tabular_loader.js +0 -76
  42. package/dist/dataset/data_loader/text_loader.d.ts +0 -14
  43. package/dist/dataset/data_loader/text_loader.js +0 -25
  44. package/dist/dataset/dataset_builder.d.ts +0 -51
  45. package/dist/dataset/dataset_builder.js +0 -118
@@ -1,101 +0,0 @@
1
- import { Range } from 'immutable';
2
- import * as tf from '@tensorflow/tfjs';
3
- import { ImageData } from '../data/index.js';
4
- import { DataLoader } from '../data_loader/index.js';
5
- /**
6
- * Image data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
7
- * @epfml/discojs-web and @epfml/discojs-node.
8
- * Load labels and correctly match them with their respective images, with the following constraints:
9
- * 1. Images are given as 1 image/1 file;
10
- * 2. Labels are given as multiple labels/1 file, each label file can contain a different amount of labels.
11
- */
12
- export class ImageLoader extends DataLoader {
13
- task;
14
- constructor(task) {
15
- super();
16
- this.task = task;
17
- }
18
- async load(image, config) {
19
- let tensorContainer;
20
- if (config?.labels === undefined) {
21
- tensorContainer = await this.readImageFrom(image, config?.channels);
22
- }
23
- else {
24
- tensorContainer = {
25
- xs: await this.readImageFrom(image, config?.channels),
26
- ys: config.labels[0]
27
- };
28
- }
29
- return tf.data.array([tensorContainer]);
30
- }
31
- async buildDataset(images, labels, indices, config) {
32
- // Can't use arrow function for generator and need access to 'this'
33
- // eslint-disable-next-line
34
- const self = this;
35
- async function* dataGenerator() {
36
- const withLabels = config?.labels !== undefined;
37
- let index = 0;
38
- while (index < indices.length) {
39
- const sample = await self.readImageFrom(images[indices[index]], config?.channels);
40
- const label = withLabels ? labels[indices[index]] : undefined;
41
- const value = withLabels ? { xs: sample, ys: label } : sample;
42
- index++;
43
- yield value;
44
- }
45
- }
46
- // @ts-expect-error: For some reason TypeScript refuses async generators, but TensorFlow does work with them
47
- const dataset = tf.data.generator(dataGenerator);
48
- return await ImageData.init(dataset, this.task, indices.length);
49
- }
50
- async loadAll(images, config) {
51
- let labels = [];
52
- const indices = Range(0, images.length).toArray();
53
- if (config?.labels !== undefined) {
54
- const labelList = this.task.trainingInformation?.LABEL_LIST;
55
- if (labelList === undefined || !Array.isArray(labelList)) {
56
- throw new Error('LABEL_LIST should be specified in the task training information');
57
- }
58
- const numberOfClasses = labelList.length;
59
- // Map label strings to integer
60
- const label_to_int = new Map(labelList.map((label_name, idx) => [label_name, idx]));
61
- if (label_to_int.size !== numberOfClasses) {
62
- throw new Error("Input labels aren't matching the task LABEL_LIST");
63
- }
64
- labels = config.labels.map(label_name => {
65
- const label_int = label_to_int.get(label_name);
66
- if (label_int === undefined) {
67
- throw new Error(`Found input label ${label_name} not specified in task LABEL_LIST`);
68
- }
69
- return label_int;
70
- });
71
- labels = await tf.oneHot(tf.tensor1d(labels, 'int32'), numberOfClasses).array();
72
- }
73
- if (config?.shuffle === undefined || config?.shuffle) {
74
- this.shuffle(indices);
75
- }
76
- if (config?.validationSplit === undefined || config?.validationSplit === 0) {
77
- const dataset = await this.buildDataset(images, labels, indices, config);
78
- return {
79
- train: dataset,
80
- validation: undefined
81
- };
82
- }
83
- const trainSize = Math.floor(images.length * (1 - config.validationSplit));
84
- const trainIndices = indices.slice(0, trainSize);
85
- const valIndices = indices.slice(trainSize);
86
- const trainDataset = await this.buildDataset(images, labels, trainIndices, config);
87
- const valDataset = await this.buildDataset(images, labels, valIndices, config);
88
- return {
89
- train: trainDataset,
90
- validation: valDataset
91
- };
92
- }
93
- shuffle(array) {
94
- for (let i = 0; i < array.length; i++) {
95
- const j = Math.floor(Math.random() * i);
96
- const swap = array[i];
97
- array[i] = array[j];
98
- array[j] = swap;
99
- }
100
- }
101
- }
@@ -1,5 +0,0 @@
1
- export type { DataConfig } from './data_loader.js';
2
- export { DataLoader } from './data_loader.js';
3
- export { ImageLoader } from './image_loader.js';
4
- export { TabularLoader } from './tabular_loader.js';
5
- export { TextLoader } from './text_loader.js';
@@ -1,4 +0,0 @@
1
- export { DataLoader } from './data_loader.js';
2
- export { ImageLoader } from './image_loader.js';
3
- export { TabularLoader } from './tabular_loader.js';
4
- export { TextLoader } from './text_loader.js';
@@ -1,35 +0,0 @@
1
- import type { Task } from '../../index.js';
2
- import type { Dataset, DataSplit } from '../index.js';
3
- import type { DataConfig } from './index.js';
4
- import { DataLoader } from './index.js';
5
- /**
6
- * Tabular data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
7
- * @epfml/discojs-web and @epfml/discojs-node. Loads data from files whose entries are line-separated and consist of
8
- * character-separated features and label(s). Such files typically have the .csv extension.
9
- */
10
- export declare abstract class TabularLoader<Source> extends DataLoader<Source> {
11
- private readonly task;
12
- readonly delimiter: string;
13
- constructor(task: Task, delimiter?: string);
14
- /**
15
- * Creates a CSV dataset object based off the given source.
16
- * @param source File object, URL string or local file system path.
17
- * @param csvConfig Object expected by TF.js to create a CSVDataset.
18
- * @returns The CSVDataset object built upon the given source.
19
- */
20
- abstract loadDatasetFrom(source: Source, csvConfig: Record<string, unknown>): Promise<Dataset>;
21
- /**
22
- * Expects delimiter-separated tabular data made of N columns. The data may be
23
- * potentially split among several sources. Every source should contain N-1
24
- * feature columns and 1 single label column.
25
- * @param source List of File objects, URLs or file system paths.
26
- * @param config
27
- * @returns A TF.js dataset built upon read tabular data stored in the given sources.
28
- */
29
- load(source: Source, config?: DataConfig): Promise<Dataset>;
30
- /**
31
- * Creates the CSV datasets based off the given sources, then fuses them into a single CSV
32
- * dataset.
33
- */
34
- loadAll(sources: Source[], config: DataConfig): Promise<DataSplit>;
35
- }
@@ -1,76 +0,0 @@
1
- import { List, Map, Set } from 'immutable';
2
- import { TabularData } from '../index.js';
3
- import { DataLoader } from './index.js';
4
- // Window size from which the dataset shuffling will sample
5
- const BUFFER_SIZE = 1000;
6
- /**
7
- * Tabular data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
8
- * @epfml/discojs-web and @epfml/discojs-node. Loads data from files whose entries are line-separated and consist of
9
- * character-separated features and label(s). Such files typically have the .csv extension.
10
- */
11
- export class TabularLoader extends DataLoader {
12
- task;
13
- delimiter;
14
- constructor(task, delimiter = ',') {
15
- super();
16
- this.task = task;
17
- this.delimiter = delimiter;
18
- }
19
- /**
20
- * Expects delimiter-separated tabular data made of N columns. The data may be
21
- * potentially split among several sources. Every source should contain N-1
22
- * feature columns and 1 single label column.
23
- * @param source List of File objects, URLs or file system paths.
24
- * @param config
25
- * @returns A TF.js dataset built upon read tabular data stored in the given sources.
26
- */
27
- async load(source, config) {
28
- /**
29
- * Prepare the CSV config object based off the given features and labels.
30
- * If labels is empty, then the returned dataset is comprised of samples only.
31
- * Otherwise, each entry is of the form `{ xs, ys }` with `xs` as features and `ys`
32
- * as labels.
33
- */
34
- if (config?.features === undefined) {
35
- // TODO @s314cy
36
- throw new Error('Not implemented');
37
- }
38
- const columnConfigs = Map(Set(config.features).map((feature) => [feature, { required: false, isLabel: false }])).merge(Set(config.labels).map((label) => [label, { required: true, isLabel: true }]));
39
- const csvConfig = {
40
- hasHeader: true,
41
- columnConfigs: columnConfigs.toObject(),
42
- configuredColumnsOnly: true,
43
- delimiter: this.delimiter
44
- };
45
- const dataset = (await this.loadDatasetFrom(source, csvConfig)).map((t) => {
46
- if (typeof t === 'object') {
47
- if (('xs' in t) && ('ys' in t)) {
48
- const { xs, ys } = t;
49
- return {
50
- xs: Object.values(xs),
51
- ys: Object.values(ys)
52
- };
53
- }
54
- else {
55
- return t;
56
- }
57
- }
58
- throw new TypeError('Expected TensorContainerObject');
59
- });
60
- return (config?.shuffle === undefined || config?.shuffle) ? dataset.shuffle(BUFFER_SIZE) : dataset;
61
- }
62
- /**
63
- * Creates the CSV datasets based off the given sources, then fuses them into a single CSV
64
- * dataset.
65
- */
66
- async loadAll(sources, config) {
67
- const datasets = await Promise.all(sources.map(async (source) => await this.load(source, { ...config, shuffle: false })));
68
- let dataset = List(datasets).reduce((acc, dataset) => acc.concatenate(dataset));
69
- dataset = config?.shuffle === true ? dataset.shuffle(BUFFER_SIZE) : dataset;
70
- const data = await TabularData.init(dataset, this.task);
71
- // TODO: Implement validation split for tabular data (tricky due to streaming)
72
- return {
73
- train: data
74
- };
75
- }
76
- }
@@ -1,14 +0,0 @@
1
- import type { Task } from '../../index.js';
2
- import type { DataSplit, Dataset } from '../index.js';
3
- import { DataLoader, DataConfig } from './index.js';
4
- /**
5
- * Text data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
6
- * @epfml/discojs-web and @epfml/discojs-node.
7
- */
8
- export declare abstract class TextLoader<S> extends DataLoader<S> {
9
- private readonly task;
10
- constructor(task: Task);
11
- abstract loadDatasetFrom(source: S): Promise<Dataset>;
12
- load(source: S, config?: DataConfig): Promise<Dataset>;
13
- loadAll(sources: S[], config?: DataConfig): Promise<DataSplit>;
14
- }
@@ -1,25 +0,0 @@
1
- import { TextData } from '../index.js';
2
- import { DataLoader } from './index.js';
3
- /**
4
- * Text data loader whose instantiable implementation is delegated by the platform-dependent Disco subprojects, namely,
5
- * @epfml/discojs-web and @epfml/discojs-node.
6
- */
7
- export class TextLoader extends DataLoader {
8
- task;
9
- constructor(task) {
10
- super();
11
- this.task = task;
12
- }
13
- async load(source, config) {
14
- const dataset = await this.loadDatasetFrom(source);
15
- // 1st arg: Stream shuffling buffer size
16
- return (config?.shuffle === undefined || config?.shuffle) ? dataset.shuffle(1000, undefined, true) : dataset;
17
- }
18
- async loadAll(sources, config) {
19
- const concatenated = (await Promise.all(sources.map(async (src) => await this.load(src, config))))
20
- .reduce((acc, dataset) => acc.concatenate(dataset));
21
- return {
22
- train: await TextData.init(concatenated, this.task)
23
- };
24
- }
25
- }
@@ -1,51 +0,0 @@
1
- import type { Task } from '../index.js';
2
- import type { DataSplit } from './data/index.js';
3
- import type { DataConfig, DataLoader } from './data_loader/data_loader.js';
4
- /**
5
- * Incrementally builds a dataset from the provided file sources. The sources may
6
- * either be file blobs or regular file system paths.
7
- */
8
- export declare class DatasetBuilder<Source> {
9
- /**
10
- * The data loader used to load the data contained in the provided files.
11
- */
12
- private readonly dataLoader;
13
- /**
14
- * The task for which the dataset should be built.
15
- */
16
- readonly task: Task;
17
- /**
18
- * The buffer of unlabelled file sources.
19
- */
20
- private _unlabeledSources;
21
- /**
22
- * The buffer of labelled file sources.
23
- */
24
- private _labeledSources;
25
- constructor(
26
- /**
27
- * The data loader used to load the data contained in the provided files.
28
- */
29
- dataLoader: DataLoader<Source>,
30
- /**
31
- * The task for which the dataset should be built.
32
- */
33
- task: Task);
34
- /**
35
- * Adds the given file sources to the builder's buffer. Sources may be provided a label in the case
36
- * of supervised learning.
37
- * @param sources The array of file sources
38
- * @param label The file sources label
39
- */
40
- addFiles(sources: Source[], label?: string): void;
41
- /**
42
- * Clears the file sources buffers. If a label is provided, only the file sources
43
- * corresponding to the given label will be removed.
44
- * @param label The file sources label
45
- */
46
- clearFiles(label?: string): void;
47
- private getLabels;
48
- build(config?: DataConfig): Promise<DataSplit>;
49
- get size(): number;
50
- get sources(): Source[];
51
- }
@@ -1,118 +0,0 @@
1
- import { Map } from 'immutable';
2
- /**
3
- * Incrementally builds a dataset from the provided file sources. The sources may
4
- * either be file blobs or regular file system paths.
5
- */
6
- export class DatasetBuilder {
7
- dataLoader;
8
- task;
9
- /**
10
- * The buffer of unlabelled file sources.
11
- */
12
- _unlabeledSources;
13
- /**
14
- * The buffer of labelled file sources.
15
- */
16
- _labeledSources;
17
- constructor(
18
- /**
19
- * The data loader used to load the data contained in the provided files.
20
- */
21
- dataLoader,
22
- /**
23
- * The task for which the dataset should be built.
24
- */
25
- task) {
26
- this.dataLoader = dataLoader;
27
- this.task = task;
28
- this._unlabeledSources = [];
29
- // Map from label to sources
30
- this._labeledSources = Map();
31
- }
32
- /**
33
- * Adds the given file sources to the builder's buffer. Sources may be provided a label in the case
34
- * of supervised learning.
35
- * @param sources The array of file sources
36
- * @param label The file sources label
37
- */
38
- addFiles(sources, label) {
39
- if (label === undefined) {
40
- this._unlabeledSources = this._unlabeledSources.concat(sources);
41
- }
42
- else {
43
- const currentSources = this._labeledSources.get(label);
44
- if (currentSources === undefined) {
45
- this._labeledSources = this._labeledSources.set(label, sources);
46
- }
47
- else {
48
- this._labeledSources = this._labeledSources.set(label, currentSources.concat(sources));
49
- }
50
- }
51
- }
52
- /**
53
- * Clears the file sources buffers. If a label is provided, only the file sources
54
- * corresponding to the given label will be removed.
55
- * @param label The file sources label
56
- */
57
- clearFiles(label) {
58
- if (label === undefined) {
59
- this._unlabeledSources = [];
60
- }
61
- else {
62
- this._labeledSources = this._labeledSources.delete(label);
63
- }
64
- }
65
- getLabels() {
66
- // We need to duplicate the labels as we need one for each source.
67
- // Say for label A we have sources [img1, img2, img3], then we
68
- // need labels [A, A, A].
69
- let labels = [];
70
- this._labeledSources.forEach((sources, label) => {
71
- const sourcesLabels = Array.from({ length: sources.length }, (_) => label);
72
- labels = labels.concat(sourcesLabels);
73
- });
74
- return labels.flat();
75
- }
76
- async build(config) {
77
- // Require that at least one source collection is non-empty, but not both
78
- if (this._unlabeledSources.length + this._labeledSources.size === 0) {
79
- throw new Error('No input files connected'); // This error message is parsed in Trainer.vue
80
- }
81
- let dataTuple;
82
- if (this._unlabeledSources.length > 0) {
83
- let defaultConfig = {};
84
- if (config?.inference === true) {
85
- // Inferring model, no labels needed
86
- defaultConfig = {
87
- features: this.task.trainingInformation.inputColumns,
88
- shuffle: true
89
- };
90
- }
91
- else {
92
- // Labels are contained in the given sources
93
- defaultConfig = {
94
- features: this.task.trainingInformation.inputColumns,
95
- labels: this.task.trainingInformation.outputColumns,
96
- shuffle: true
97
- };
98
- }
99
- dataTuple = await this.dataLoader.loadAll(this._unlabeledSources, { ...defaultConfig, ...config });
100
- }
101
- else {
102
- // Labels are inferred from the file selection boxes
103
- const defaultConfig = {
104
- labels: this.getLabels(),
105
- shuffle: true
106
- };
107
- const sources = this._labeledSources.valueSeq().toArray().flat();
108
- dataTuple = await this.dataLoader.loadAll(sources, { ...defaultConfig, ...config });
109
- }
110
- return dataTuple;
111
- }
112
- get size() {
113
- return Math.max(this._unlabeledSources.length, this._labeledSources.size);
114
- }
115
- get sources() {
116
- return this._unlabeledSources.length > 0 ? this._unlabeledSources : this._labeledSources.valueSeq().toArray().flat();
117
- }
118
- }