@datagrok/eda 1.1.29 → 1.1.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@datagrok/eda",
3
3
  "friendlyName": "EDA",
4
- "version": "1.1.29",
4
+ "version": "1.1.30",
5
5
  "description": "Exploratory Data Analysis Tools",
6
6
  "dependencies": {
7
7
  "@datagrok-libraries/math": "^1.1.10",
package/src/package.ts CHANGED
@@ -31,6 +31,7 @@ import {markovCluster} from '@datagrok-libraries/ml/src/MCL/clustering-view';
31
31
  import {MCL_OPTIONS_TAG, MCLSerializableOptions} from '@datagrok-libraries/ml/src/MCL';
32
32
 
33
33
  import {getLinearRegressionParams, getPredictionByLinearRegression} from './regression';
34
+ import {PlsModel} from './pls/pls-ml';
34
35
  import {SoftmaxClassifier} from './softmax-classifier';
35
36
 
36
37
  export const _package = new DG.Package();
@@ -668,3 +669,68 @@ export function isApplicableSoftmax(df: DG.DataFrame, predictColumn: DG.Column):
668
669
  export function isInteractiveSoftmax(df: DG.DataFrame, predictColumn: DG.Column): boolean {
669
670
  return SoftmaxClassifier.isInteractive(df.columns, predictColumn);
670
671
  }
672
+
673
//name: trainPLSRegression
//meta.mlname: PLS Regression
//meta.mlrole: train
//input: dataframe df
//input: column predictColumn
//input: int components = 3 {min: 1; max: 10} [Number of latent components]
//output: dynamic model
export async function trainPLSRegression(df: DG.DataFrame, predictColumn: DG.Column, components: number): Promise<Uint8Array> {
  // Train a PLS regression model and return it in packed (bytes) form.
  // Every column of df is treated as a feature.
  const featureCols = df.columns;

  // The number of latent components must not exceed the number of features
  const tooManyComponents = components > featureCols.length;
  if (tooManyComponents)
    throw new Error('Number of components is greater than features count');

  const plsModel = new PlsModel();
  await plsModel.fit(featureCols, predictColumn, components);

  return plsModel.toBytes();
}
691
+
692
//name: applyPLSRegression
//meta.mlname: PLS Regression
//meta.mlrole: apply
//input: dataframe df
//input: dynamic model
//output: dataframe table
export function applyPLSRegression(df: DG.DataFrame, model: any): DG.DataFrame {
  // Restore the model from its packed form and predict using all columns of df;
  // the single prediction column is wrapped into a new dataframe.
  const prediction = new PlsModel(model).predict(df.columns);

  return DG.DataFrame.fromColumns([prediction]);
}
702
+
703
//name: isApplicablePLSRegression
//meta.mlname: PLS Regression
//meta.mlrole: isApplicable
//input: dataframe df
//input: column predictColumn
//output: bool result
export function isApplicablePLSRegression(df: DG.DataFrame, predictColumn: DG.Column): boolean {
  // The decision is delegated to the model class
  const applicable = PlsModel.isApplicable(df.columns, predictColumn);

  return applicable;
}
712
+
713
//name: visualizePLSRegression
//meta.mlname: PLS Regression
//meta.mlrole: visualize
//input: dataframe df
//input: column targetColumn
//input: column predictColumn
//input: dynamic model
//output: dynamic widget
export async function visualizePLSRegression(df: DG.DataFrame, targetColumn: DG.Column, predictColumn: DG.Column, model: any): Promise<any> {
  // Only the packed model is used here: the viewers are built from the data stored in it.
  // The result is the list of root elements of the model's viewers.
  const restoredModel = new PlsModel(model);

  return restoredModel.viewers().map(({root}) => root);
}
727
+
728
//name: isInteractivePLSRegression
//meta.mlname: PLS Regression
//meta.mlrole: isInteractive
//input: dataframe df
//input: column predictColumn
//output: bool result
export function isInteractivePLSRegression(df: DG.DataFrame, predictColumn: DG.Column): boolean {
  // The decision is delegated to the model class
  const interactive = PlsModel.isInteractive(df.columns, predictColumn);

  return interactive;
}
@@ -34,6 +34,7 @@ export enum TITLE {
34
34
  SCORES = 'Scores',
35
35
  EXPL_VAR = 'Explained Variance',
36
36
  EXPLORE = 'Explore',
37
+ FEATURES = 'Feature names',
37
38
  }
38
39
 
39
40
  /** Tooltips */
@@ -0,0 +1,376 @@
1
+ // Predictive tools based on the PLS method
2
+
3
+ import * as grok from 'datagrok-api/grok';
4
+ import * as ui from 'datagrok-api/ui';
5
+ import * as DG from 'datagrok-api/dg';
6
+
7
+ import {TITLE, RESULT_NAMES} from './pls-constants';
8
+ import {getPlsAnalysis, PlsOutput, getLines} from './pls-tools';
9
+ import {LINK} from './pls-constants';
10
+ import {getPredictionByLinearRegression} from '../regression';
11
+
12
// PLS ML specific constants

// --- Layout of the model dataframe ---

/** Extra trailing row: holds the bias in the coefficients column and the explained variance in each loadings column */
const EXTRA_ROWS = 1;

/** Number of leading columns (feature names, regression coefficients) that precede the loadings columns */
const SHIFT = 2;

/** Minimal number of loadings columns */
const MIN_LOADINGS = 1;

/** Minimal number of columns in the model dataframe */
const MIN_COLS_COUNT = SHIFT + MIN_LOADINGS;

// --- Layout of the packed model ---

/** Index of the model dataframe bytes count in the sizes header */
const MODEL_IDX = 0;

/** Index of the scores dataframe bytes count in the sizes header */
const SCORES_IDX = 1;

/** Number of items in the sizes header */
const SIZE_ARR_LEN = 2;

/** Bytes occupied by the sizes header (sizes are stored as 32-bit unsigned integers) */
const BYTES_PER_SIZES = SIZE_ARR_LEN * Uint32Array.BYTES_PER_ELEMENT;

/** The packed model length is a multiple of this value */
const BLOCK_SIZE = 64;
22
+
23
/** Interactivity thresholds: upper bounds on the data size checked by the interactivity test */
enum INTERACTIVITY {
  /** Maximal number of samples (rows) */
  MAX_SAMLPES = 100_000,
  /** Maximal number of features (columns) */
  MAX_FEATURES = 1_000,
}
28
+
29
/** Model specification */
type PlsModelSpecification = {
  /** Names of features, followed by one extra item */
  names: string[];
  /** Regression coefficients, followed by one extra item with the bias */
  params: Float32Array;
  /** x-loadings: one array per component, each followed by one extra item with the explained variance */
  loadings: Float32Array[];
  /** Number of latent components */
  components: number;
  /** Number of features */
  dim: number;
  /** Dataframe with scores */
  scores: DG.DataFrame;
};
38
+
39
/**
 * PLS regression modeling tool.
 *
 * An instance is either trained with `fit` or restored from the bytes produced by `toBytes`.
 * Packed layout: a header of two 32-bit unsigned sizes (model dataframe bytes count, scores
 * dataframe bytes count), then the model dataframe bytes, then the scores dataframe bytes;
 * the rest of the array is padding.
 */
export class PlsModel {
  /** Check applicability: every feature column and the predict column must match 'numerical' */
  static isApplicable(features: DG.ColumnList, predictColumn: DG.Column): boolean {
    for (const col of features) {
      if (!col.matches('numerical'))
        return false;
    }
    if (!predictColumn.matches('numerical'))
      return false;

    return true;
  }

  /** Check interactivity: features count and samples count must not exceed the thresholds */
  static isInteractive(features: DG.ColumnList, predictColumn: DG.Column): boolean {
    return (features.length <= INTERACTIVITY.MAX_FEATURES) &&
      (predictColumn.length <= INTERACTIVITY.MAX_SAMLPES);
  }

  /** Specification of the PLS model (null until the model is trained or loaded) */
  private specn: PlsModelSpecification | null = null;

  /**
   * Create a model.
   * @param packedModel bytes produced by `toBytes`; when omitted, an untrained model is created
   * @throws Error with the 'Failed to load model: ...' message if the bytes cannot be unpacked
   */
  constructor(packedModel?: Uint8Array) {
    if (packedModel) {
      try {
        if (packedModel.length < BYTES_PER_SIZES)
          throw new Error('incorrect bytes count');

        // Extract bytes counts. The header is copied into its own buffer: `packedModel` may be
        // a view with a non-zero (and not 4-byte aligned) offset inside a larger buffer, so
        // reading `packedModel.buffer` from position 0 would give wrong data.
        const sizeArr = new Uint32Array(packedModel.slice(0, BYTES_PER_SIZES).buffer);
        const modelDfBytesCount = sizeArr[MODEL_IDX];
        const scoresDfBytesCount = sizeArr[SCORES_IDX];

        const scoresOffset = BYTES_PER_SIZES + modelDfBytesCount;

        // `subarray` silently clamps out-of-range bounds, so the check is explicit
        if (scoresOffset + scoresDfBytesCount > packedModel.length)
          throw new Error('incorrect bytes count');

        // Model's bytes (offsets are relative to the view, not to the underlying buffer)
        const modelBytes = packedModel.subarray(BYTES_PER_SIZES, scoresOffset);

        // Model as dataframe
        const modelDf = DG.DataFrame.fromByteArray(modelBytes);
        const rowCount = modelDf.rowCount;
        const columns = modelDf.columns;
        const colsCount = columns.length;

        // Scores
        const scoresBytes = packedModel.subarray(scoresOffset, scoresOffset + scoresDfBytesCount);
        const scores = DG.DataFrame.fromByteArray(scoresBytes);

        if (colsCount < MIN_COLS_COUNT)
          throw new Error('incorrect columns count');

        // Extract names of features
        const featureNames = columns.byName(TITLE.FEATURES).toList();

        // Extract parameters of the linear model
        const params = new Float32Array(rowCount);
        params.set(columns.byName(TITLE.REGR_COEFS).getRawData());

        // Extract loadings: they are stored in the columns that follow the first SHIFT ones
        const components = colsCount - SHIFT;
        const loadings = new Array<Float32Array>(components);

        for (let i = 0; i < components; ++i) {
          loadings[i] = new Float32Array(rowCount);
          loadings[i].set(columns.byIndex(i + SHIFT).getRawData());
        }

        this.specn = {
          params: params,
          loadings: loadings,
          names: featureNames,
          dim: rowCount - EXTRA_ROWS, // the last row is the extra one
          components: components,
          scores: scores,
        };
      } catch (error) {
        throw new Error(`Failed to load model: ${(error instanceof Error ? error.message : 'the platform issue')}`);
      }
    }
  }

  /**
   * Train model.
   * @param features feature columns
   * @param target column with the values to predict
   * @param components number of latent components
   */
  public async fit(features: DG.ColumnList, target: DG.Column, components: number) {
    const analysis = await getPlsAnalysis({
      table: DG.DataFrame.fromColumns([target]),
      features: features,
      predict: target,
      components: components,
      names: undefined,
    });

    // 1. Names of features
    const featureNames = features.names();
    featureNames.push('_'); // add extra item

    // 2. Regression coefficients
    const params = this.getRegrCoeffs(features, target, analysis.regressionCoefficients);

    // 3. Loadings
    const loadings = this.getLoadings(components, analysis.xLoadings);

    // 4. Model specification
    this.specn = {
      names: featureNames,
      params: params,
      loadings: loadings,
      components: components,
      dim: features.length,
      scores: this.getScoresDf(analysis),
    };

    // 5. Compute explained variances
    this.computeExplVars(target.length, components, analysis.yLoadings);
  } // fit

  /** Return x-loadings with extra items reserved for explained variances */
  private getLoadings(components: number, loadingsCols: DG.Column[]): Float32Array[] {
    const res = Array<Float32Array>(components);
    const len = loadingsCols[0].length + EXTRA_ROWS;

    for (let i = 0; i < components; ++i) {
      res[i] = new Float32Array(len);
      res[i].set(loadingsCols[i].getRawData());
    }

    return res;
  }

  /** Return regression coefficients followed by the bias */
  private getRegrCoeffs(features: DG.ColumnList, target: DG.Column, regrCoefsCol: DG.Column): Float32Array {
    const dim = features.length;
    const params = new Float32Array(dim + EXTRA_ROWS);
    const paramsByPLS = regrCoefsCol.getRawData();

    let tmpSum = 0;

    for (let i = 0; i < dim; ++i) {
      params[i] = paramsByPLS[i];
      tmpSum += paramsByPLS[i] * features.byIndex(i).stats.avg;
    }

    // compute bias: target average minus the weighted sum of feature averages
    params[dim] = target.stats.avg - tmpSum;

    return params;
  }

  /** Compute explained variances and store them in the extra items of the loadings */
  private computeExplVars(samplesCount: number, components: number, yLoadings: DG.Column) {
    if (this.specn === null)
      throw new Error('Failed to compute explained variances');

    const raw = yLoadings.getRawData();
    const dim = this.specn.loadings[0].length - EXTRA_ROWS;

    // Compute, source: the paper https://doi.org/10.1002/cem.2589
    let explVar = raw[0]**2 / samplesCount;

    this.specn.loadings[0][dim] = explVar;

    // each next value accumulates the previous ones
    for (let comp = 1; comp < components; ++comp) {
      explVar += raw[comp]**2 / samplesCount;
      this.specn.loadings[comp][dim] = explVar;
    }
  }

  /**
   * Return packed model.
   * @throws Error if the model is not trained
   */
  public toBytes(): Uint8Array {
    if (this.specn === null)
      throw new Error('Failed to pack untrained model');

    // 1. Store model params in dataframe
    const modelDf = DG.DataFrame.fromColumns([
      DG.Column.fromStrings(TITLE.FEATURES, this.specn.names),
      DG.Column.fromFloat32Array(TITLE.REGR_COEFS, this.specn.params),
    ]);

    this.specn.loadings.forEach((array, idx) => modelDf.columns.add(DG.Column.fromFloat32Array(
      `${TITLE.XLOADING}${idx + 1}`,
      array,
    )));

    // 2. Pack model dataframe
    const modelDfBytes = modelDf.toByteArray();
    const modelDfBytesCount = modelDfBytes.length;

    // 3. Pack scores dataframe
    const scoresBytes = this.specn.scores.toByteArray();
    const scoresBytesCount = scoresBytes.length;

    const requiredBytes = modelDfBytesCount + scoresBytesCount + BYTES_PER_SIZES;

    // The size is rounded up to a whole number of blocks, plus one more block
    const packedModel = new Uint8Array((Math.ceil(requiredBytes / BLOCK_SIZE) + 1) * BLOCK_SIZE);

    // Header: bytes counts of the model and of the scores
    const sizeArr = new Uint32Array(packedModel.buffer, 0, SIZE_ARR_LEN);
    sizeArr[MODEL_IDX] = modelDfBytesCount;
    sizeArr[SCORES_IDX] = scoresBytesCount;

    // Store model's bytes
    packedModel.set(modelDfBytes, BYTES_PER_SIZES);

    // Store scores bytes
    packedModel.set(scoresBytes, BYTES_PER_SIZES + modelDfBytesCount);

    return packedModel;
  } // toBytes

  /**
   * Return prediction.
   * @throws Error if the model is not trained
   */
  public predict(features: DG.ColumnList): DG.Column {
    if (this.specn === null)
      throw new Error('Predicting failed: model is not trained');

    return getPredictionByLinearRegression(features, this.specn.params);
  }

  /** Return loadings and regression coefficients viewers */
  private loadingsParamsViewers(): DG.Viewer[] {
    if (this.specn === null)
      throw new Error('Failed to create loadings and parameters viewers: untrained model');

    const viewers: DG.Viewer[] = [];

    const dim = this.specn.dim;

    // Parameters and loadings dataframe (extra items are dropped)
    const loadingsDf = DG.DataFrame.fromColumns([
      DG.Column.fromStrings(TITLE.FEATURES, this.specn.names.slice(0, -1)),
      DG.Column.fromFloat32Array(TITLE.REGR_COEFS, this.specn.params, dim),
    ]);

    const columns = loadingsDf.columns;
    const shift = columns.length;
    const components = this.specn.components;

    this.specn.loadings.forEach((arr, idx) => loadingsDf.columns.add(
      DG.Column.fromFloat32Array(`${TITLE.XLOADING}${idx + 1}`, arr, dim),
    ));

    // Loading scatterplot: the first loadings column vs. the second one (or itself, if it is the only one)
    viewers.push(DG.Viewer.scatterPlot(loadingsDf, {
      title: TITLE.LOADINGS,
      xColumnName: columns.byIndex(shift).name,
      yColumnName: columns.byIndex(shift + (components > 1 ? 1 : 0)).name,
      markerType: DG.MARKER_TYPE.CIRCLE,
      labels: TITLE.FEATURES,
      help: LINK.LOADINGS,
    }));

    // Regression coefficients barchart
    viewers.push(DG.Viewer.barChart(loadingsDf, {
      title: TITLE.REGR_COEFS,
      splitColumnName: TITLE.FEATURES,
      valueColumnName: TITLE.REGR_COEFS,
      valueAggrType: DG.AGG.AVG,
      help: LINK.COEFFS,
      showValueSelector: false,
      showStackSelector: false,
    }));

    return viewers;
  } // loadingsParamsViewers

  /** Return explained variances viewer */
  private explVarsViewer(): DG.Viewer {
    if (this.specn === null)
      throw new Error('Failed to create exaplained variances viewer: untrained model');

    const components = this.specn.components;
    const dim = this.specn.dim;

    const compNames = new Array<string>(components);
    const explVars = new Float32Array(components);

    // Explained variances are stored in the extra items of the loadings
    compNames[0] = `${RESULT_NAMES.COMP} 1`;
    explVars[0] = this.specn.loadings[0][dim];

    for (let i = 1; i < components; ++i) {
      compNames[i] = `${RESULT_NAMES.COMPS} ${i + 1}`;
      explVars[i] = this.specn.loadings[i][dim];
    }

    return DG.Viewer.barChart(DG.DataFrame.fromColumns([
      DG.Column.fromStrings(RESULT_NAMES.COMPS, compNames),
      DG.Column.fromFloat32Array(TITLE.EXPL_VAR, explVars),
    ]), {
      title: TITLE.EXPL_VAR,
      splitColumnName: RESULT_NAMES.COMPS,
      valueColumnName: TITLE.EXPL_VAR,
      valueAggrType: DG.AGG.AVG,
      help: LINK.EXPL_VARS,
      showCategorySelector: false,
      showStackSelector: false,
      showValueSelector: false,
    });
  }

  /**
   * Returns viewers: loadings scatterplot, regression coefficients barchart,
   * explained variances barchart and scores scatterplot.
   * @throws Error if the model is not trained
   */
  public viewers(): DG.Viewer[] {
    if (this.specn === null)
      throw new Error('Failed to create viewers: untrained model');

    const viewers = this.loadingsParamsViewers();
    viewers.push(
      this.explVarsViewer(),
      this.getScoresScatter(),
    );

    return viewers;
  }

  /** Return dataframe with scores (the t- and u-scores columns of the analysis are renamed in place) */
  private getScoresDf(analysis: PlsOutput): DG.DataFrame {
    const tScores = analysis.tScores;
    const uScores = analysis.uScores;

    tScores.forEach((col, idx) => col.name = `${TITLE.XSCORE}${idx + 1}`);
    uScores.forEach((col, idx) => col.name = `${TITLE.YSCORE}${idx + 1}`);

    return DG.DataFrame.fromColumns(tScores.concat(uScores));
  }

  /** Return scores scatter */
  private getScoresScatter(): DG.Viewer {
    if (this.specn === null)
      throw new Error('Failed to create scores scatter: untrained model');

    const names = this.specn.scores.columns.names();

    const scatter = DG.Viewer.scatterPlot(this.specn.scores, {
      title: TITLE.SCORES,
      xColumnName: names[0],
      yColumnName: names[1],
      markerType: DG.MARKER_TYPE.CIRCLE,
      help: LINK.SCORES,
      showViewerFormulaLines: true,
    });

    scatter.meta.formulaLines.addAll(getLines(names));

    return scatter;
  }
}
@@ -30,9 +30,43 @@ export type PlsInput = {
30
30
  features: DG.ColumnList,
31
31
  predict: DG.Column,
32
32
  components: number,
33
- names : DG.Column | null,
33
+ names : DG.Column | undefined,
34
34
  };
35
35
 
36
/**
 * Return formula lines for the given column names.
 *
 * For each name, the line `${name} = 0` is added. Then, for each ordered pair of names (x, y)
 * and each radius from RADIUS, two lines are added: the upper and the lower half of the circle
 * `y = ±sqrt(r² - x²)`, bounded by [-r, r].
 */
export function getLines(names: string[]): DG.FormulaLine[] {
  const result: DG.FormulaLine[] = [];

  // Column reference in the formula syntax
  const ref = (name: string): string => '${' + name + '}';

  for (const xName of names) {
    const x = ref(xName);

    // Axis
    result.push({type: 'line', formula: `${x} = 0`, width: LINE_WIDTH, visible: true, title: ' ', color: COLOR.AXIS});

    for (const yName of names) {
      const y = ref(yName);

      RADIUS.forEach((r) => {
        // Upper half of the circle first, then the lower one
        for (const sign of ['', '-']) {
          result.push({
            type: 'line',
            formula: `${y} = ${sign}sqrt(${r*r} - ${x} * ${x})`,
            width: LINE_WIDTH,
            visible: true,
            title: ' ',
            min: -r,
            max: r,
            color: COLOR.CIRCLE,
          });
        }
      });
    }
  }

  return result;
}
69
+
36
70
  /** Partial least square regression (PLS) */
37
71
  export async function getPlsAnalysis(input: PlsInput): Promise<PlsOutput> {
38
72
  checkWasmDimensionReducerInputs(input.features, input.components);
@@ -149,36 +183,7 @@ async function performMVA(input: PlsInput, analysisType: PLS_ANALYSIS): Promise<
149
183
  });
150
184
 
151
185
  // 4.3) create lines & circles
152
- const lines = [] as DG.FormulaLine[];
153
-
154
- const addLine = (formula: string, radius: number) => {
155
- lines.push({
156
- type: 'line',
157
- formula: formula,
158
- width: LINE_WIDTH,
159
- visible: true,
160
- title: ' ',
161
- min: -radius,
162
- max: radius,
163
- color: COLOR.CIRCLE,
164
- });
165
- };
166
-
167
- scoreNames.forEach((xName) => {
168
- const x = '${' + xName + '}';
169
- lines.push({type: 'line', formula: `${x} = 0`, width: LINE_WIDTH, visible: true, title: ' ', color: COLOR.AXIS});
170
-
171
- scoreNames.forEach((yName) => {
172
- const y = '${' + yName + '}';
173
-
174
- RADIUS.forEach((r) => {
175
- addLine(y + ` = sqrt(${r*r} - ${x} * ${x})`, r);
176
- addLine(y + ` = -sqrt(${r*r} - ${x} * ${x})`, r);
177
- });
178
- });
179
- });
180
-
181
- scoresScatter.meta.formulaLines.addAll(lines);
186
+ scoresScatter.meta.formulaLines.addAll(getLines(scoreNames));
182
187
  view.addViewer(scoresScatter);
183
188
 
184
189
  // 5. Explained Variances
@@ -334,8 +339,11 @@ export async function runMVA(analysisType: PLS_ANALYSIS): Promise<void> {
334
339
  };
335
340
 
336
341
  // names of samples
337
- let names = (strCols.length > 0) ? strCols[0] : null;
338
- const namesInputs = ui.input.column(TITLE.NAMES, {table: table, value: names!, onValueChanged: () => names = predictInput.value,
342
+ let names = (strCols.length > 0) ? strCols[0] : undefined;
343
+ const namesInputs = ui.input.column(TITLE.NAMES, {
344
+ table: table,
345
+ value: names,
346
+ onValueChanged: () => names = predictInput.value ?? undefined,
339
347
  filter: (col: DG.Column) => col.type === DG.COLUMN_TYPE.STRING},
340
348
  );
341
349
  namesInputs.setTooltip(HINT.NAMES);
package/src/regression.ts CHANGED
@@ -191,7 +191,7 @@ async function getLinearRegressionParamsUsingPLS(features: DG.ColumnList,
191
191
  features: features,
192
192
  predict: targets,
193
193
  components: components,
194
- names: null,
194
+ names: undefined,
195
195
  });
196
196
 
197
197
  return plsAnalysis.regressionCoefficients.getRawData() as Float32Array;