@datagrok/eda 1.1.18 → 1.1.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,376 @@
1
+ // Tools for multivariate analysis by PLS
2
+
3
+ import * as grok from 'datagrok-api/grok';
4
+ import * as ui from 'datagrok-api/ui';
5
+ import * as DG from 'datagrok-api/dg';
6
+
7
+ import {PLS_ANALYSIS, ERROR_MSG, TITLE, HINT, LINK, COMPONENTS, INT, TIMEOUT,
8
+ RESULT_NAMES, WASM_OUTPUT_IDX, RADIUS, LINE_WIDTH, COLOR, X_COORD, Y_COORD,
9
+ DEMO_INTRO_MD, DEMO_RESULTS_MD, DELAY, DEMO_RESULTS} from './pls-constants';
10
+ import {checkWasmDimensionReducerInputs, checkColumnType, checkMissingVals} from '../utils';
11
+ import {_partialLeastSquareRegressionInWebWorker} from '../../wasm/EDAAPI';
12
+ import {carsDataframe} from '../data-generators';
13
+
14
+ const min = Math.min;
15
+ const max = Math.max;
16
+
17
+ /** PLS analysis results: the named outputs that getPlsAnalysis unpacks from the wasm PLS routine */
18
+ export type PlsOutput = {
19
+ prediction: DG.Column<DG.COLUMN_TYPE.FLOAT>, // predicted response; plotted against the reference column in performMVA
20
+ regressionCoefficients: DG.Column<DG.COLUMN_TYPE.FLOAT>, // shown per feature in the regression coefficients bar chart
21
+ tScores: DG.Column<DG.COLUMN_TYPE.FLOAT>[], // added to the source table as PLS components / X-scores
22
+ uScores: DG.Column<DG.COLUMN_TYPE.FLOAT>[], // added to the source table under the TITLE.YSCORE prefix
23
+ xLoadings: DG.Column<DG.COLUMN_TYPE.FLOAT>[], // one column per component; shown in the loadings scatter plot
24
+ yLoadings: DG.Column<DG.COLUMN_TYPE.FLOAT>, // read as raw data in the explained variance computation
25
+ };
26
+
27
+ /** PLS analysis input: the arguments consumed by getPlsAnalysis and performMVA */
28
+ export type PlsInput = {
29
+ table: DG.DataFrame, // source table; performMVA adds the computed columns to it
30
+ features: DG.ColumnList, // predictor columns; validated by checkWasmDimensionReducerInputs
31
+ predict: DG.Column, // response column; must be INT/FLOAT and have no missing values
32
+ components: number, // number of PLS components to compute
33
+ names : DG.Column | null, // optional column whose name is passed as scatter plot labels
34
+ };
35
+
36
/**
 * Partial least square regression (PLS).
 *
 * Validates the inputs, calls `_partialLeastSquareRegressionInWebWorker` and
 * repacks its positional output (indexed by `WASM_OUTPUT_IDX`) into a named `PlsOutput`.
 *
 * @param input table, features, response column and components count
 * @returns prediction, regression coefficients, T-/U-scores and X-/Y-loadings
 * @throws Error if the response column is neither INT nor FLOAT, or has missing values;
 * the features/components check may throw as well
 */
export async function getPlsAnalysis(input: PlsInput): Promise<PlsOutput> {
  const {table, features, predict, components} = input;

  // Check the features and the components count
  checkWasmDimensionReducerInputs(features, components);

  // Check the response column
  checkColumnType(predict);
  checkMissingVals(predict);

  const wasmOutput = await _partialLeastSquareRegressionInWebWorker(table, features, predict, components);

  const output: PlsOutput = {
    prediction: wasmOutput[WASM_OUTPUT_IDX.PREDICTION],
    regressionCoefficients: wasmOutput[WASM_OUTPUT_IDX.REGR_COEFFS],
    tScores: wasmOutput[WASM_OUTPUT_IDX.T_SCORES],
    uScores: wasmOutput[WASM_OUTPUT_IDX.U_SCORES],
    xLoadings: wasmOutput[WASM_OUTPUT_IDX.X_LOADINGS],
    yLoadings: wasmOutput[WASM_OUTPUT_IDX.Y_LOADINGS],
  };

  return output;
}
60
+
61
/**
 * Perform multivariate analysis using the PLS regression.
 *
 * Always adds the PLS components (T-scores) to the input table. When `analysisType` is
 * `PLS_ANALYSIS.COMPUTE_COMPONENTS`, nothing else is done. Otherwise the prediction and the
 * U-scores are also added to the table, and five viewers are added to the table view:
 * predicted vs reference scatter plot, regression coefficients bar chart, loadings scatter plot,
 * scores scatter plot (with formula lines) and explained variances bar chart.
 * In the `PLS_ANALYSIS.DEMO` case, a hints wizard describing the viewers is shown in addition.
 *
 * @param input PLS analysis input
 * @param analysisType kind of analysis to perform
 */
async function performMVA(input: PlsInput, analysisType: PLS_ANALYSIS): Promise<void> {
  const result = await getPlsAnalysis(input);

  const plsCols = result.tScores;
  const cols = input.table.columns;
  const featuresNames = input.features.names();
  const prefix = (analysisType === PLS_ANALYSIS.COMPUTE_COMPONENTS) ? RESULT_NAMES.PREFIX : TITLE.XSCORE;

  // add PLS components to the table
  plsCols.forEach((col, idx) => {
    col.name = cols.getUnusedName(`${prefix}${idx + 1}`);
    cols.add(col);
  });

  if (analysisType === PLS_ANALYSIS.COMPUTE_COMPONENTS)
    return;

  const view = grok.shell.tableView(input.table.name);

  // 0.1 Buffer table: features names & regression coefficients
  const buffer = DG.DataFrame.fromColumns([
    DG.Column.fromStrings(TITLE.FEATURE, featuresNames),
    result.regressionCoefficients,
  ]);

  // 0.2. Add X-Loadings
  result.xLoadings.forEach((col, idx) => {
    col.name = buffer.columns.getUnusedName(`${TITLE.XLOADING}${idx + 1}`);
    buffer.columns.add(col);
  });

  // 1. Predicted vs Reference scatter plot
  const pred = result.prediction;
  pred.name = cols.getUnusedName(`${input.predict.name} ${RESULT_NAMES.SUFFIX}`);
  cols.add(pred);
  const predictVsReferScatter = view.addViewer(DG.Viewer.scatterPlot(input.table, {
    title: TITLE.MODEL,
    xColumnName: input.predict.name,
    yColumnName: pred.name,
    showRegressionLine: true,
    markerType: DG.MARKER_TYPE.CIRCLE,
    labels: input.names?.name,
    help: LINK.MODEL,
  }));

  // 2. Regression Coefficients Bar Chart
  result.regressionCoefficients.name = TITLE.REGR_COEFS;
  const regrCoeffsBar = view.addViewer(DG.Viewer.barChart(buffer, {
    title: TITLE.REGR_COEFS,
    splitColumnName: TITLE.FEATURE,
    valueColumnName: result.regressionCoefficients.name,
    valueAggrType: DG.AGG.AVG,
    help: LINK.COEFFS,
    showValueSelector: false,
    showStackSelector: false,
  }));

  // 3. Loadings Scatter Plot
  result.xLoadings.forEach((col, idx) => col.name = `${TITLE.XLOADING}${idx + 1}`);
  const loadingsScatter = view.addViewer(DG.Viewer.scatterPlot(buffer, {
    title: TITLE.LOADINGS,
    xColumnName: `${TITLE.XLOADING}1`,
    yColumnName: `${TITLE.XLOADING}${result.xLoadings.length > 1 ? '2' : '1'}`,
    markerType: DG.MARKER_TYPE.CIRCLE,
    labels: TITLE.FEATURE,
    help: LINK.LOADINGS,
  }));

  // 4. Scores Scatter Plot

  // 4.1) data: names of all scores columns (T-scores first, then U-scores)
  const scoreNames = plsCols.map((col) => col.name);
  result.uScores.forEach((col, idx) => {
    col.name = cols.getUnusedName(`${TITLE.YSCORE}${idx + 1}`);
    cols.add(col);
    scoreNames.push(col.name);
  });

  // 4.2) create scatter
  const scoresScatter = DG.Viewer.scatterPlot(input.table, {
    title: TITLE.SCORES,
    xColumnName: plsCols[0].name,
    // in the single-component case, the first U-scores column is used;
    // the column NAME must be passed here, not the column itself
    yColumnName: (plsCols.length > 1) ? plsCols[1].name : result.uScores[0].name,
    markerType: DG.MARKER_TYPE.CIRCLE,
    labels: input.names?.name,
    help: LINK.SCORES,
    showViewerFormulaLines: true,
  });

  // 4.3) create lines & circles
  const lines = [] as DG.FormulaLine[];

  // adds a formula line restricted to [-radius, radius]
  const addLine = (formula: string, radius: number) => {
    lines.push({
      type: 'line',
      formula: formula,
      width: LINE_WIDTH,
      visible: true,
      title: ' ',
      min: -radius,
      max: radius,
      color: COLOR.CIRCLE,
    });
  };

  scoreNames.forEach((xName) => {
    const x = '${' + xName + '}';

    // axis line
    lines.push({type: 'line', formula: `${x} = 0`, width: LINE_WIDTH, visible: true, title: ' ', color: COLOR.AXIS});

    scoreNames.forEach((yName) => {
      const y = '${' + yName + '}';

      // each circle is given by two half-circles: y = sqrt(r^2 - x^2) and y = -sqrt(r^2 - x^2)
      RADIUS.forEach((r) => {
        addLine(y + ` = sqrt(${r*r} - ${x} * ${x})`, r);
        addLine(y + ` = -sqrt(${r*r} - ${x} * ${x})`, r);
      });
    });
  });

  scoresScatter.meta.formulaLines.addAll(lines);
  view.addViewer(scoresScatter);

  // 5. Explained Variances

  // 5.1) computation, source: the paper https://doi.org/10.1002/cem.2589
  // here, we use notations from this paper
  const q = result.yLoadings.getRawData();
  const p = result.xLoadings.map((col) => col.getRawData());
  const n = input.table.rowCount;
  const m = featuresNames.length;
  const A = input.components;
  const yExplVars = new Float32Array(A);
  const compNames = [] as string[];
  const xExplVars: Float32Array[] = [];
  for (let i = 0; i < m; ++i)
    xExplVars.push(new Float32Array(A));

  // the first component
  yExplVars[0] = q[0]**2 / n;
  compNames.push(`1 ${RESULT_NAMES.COMP}`);
  xExplVars.forEach((arr, idx) => {arr[0] = p[0][idx]**2 / n;});

  // the rest of components: cumulative sums
  for (let comp = 1; comp < A; ++comp) {
    yExplVars[comp] = yExplVars[comp - 1] + q[comp]**2 / n;
    xExplVars.forEach((arr, idx) => arr[comp] = arr[comp - 1] + p[comp][idx]**2 / n);
    compNames.push(`${comp + 1} ${RESULT_NAMES.COMPS}`);
  }

  // 5.2) create df
  const explVarsDF = DG.DataFrame.fromColumns([
    DG.Column.fromStrings(TITLE.COMPONENTS, compNames),
    DG.Column.fromFloat32Array(input.predict.name, yExplVars),
  ]);

  xExplVars.forEach((arr, idx) => explVarsDF.columns.add(DG.Column.fromFloat32Array(featuresNames[idx], arr)));

  // 5.3) bar chart
  const explVarsBar = view.addViewer(DG.Viewer.barChart(explVarsDF, {
    title: TITLE.EXPL_VAR,
    splitColumnName: TITLE.COMPONENTS,
    valueColumnName: input.predict.name,
    valueAggrType: DG.AGG.AVG,
    help: LINK.EXPL_VARS,
    showCategorySelector: false,
    showStackSelector: false,
  }));

  // emphasize viewers in the demo case
  if (analysisType === PLS_ANALYSIS.DEMO) {
    const pages = [predictVsReferScatter, scoresScatter, loadingsScatter, regrCoeffsBar, explVarsBar].map((viewer, idx) => {
      return {
        text: DEMO_RESULTS[idx].text,
        showNextTo: viewer.root,
      };
    });

    const wizard = ui.hints.addTextHint({title: TITLE.EXPLORE, pages: pages});
    wizard.helpUrl = LINK.MVA;
    grok.shell.windows.help.showHelp(ui.markdown(DEMO_RESULTS_MD));
  }
} // performMVA
241
+
242
/**
 * Run multivariate analysis (PLS).
 *
 * Takes the current table (`grok.shell.t`) and shows a dialog with the inputs:
 * the response column, the features, the components count and the samples names column.
 * By default, the last valid numeric column is the response and the rest of them are features.
 * Pressing the run button closes the dialog and calls `performMVA`.
 *
 * A warning is shown (and nothing else is done) if there is no table, the table is empty,
 * or it has less than two numeric columns without missing values.
 *
 * @param analysisType kind of analysis; it defines the dialog title, help link & run button tooltip
 */
export async function runMVA(analysisType: PLS_ANALYSIS): Promise<void> {
  const table = grok.shell.t;

  if (table === null) {
    grok.shell.warning(ERROR_MSG.NO_DF);
    return;
  }

  if (table.rowCount === 0) {
    grok.shell.warning(ERROR_MSG.EMPTY_DF);
    return;
  }

  const numColNames = [] as string[];
  const numCols = [] as DG.Column[];
  const strCols = [] as DG.Column[];

  // INT or FLOAT column without missing values
  const isValidNumeric = (col: DG.Column) =>
    ((col.type === DG.COLUMN_TYPE.INT) || (col.type === DG.COLUMN_TYPE.FLOAT)) &&
    (col.stats.missingValueCount === 0);

  table.columns.toList().forEach((col) => {
    if (isValidNumeric(col)) {
      numColNames.push(col.name);
      numCols.push(col);
    } else if (col.type === DG.COLUMN_TYPE.STRING)
      strCols.push(col);
  });

  if (numColNames.length === 0) {
    grok.shell.warning(ERROR_MSG.NO_COLS);
    return;
  }

  if (numColNames.length === 1) {
    grok.shell.warning(ERROR_MSG.ONE_COL);
    return;
  }

  // response (to predict)
  let predict = numCols[numCols.length - 1];
  const predictInput = ui.columnInput(TITLE.PREDICT, table, predict, () => {
    predict = predictInput.value!;
    updateInputs();
  },
  {filter: (col: DG.Column) => isValidNumeric(col)},
  );
  predictInput.setTooltip(HINT.PREDICT);

  // predictors (features); initialized with the default selection so that
  // the run button never gets an unassigned value before the delayed setup below
  let features: DG.Column[] = numCols.filter((col) => col !== predict);
  const featuresInput = ui.columnsInput(TITLE.USING, table, () => {}, {available: numColNames});
  featuresInput.onInput(() => updateInputs());
  featuresInput.setTooltip(HINT.FEATURES);

  // components count
  let components = min(numColNames.length - 1, COMPONENTS.DEFAULT as number);
  const componentsInput = ui.input.forProperty(DG.Property.fromOptions({
    name: TITLE.COMPONENTS,
    inputType: INT,
    defaultValue: components,
    //@ts-ignore
    showPlusMinus: true,
    min: COMPONENTS.MIN,
  }));
  componentsInput.onInput(() => updateInputs());
  componentsInput.setTooltip(HINT.COMPONENTS);

  let dlgTitle: string;
  let dlgHelpUrl: string;
  let dlgRunBtnTooltip: string;

  if (analysisType === PLS_ANALYSIS.COMPUTE_COMPONENTS) {
    dlgTitle = TITLE.PLS;
    dlgHelpUrl = LINK.PLS;
    dlgRunBtnTooltip = HINT.PLS;
  } else {
    dlgTitle = TITLE.MVA;
    dlgHelpUrl = LINK.MVA;
    dlgRunBtnTooltip = HINT.MVA;
  }

  // keeps the inputs consistent: the response is excluded from features,
  // the components count is clamped to [COMPONENTS.MIN, features count]
  const updateInputs = () => {
    featuresInput.value = featuresInput.value.filter((col) => col !== predict);
    features = featuresInput.value;

    componentsInput.value = min(max(componentsInput.value ?? components, COMPONENTS.MIN), features.length);
    components = componentsInput.value;

    dlg.getButton(TITLE.RUN).disabled = (features.length === 0) || (components <= 0);
  };

  // names of samples; the callback must read the names input (not the response input)
  let names = (strCols.length > 0) ? strCols[0] : null;
  const namesInputs = ui.columnInput(TITLE.NAMES, table, names, () => names = namesInputs.value,
    {filter: (col: DG.Column) => col.type === DG.COLUMN_TYPE.STRING},
  );
  namesInputs.setTooltip(HINT.NAMES);
  namesInputs.root.hidden = (strCols.length === 0) || (analysisType === PLS_ANALYSIS.COMPUTE_COMPONENTS);

  const dlg = ui.dialog({title: dlgTitle, helpUrl: dlgHelpUrl})
    .add(ui.form([predictInput, featuresInput, componentsInput, namesInputs]))
    .addButton(TITLE.RUN, async () => {
      dlg.close();

      await performMVA({
        table: table,
        features: DG.DataFrame.fromColumns(features).columns,
        predict: predict,
        components: components,
        names: names,
      }, analysisType);
    }, undefined, dlgRunBtnTooltip)
    .show({x: X_COORD, y: Y_COORD});

  // the following delay provides correct styles (see https://reddata.atlassian.net/browse/GROK-15196)
  setTimeout(() => {
    featuresInput.value = numCols.filter((col) => col !== predict);
    features = featuresInput.value;
  }, TIMEOUT);

  grok.shell.v.append(dlg.root);
} // runMVA
366
+
367
/**
 * Run multivariate analysis demo.
 *
 * Opens the cars dataframe in a table view, shows the intro text in the help window,
 * hides the context & properties panels, and launches the analysis in the demo mode.
 */
export async function runDemoMVA(): Promise<void> {
  grok.shell.addTableView(carsDataframe());

  // help window with the intro
  const helpWindow = grok.shell.windows.help;
  helpWindow.visible = true;
  helpWindow.showHelp(ui.markdown(DEMO_INTRO_MD));

  // panels that are not needed in the demo
  grok.shell.windows.showContextPanel = false;
  grok.shell.windows.showProperties = false;

  await runMVA(PLS_ANALYSIS.DEMO);
}
package/src/utils.ts CHANGED
@@ -32,12 +32,18 @@ const INCORRECT_STEPS_MES = 'steps must be non-negative.';
32
32
  const INCORRECT_CYCLES_MES = 'cycles must be positive.';
33
33
  const INCORRECT_CUTOFF_MES = 'cutoff must be non-negative.';
34
34
 
35
- // Check column type
35
/**
 * Check column type: only FLOAT and INT columns are supported.
 *
 * @param col column to check
 * @throws Error (the unsupported-type message followed by the column type) for any other type
 */
export function checkColumnType(col: DG.Column): void {
  // strict comparison: no implicit coercion when matching the type
  const isNumeric = (col.type === DG.COLUMN_TYPE.FLOAT) || (col.type === DG.COLUMN_TYPE.INT);

  if (!isNumeric)
    throw new Error(UNSUPPORTED_COLUMN_TYPE_MES + col.type);
}
40
40
 
41
/**
 * Check missing values.
 *
 * @param col column to check
 * @throws Error naming the column if it has at least one missing value
 */
export function checkMissingVals(col: DG.Column): void {
  const missingCount = col.stats.missingValueCount;

  if (missingCount > 0)
    throw new Error(`The column '${col.name}' has missing values.`);
}
46
+
41
47
  // Check dimension reducer inputs
42
48
  export function checkDimensionReducerInputs(features: DG.ColumnList, components: number): void {
43
49
  if (components < COMP_MIN)
@@ -46,8 +52,10 @@ export function checkDimensionReducerInputs(features: DG.ColumnList, components:
46
52
  if (components > features.length)
47
53
  throw new Error(COMP_EXCESS);
48
54
 
49
- for (const col of features)
55
+ for (const col of features) {
50
56
  checkColumnType(col);
57
+ checkMissingVals(col);
58
+ }
51
59
  }
52
60
 
53
61
  // Check UMAP inputs
package/wasm/EDA.js CHANGED
@@ -125,11 +125,18 @@ var partialLeastSquareRegression = {
125
125
  ref: 'componentsCount',
126
126
  value: 'data'
127
127
  }
128
+ },
129
+ yLoadings: {
130
+ type: 'newFloatColumn',
131
+ numOfRows: {
132
+ ref: 'componentsCount',
133
+ value: 'data'
134
+ }
128
135
  }
129
136
  },
130
137
  output: {
131
138
  type: 'objects',
132
- source: ['prediction', 'regressionCoefficients', 'tScores', 'uScores', 'xLoadings']
139
+ source: ['prediction', 'regressionCoefficients', 'tScores', 'uScores', 'xLoadings', 'yLoadings']
133
140
  }
134
141
  }; // partialLeastSquareRegression
135
142
 
package/wasm/EDA.wasm CHANGED
Binary file
package/wasm/PLS/PLS.h CHANGED
@@ -56,7 +56,8 @@ namespace pls {
56
56
  Float * regressionCoefficientsPtr,
57
57
  Float * predictorScoresPtr,
58
58
  Float * responceScoresPtr,
59
- Float * predictorLoadingsPtr) noexcept;
59
+ Float * predictorLoadingsPtr,
60
+ Float * responceLoadingsPtr) noexcept;
60
61
  };
61
62
 
62
63
  #endif
package/wasm/PLS/pls.cpp CHANGED
@@ -33,7 +33,8 @@ int pls::partialLeastSquareExtended(Float * predictorColumnsDataPtr,
33
33
  Float * regressionCoefficientsPtr,
34
34
  Float * predictorScoresPtr,
35
35
  Float * responceScoresPtr,
36
- Float * predictorLoadingsPtr) noexcept
36
+ Float * predictorLoadingsPtr,
37
+ Float * responceLoadingsPtr) noexcept
37
38
  {
38
39
  // check correctness of arguments
39
40
  if (componentsCount <= 0 || componentsCount > columnCount)
@@ -108,7 +109,7 @@ int pls::partialLeastSquareExtended(Float * predictorColumnsDataPtr,
108
109
  Map<Matrix<Float, Dynamic, Dynamic, ColMajor>> U(responceScoresPtr, rowCount, componentsCount);
109
110
 
110
111
  // Y-loadings, q
111
- Vector<Float, Dynamic> q(componentsCount);
112
+ Map<Vector<Float, Dynamic>> q(responceLoadingsPtr, componentsCount);
112
113
 
113
114
  // PLS1 routine auxiliry vectors
114
115
  Vector<Float, Dynamic> normTau(componentsCount);
@@ -30,7 +30,9 @@ extern "C" {
30
30
  int predictionScoresColumnsColumnCount,
31
31
  float * predictionLoadingsColumns,
32
32
  int predictionLoadingsColumnsRowCount,
33
- int predictionLoadingsColumnsColumnCount);
33
+ int predictionLoadingsColumnsColumnCount,
34
+ float * yLoadingsColumn,
35
+ int yLoadingsColumnLength);
34
36
  }
35
37
 
36
38
  #include "PLS\PLS.h"
@@ -45,30 +47,33 @@ extern "C" {
45
47
  //output: column_list tScores [new(predict.rowCount, componentsCount)]
46
48
  //output: column_list uScores [new(predict.rowCount, componentsCount)]
47
49
  //output: column_list xLoadings [new(features.columnCount, componentsCount)]
50
+ //output: column yLoadings [new(componentsCount)]
48
51
  EMSCRIPTEN_KEEPALIVE
49
- int partialLeastSquareRegression(float * predictorColumns,
52
+ int partialLeastSquareRegression(float * featuresColumns,
50
53
  int rowCount,
51
54
  int columnCount,
52
- float * responseColumn,
53
- int responceColumnLength,
55
+ float * predictColumn,
56
+ int predictColumnLength,
54
57
  int componentsCount,
55
58
  float * predictionColumn,
56
59
  int predictionColumnLength,
57
60
  float * regressionCoefficients,
58
61
  int regressionCoefficientsLength,
59
- float * predictorScoresColumns,
60
- int predictorScoresColumnsRowCount,
61
- int predictorScoresColumnsColumnCount,
62
- float * predictionScoresColumns,
63
- int predictionScoresColumnsRowCount,
64
- int predictionScoresColumnsColumnCount,
65
- float * predictionLoadingsColumns,
66
- int predictionLoadingsColumnsRowCount,
67
- int predictionLoadingsColumnsColumnCount)
62
+ float * tScoresColumns,
63
+ int tScoresColumnsRowCount,
64
+ int tScoresColumnsColumnCount,
65
+ float * uScoresColumns,
66
+ int uScoresColumnsRowCount,
67
+ int uScoresColumnsColumnCount,
68
+ float * xLoadingsColumns,
69
+ int xLoadingsColumnsRowCount,
70
+ int xLoadingsColumnsColumnCount,
71
+ float * yLoadingsColumn,
72
+ int yLoadingsColumnLength)
68
73
  {
69
- return pls::partialLeastSquareExtended(predictorColumns, rowCount, columnCount,
70
- responseColumn, componentsCount, predictionColumn, regressionCoefficients,
71
- predictorScoresColumns, predictionScoresColumns, predictionLoadingsColumns);
74
+ return pls::partialLeastSquareExtended(featuresColumns, rowCount, columnCount,
75
+ predictColumn, componentsCount, predictionColumn, regressionCoefficients,
76
+ tScoresColumns, uScoresColumns, xLoadingsColumns, yLoadingsColumn);
72
77
  }
73
78
 
74
79