@datagrok/eda 1.0.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,49 +1,50 @@
1
1
  {
2
- "name": "@datagrok/eda",
3
- "friendlyName": "EDA",
4
- "version": "1.0.3",
5
- "description": "Eploratory Data Analysis tools",
6
- "dependencies": {
7
- "datagrok-api": "latest",
8
- "cash-dom": "latest",
9
- "dayjs": "latest",
10
- "@datagrok-libraries/utils": "latest",
11
- "@datagrok-libraries/tutorials": "^1.2.1"
12
- },
13
- "author": {
14
- "name": "Viktor Makarichev",
15
- "email": "vmakarichev@datagrok.ai"
16
- },
17
- "devDependencies": {
18
- "webpack": "latest",
19
- "webpack-cli": "latest",
20
- "ts-loader": "latest",
21
- "typescript": "latest"
22
- },
23
- "scripts": {
24
- "link-all": "",
25
- "debug-eda": "webpack && grok publish",
26
- "release-eda": "webpack && grok publish --release",
27
- "build-eda": "webpack",
28
- "build": "webpack",
29
- "debug-eda-dev": "webpack && grok publish dev",
30
- "release-eda-dev": "webpack && grok publish dev --release",
31
- "debug-eda-local": "webpack && grok publish local",
32
- "release-eda-local": "webpack && grok publish local --release"
33
- },
34
- "canEdit": [
35
- "Developers"
36
- ],
37
- "canView": [
38
- "All users"
39
- ],
40
- "repository": {
41
- "type": "git",
42
- "url": "https://github.com/datagrok-ai/public.git",
43
- "directory": "packages/EDA"
44
- },
45
- "category": "Machine Learning",
46
- "sources": [
47
- "wasm/EDA.js"
48
- ]
49
- }
2
+ "name": "@datagrok/eda",
3
+ "friendlyName": "EDA",
4
+ "version": "1.1.0",
5
+ "description": "Exploratory Data Analysis Tools",
6
+ "dependencies": {
7
+ "datagrok-api": "latest",
8
+ "cash-dom": "latest",
9
+ "dayjs": "latest",
10
+ "@datagrok-libraries/utils": "latest",
11
+ "@datagrok-libraries/tutorials": "^1.3.3"
12
+ },
13
+ "author": {
14
+ "name": "Viktor Makarichev",
15
+ "email": "vmakarichev@datagrok.ai"
16
+ },
17
+ "devDependencies": {
18
+ "webpack": "latest",
19
+ "webpack-cli": "latest",
20
+ "ts-loader": "latest",
21
+ "typescript": "latest"
22
+ },
23
+ "scripts": {
24
+ "link-all": "npm link datagrok-api @datagrok-libraries/utils @datagrok-libraries/tutorials",
25
+ "debug-eda": "webpack && grok publish",
26
+ "release-eda": "webpack && grok publish --release",
27
+ "build-eda": "webpack",
28
+ "build": "webpack",
29
+ "debug-eda-dev": "webpack && grok publish dev",
30
+ "release-eda-dev": "webpack && grok publish dev --release",
31
+ "debug-eda-local": "webpack && grok publish local",
32
+ "release-eda-local": "webpack && grok publish local --release",
33
+ "build-all": "npm --prefix ./../../js-api run build && npm --prefix ./../../libraries/utils run build && npm --prefix ./../../libraries/tutorials run build && npm run build"
34
+ },
35
+ "canEdit": [
36
+ "Developers"
37
+ ],
38
+ "canView": [
39
+ "All users"
40
+ ],
41
+ "repository": {
42
+ "type": "git",
43
+ "url": "https://github.com/datagrok-ai/public.git",
44
+ "directory": "packages/EDA"
45
+ },
46
+ "category": "Machine Learning",
47
+ "sources": [
48
+ "wasm/EDA.js"
49
+ ]
50
+ }
package/src/EDAui.ts CHANGED
@@ -7,16 +7,16 @@ import * as DG from 'datagrok-api/dg';
7
7
  // Rename PCA columns
8
8
  export function renamePCAcolumns(pcaTable: DG.DataFrame): DG.DataFrame {
9
9
  for (const col of pcaTable.columns.toList())
10
- col.name = '_PCA' + col.name;
10
+ col.name = 'PCA' + col.name;
11
11
 
12
12
  return pcaTable;
13
13
  }
14
14
 
15
15
  // Predicted vs Reference scatter plot
16
- export function predictedVersusReferenceScatterPlot(reference: DG.Column, prediction: DG.Column): DG.Viewer {
16
+ export function predictedVersusReferenceScatterPlot(samplesNames: DG.Column, reference: DG.Column, prediction: DG.Column): DG.Viewer {
17
17
  prediction.name = reference.name + '(predicted)';
18
18
 
19
- let dfReferencePrediction = DG.DataFrame.fromColumns([reference, prediction]);
19
+ let dfReferencePrediction = DG.DataFrame.fromColumns([samplesNames, reference, prediction]);
20
20
  dfReferencePrediction.name = 'Reference vs. Predicted';
21
21
 
22
22
  return DG.Viewer.scatterPlot(dfReferencePrediction,
@@ -24,7 +24,8 @@ export function predictedVersusReferenceScatterPlot(reference: DG.Column, predic
24
24
  x: reference.name,
25
25
  y: prediction.name,
26
26
  showRegressionLine: true,
27
- markerType: 'circle'
27
+ markerType: 'circle',
28
+ labels: samplesNames.name
28
29
  });
29
30
  }
30
31
 
@@ -46,9 +47,9 @@ export function regressionCoefficientsBarChart(features: DG.ColumnList, regressi
46
47
  }
47
48
 
48
49
  // Scores Scatter Plot
49
- export function scoresScatterPlot(xScores: Array<DG.Column>, yScores: Array<DG.Column>): DG.Viewer {
50
+ export function scoresScatterPlot(samplesNames: DG.Column, xScores: Array<DG.Column>, yScores: Array<DG.Column>): DG.Viewer {
50
51
 
51
- let scoresColumns = [];
52
+ let scoresColumns = [samplesNames];
52
53
 
53
54
  for (let i = 0; i < xScores.length; i++) {
54
55
  xScores[i].name = `x.score.t${i+1}`;
@@ -63,12 +64,15 @@ export function scoresScatterPlot(xScores: Array<DG.Column>, yScores: Array<DG.C
63
64
  let scores = DG.DataFrame.fromColumns(scoresColumns);
64
65
  scores.name = 'Scores';
65
66
  //grok.shell.addTableView(scores);
67
+
68
+ const index = xScores.length > 1 ? 1 : 0;
66
69
 
67
70
  return DG.Viewer.scatterPlot(scores,
68
71
  { title: scores.name,
69
72
  x: xScores[0].name,
70
- y: yScores[0].name,
71
- markerType: 'circle'
73
+ y: xScores[index].name,
74
+ markerType: 'circle',
75
+ labels: samplesNames.name
72
76
  });
73
77
  }
74
78
 
@@ -100,19 +104,19 @@ export function loadingScatterPlot(features: DG.ColumnList, xLoadings: Array<DG.
100
104
  }
101
105
 
102
106
  // Add PLS visualization
103
- export function addPLSvisualization(table: DG.DataFrame, features: DG.ColumnList, predict: DG.Column, plsOutput: any): void {
107
+ export function addPLSvisualization(table: DG.DataFrame, samplesNames: DG.Column, features: DG.ColumnList, predict: DG.Column, plsOutput: any): void {
104
108
 
105
109
  let view = grok.shell.getTableView(table.name);
106
110
 
107
111
  // 1. Predicted vs Reference scatter plot
108
- view.addViewer(predictedVersusReferenceScatterPlot(predict, plsOutput[0]));
112
+ view.addViewer(predictedVersusReferenceScatterPlot(samplesNames, predict, plsOutput[0]));
109
113
 
110
114
  // 2. Regression Coefficients Bar Chart
111
- view.addViewer(regressionCoefficientsBarChart(features, plsOutput[1]));
115
+ view.addViewer(regressionCoefficientsBarChart(features, plsOutput[1]));
112
116
 
113
- // 3. Scores Scatter Plot
114
- view.addViewer(scoresScatterPlot(plsOutput[2], plsOutput[3]));
115
-
116
- // 4. Loading Scatter Plot
117
+ // 3. Loading Scatter Plot
117
118
  view.addViewer(loadingScatterPlot(features, plsOutput[4]));
119
+
120
+ // 4. Scores Scatter Plot
121
+ view.addViewer(scoresScatterPlot(samplesNames, plsOutput[2], plsOutput[3]));
118
122
  }
package/src/package.ts CHANGED
@@ -7,8 +7,8 @@ import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
7
7
 
8
8
  import {_initEDAAPI} from '../wasm/EDAAPI';
9
9
  import {computePCA, computePLS} from './EDAtools';
10
- import {renamePCAcolumns, addPLSvisualization} from './EDAui';
11
- import {demoPLS} from './demos';
10
+ import {renamePCAcolumns, addPLSvisualization, regressionCoefficientsBarChart,
11
+ scoresScatterPlot, predictedVersusReferenceScatterPlot} from './EDAui';
12
12
  import {carsDataframe, testDataForBinaryClassification} from './dataGenerators';
13
13
  import {LINEAR, RBF, POLYNOMIAL, SIGMOID,
14
14
  getTrainedModel, getPrediction, showTrainReport, getPackedModel} from './svm';
@@ -25,7 +25,7 @@ export async function init(): Promise<void> {
25
25
  await _initEDAAPI();
26
26
  }
27
27
 
28
- //top-menu: Tools | Data Science | PCA
28
+ //top-menu: Tools | Data Science | Principal Component Analysis...
29
29
  //name: PCA
30
30
  //description: Principal component analysis (PCA).
31
31
  //input: dataframe table
@@ -40,50 +40,70 @@ export async function PCA(table: DG.DataFrame, features: DG.ColumnList, componen
40
40
  return renamePCAcolumns(await computePCA(table, features, components, center, scale));
41
41
  }
42
42
 
43
- //top-menu: Tools | Data Science | PLS
44
- //name: PLS
43
+ //top-menu: Tools | Data Science | Multivariate Analysis (PLS)...
44
+ //name: Multivariate Analysis (PLS)
45
45
  //description: Partial least square regression (PLS).
46
46
  //input: dataframe table
47
+ //input: column names
47
48
  //input: column_list features
48
49
  //input: column predict
49
50
  //input: int components = 3
50
- export async function PLS(table: DG.DataFrame, features: DG.ColumnList, predict: DG.Column, components: number): Promise<void> {
51
+ export async function PLS(table: DG.DataFrame, names: DG.Column, features: DG.ColumnList,
52
+ predict: DG.Column, components: number): Promise<void>
53
+ {
51
54
  const plsResults = await computePLS(table, features, predict, components);
52
- addPLSvisualization(table, features, predict, plsResults);
55
+ addPLSvisualization(table, names, features, predict, plsResults);
53
56
  }
54
57
 
55
58
  //name: MVA demo
56
- //description: Multivariate analysis (PLS) demo.
57
- //meta.demoPath: Data analysis | Multivariate analysis
58
- export async function demoScript(): Promise<any> {
59
- const demoScript = new DemoScript('Multivariate analysis',
60
- 'Provides partial least sqaure regression analysis of the given data.');
59
+ //description: Multidimensional data analysis using partial least squares (PLS) regression. It reduces the predictors to a smaller set of uncorrelated components and performs least squares regression on them.
60
+ //meta.demoPath: Compute | Multivariate analysis
61
+ //meta.isDemoScript: True
62
+ export async function demoMultivariateAnalysis(): Promise<any> {
63
+ const demoScript = new DemoScript('Partial least squares regression',
64
+ 'Analysis of multidimensional data.');
61
65
 
62
66
  const cars = carsDataframe();
63
67
 
64
68
  const components = 3;
69
+ const names = cars.columns.byName('model');
65
70
  const predict = cars.columns.byName('price');
66
71
  const features = cars.columns.remove('price').remove('model');
67
72
  const plsOutput = await computePLS(cars, features, predict, components);
68
73
 
69
74
  const sourceCars = carsDataframe();
70
- grok.shell.addTableView(sourceCars);
71
75
  sourceCars.name = 'Cars';
72
- let view = grok.shell.getTableView(sourceCars.name);
76
+ let view: any;
77
+ let dialog: any;
73
78
 
74
79
  await demoScript
75
- .step('Run', async () => {}, {description: 'Test dataframe is loaded, and multivariate analysis is performed.', delay: 0})
76
- .step('Study', async () => {addPLSvisualization(sourceCars, features, predict, plsOutput)}, {description: 'Investigate results.', delay: 4000})
77
- .step('Try', async () => {
78
- const params = { items: 10000, features: 100, components: 3};
79
- const itemsProp = DG.Property.js('items', DG.TYPE.INT);
80
- const featuresProp = DG.Property.js('features', DG.TYPE.INT);
81
- const componentsProp = DG.Property.js('components', DG.TYPE.INT);
82
- ui.dialog({title:'Set'})
83
- .add(ui.input.form(params, [itemsProp, featuresProp, componentsProp]))
84
- .addButton('Run', async () => await demoPLS(params.items, params.features, params.components))
85
- .show();
86
- }, {description: 'Random walk test dataframe of the given size is generated, and its multivariate analysis is performed.'})
80
+ .step('Data', async () => {
81
+ grok.shell.addTableView(sourceCars);
82
+ view = grok.shell.getTableView(sourceCars.name);
83
+ }, {description: 'Each car has many features - patterns extraction is complicated.', delay: 0})
84
+ .step('Model', async () => {
85
+ dialog = ui.dialog({title:'Multivariate Analysis (PLS)'})
86
+ .add(ui.tableInput('Table', sourceCars))
87
+ .add(ui.columnsInput('Features', cars, features.toList, {available: undefined, checked: features.names()}))
88
+ .add(ui.columnInput('Names', cars, names, undefined))
89
+ .add(ui.columnInput('Predict', cars, predict, undefined))
90
+ .add(ui.intInput('Components', components, undefined))
91
+ .onOK(() => {
92
+ grok.shell.info('Multivariate analysis has been already performed.');
93
+ })
94
+ .show({x: 400, y: 140});
95
+ }, {description: 'Predict car price by its other features.', delay: 0})
96
+ .step('Regression coeffcicients', async () =>
97
+ {
98
+ dialog.close();
99
+ view.addViewer(regressionCoefficientsBarChart(features, plsOutput[1]))},
100
+ {description: 'The feature "diesel" affects the price the most.', delay: 0})
101
+ .step('Scores', async () =>
102
+ {view.addViewer(scoresScatterPlot(names, plsOutput[2], plsOutput[3]))},
103
+ {description: 'Similarities & dissimilarities: alfaromeo and mercedes are different.', delay: 0})
104
+ .step('Prediction', async () =>
105
+ {view.addViewer(predictedVersusReferenceScatterPlot(names, predict, plsOutput[0]))},
106
+ {description: 'Closer to the line means better price prediction.', delay: 0})
87
107
  .start();
88
108
  }
89
109
 
package/wasm/EDA.wasm CHANGED
Binary file
package/wasm/PLS/pls.cpp CHANGED
@@ -12,177 +12,6 @@ using namespace Eigen;
12
12
  using pls::Float;
13
13
  using pls::Double;
14
14
 
15
- /* Partial Least Square (PLS1).
16
- predictorColumnsDataPtr - data from columns that are used for prediction
17
- rowCount - number of rows
18
- columnCount - number of columns
19
- responseColumnDataPtr - data from column that is predicted, i.e. responce
20
- componentsCount - number of components that extracted in PLS
21
- predictionDataPtr - prediction obtained using PLS (its size is equal to the size of responce)
22
- regressionCoefficients - coeffcient of linear regression that are computed (their size is eqaul to the number of columns)
23
- */
24
- int pls::partialLeastSquare(Float * predictorColumnsDataPtr,
25
- const int rowCount,
26
- const int columnCount,
27
- Float * responseColumnDataPtr,
28
- const int componentsCount,
29
- Float * predictionDataPtr,
30
- Float * regressionCoefficients) noexcept
31
- {
32
- // check correctness of arguments
33
- if (componentsCount <= 0 || componentsCount > columnCount)
34
- return UNCORRECT_ARGUMENTS_ERROR;
35
-
36
- // Further, notation from the paper https://doi.org/10.1002/cem.2589 is used (see Algorithm 2).
37
-
38
- // create matrix, which is associated with predictor data
39
- Map < Matrix<Float, Dynamic, Dynamic, ColMajor>> D(predictorColumnsDataPtr, rowCount, columnCount);
40
-
41
- // compute mean value of each column of D
42
- Vector<Float, Dynamic> mu = D.colwise().mean();
43
-
44
- // mean-centered version of D
45
- Matrix<Float, Dynamic, Dynamic, ColMajor> X = D.rowwise() - mu.transpose();
46
-
47
- // vector for standard deviations of X
48
- Vector<Float, Dynamic> stdDevX(columnCount);
49
-
50
- Float rowCountSqrt = sqrt(static_cast<Float>(rowCount));
51
-
52
- // normilizing X-columns
53
- for (int i = 0; i < columnCount; i++)
54
- {
55
- stdDevX(i) = X.col(i).norm() / rowCountSqrt;
56
- X.col(i) = X.col(i) / stdDevX(i);
57
- }
58
-
59
- // create a vector, which is associated with responce or predicted data
60
- Map<Vector<Float, Dynamic>> ySource(responseColumnDataPtr, rowCount);
61
-
62
- // mean value of the responce
63
- Vector<Float, 1> meanY;
64
- meanY(0) = ySource.mean();
65
-
66
- // mean-centered version of the responce
67
- Vector<Float, Dynamic> y = ySource.rowwise() - meanY;
68
-
69
- // standard deviation
70
- Float stdDevY = sqrt(y.squaredNorm() / rowCount);
71
-
72
- // normalizing
73
- y /= stdDevY;
74
-
75
- // create a vector, which is associtated with regression coefficients
76
- Map<Vector<Float, Dynamic>> b(regressionCoefficients, columnCount);
77
-
78
- // create a vector, which is associated with prediction data
79
- Map<Vector<Float, Dynamic>> prediction(predictionDataPtr, rowCount);
80
-
81
- // PLS1 algorithm routine
82
-
83
- Matrix<Float, Dynamic, Dynamic, ColMajor> W(columnCount, componentsCount);
84
-
85
- Matrix<Float, Dynamic, Dynamic, ColMajor> P(columnCount, componentsCount);
86
-
87
- Matrix<Float, Dynamic, Dynamic, ColMajor> T(rowCount, componentsCount);
88
-
89
- Vector<Float, Dynamic> normTau(componentsCount);
90
-
91
- Vector<Float, Dynamic> q(componentsCount);
92
-
93
- Vector<Float, Dynamic> normV(componentsCount);
94
-
95
- // PLS1 algorithm: see Algorithm 2 in https://doi.org/10.1002/cem.2589
96
-
97
- Vector<Float, Dynamic> w = (X.transpose() * y);
98
-
99
- normV(0) = w.norm();
100
-
101
- // prevent division by zero
102
- if (normV(0) == static_cast<Float>(0))
103
- return METHOD_ERROR;
104
-
105
- w = w / normV(0);
106
-
107
- W.col(0) = w;
108
-
109
- Vector<Float, Dynamic> t = X * w;
110
-
111
- normTau(0) = t.norm();
112
-
113
- // prevent division by zero
114
- if (normTau(0) == static_cast<Float>(0))
115
- return METHOD_ERROR;
116
-
117
- t = t / normTau(0);
118
-
119
- T.col(0) = t;
120
-
121
- Vector<Float, Dynamic> p = X.transpose() * t;
122
-
123
- P.col(0) = p;
124
-
125
- q(0) = t.transpose() * y;
126
-
127
- for (int a = 1; a < componentsCount; a++)
128
- {
129
- w = normV(a - 1) * (w - p / normTau(a - 1));
130
-
131
- normV(a) = w.norm();
132
-
133
- // prevent division by zero
134
- if (normV(a) == static_cast<Float>(0))
135
- return METHOD_ERROR;
136
-
137
- w = w / normV(a);
138
-
139
- W.col(a) = w;
140
-
141
- t = X * w;
142
-
143
- t = t - T.leftCols(a) * (T.leftCols(a).transpose() * t);
144
-
145
- normTau(a) = t.norm();
146
-
147
- // prevent division by zero
148
- if (normTau(a) == static_cast<Float>(0))
149
- return METHOD_ERROR;
150
-
151
- t = t / normTau(a);
152
-
153
- T.col(a) = t;
154
-
155
- p = X.transpose() * t;
156
-
157
- P.col(a) = p;
158
-
159
- q(a) = t.transpose() * y;
160
- } // for a
161
-
162
- // compute coefficients of regression
163
- Matrix<Float, Dynamic, Dynamic> H = P.transpose() * W;
164
-
165
- // chech existence of inverse matrix
166
- if (H.determinant() == static_cast<Float>(0))
167
- return METHOD_ERROR;
168
-
169
- b = W * H.inverse() * q;
170
-
171
- for (int i = 0; i < columnCount; i++)
172
- b(i) *= stdDevY / stdDevX(i);
173
-
174
- // TODO: to discuss a constant term of the regression
175
- // a constant term
176
- //Vector<Float, 1> shift;
177
- //shift(0) = ySource(0) - D.row(0) * b;
178
- //q(0) - P.col(0).transpose().dot(b);
179
- //prediction = (D * b).rowwise() + shift;
180
-
181
- prediction = D * b;
182
-
183
- return NO_ERROR;
184
- } // partialLeastSquare
185
-
186
15
  /* Partial Least Square (PLS1) - extended version: scores data is provided.
187
16
  predictorColumnsDataPtr - data from columns that are used for prediction (X)
188
17
  rowCount - number of rows
@@ -358,7 +187,7 @@ int pls::partialLeastSquareExtended(Float * predictorColumnsDataPtr,
358
187
  // compute coefficients of regression
359
188
  Matrix<Float, Dynamic, Dynamic> H = P.transpose() * W;
360
189
 
361
- // chech existence of inverse matrix
190
+ // check existence of inverse matrix
362
191
  if (H.determinant() == static_cast<Float>(0))
363
192
  return METHOD_ERROR;
364
193
 
@@ -370,7 +199,7 @@ int pls::partialLeastSquareExtended(Float * predictorColumnsDataPtr,
370
199
  b(i) *= stdDevY / stdDevX(i);
371
200
 
372
201
  // compute predictions
373
- prediction = D * b;
202
+ prediction = D * b;
374
203
 
375
204
  // Remove the following comments in order to print and verify results
376
205
  //cout << "\nW_star:\n" << Wstar << endl;
package/src/demos.ts DELETED
@@ -1,38 +0,0 @@
1
- /* Do not change these import lines to match external modules in webpack configuration */
2
- import * as grok from 'datagrok-api/grok';
3
- import * as ui from 'datagrok-api/ui';
4
- import * as DG from 'datagrok-api/dg';
5
-
6
- import {computePLS} from './EDAtools';
7
- import {addPLSvisualization} from './EDAui';
8
-
9
- // Demo multivariate analysis (PLS)
10
- export async function demoPLS(rowCount: number, colCount: number, componentsCount: number): Promise<void> {
11
- // check inputs
12
- if ((rowCount <= 0) || (colCount <= 0) || (componentsCount <= 0) || (componentsCount > colCount)) {
13
- const bal = new DG.Balloon;
14
- bal.error('Incorrect inputs.');
15
- return;
16
- }
17
-
18
- // further, custom interface is provided
19
-
20
- const PREDICT = 'Reference';
21
-
22
- const bigDemoTable = grok.data.testData('random walk', rowCount, colCount);
23
- bigDemoTable.name = `${rowCount} x ${colCount}`;
24
-
25
- for (const col of bigDemoTable.columns)
26
- col.name = 'Feature ' + col.name;
27
- bigDemoTable.columns.byIndex(0).name = PREDICT;
28
-
29
- grok.shell.addTableView(bigDemoTable);
30
- let predict = bigDemoTable.columns.byName(PREDICT);
31
- let features = bigDemoTable.columns.remove(PREDICT);
32
-
33
- const plsResults = await computePLS(bigDemoTable, features, predict, componentsCount);
34
-
35
- addPLSvisualization(bigDemoTable, features, predict, plsResults);
36
-
37
- bigDemoTable.columns.add(predict);
38
- }