@datagrok/eda 1.1.3 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@datagrok/eda",
3
3
  "friendlyName": "EDA",
4
- "version": "1.1.3",
4
+ "version": "1.1.4",
5
5
  "description": "Exploratory Data Analysis Tools",
6
6
  "dependencies": {
7
7
  "@datagrok-libraries/ml": "^6.3.39",
@@ -11,6 +11,7 @@
11
11
  "cash-dom": "^8.1.1",
12
12
  "datagrok-api": "^1.16.0",
13
13
  "dayjs": "^1.11.9",
14
+ "jstat": "^1.9.6",
14
15
  "umap-js": "^1.3.3"
15
16
  },
16
17
  "author": {
package/src/eda-ui.ts CHANGED
@@ -112,7 +112,7 @@ export function loadingScatterPlot(features: DG.ColumnList, xLoadings: Array<DG.
112
112
  // Add PLS visualization
113
113
  export function addPLSvisualization(table: DG.DataFrame, samplesNames: DG.Column, features: DG.ColumnList, predict: DG.Column, plsOutput: any): void {
114
114
 
115
- let view = grok.shell.getTableView(table.name);
115
+ const view = grok.shell.getTableView(table.name);
116
116
 
117
117
  // 1. Predicted vs Reference scatter plot
118
118
  view.addViewer(predictedVersusReferenceScatterPlot(samplesNames, predict, plsOutput[0]));
@@ -126,3 +126,10 @@ export function addPLSvisualization(table: DG.DataFrame, samplesNames: DG.Column
126
126
  // 4. Scores Scatter Plot
127
127
  view.addViewer(scoresScatterPlot(samplesNames, plsOutput[2], plsOutput[3]));
128
128
  }
129
+
130
/**
 * Show one-way ANOVA results in the table view found by the name of `table`.
 *
 * Two viewers are added, in this order:
 *  1. a box plot built on a dataframe created from the `factors` and `values` columns;
 *  2. a grid showing the `anova` report dataframe.
 */
export function addOneWayAnovaVizualization(table: DG.DataFrame, factors: DG.Column, values: DG.Column, anova: DG.DataFrame) {
  const tableView = grok.shell.getTableView(table.name);

  // Box plot: distribution of the values with respect to the factor
  const boxPlotSource = DG.DataFrame.fromColumns([factors, values]);
  const boxPlot = DG.Viewer.boxPlot(boxPlotSource);
  tableView.addViewer(boxPlot);

  // Grid: ANOVA summary table
  const reportGrid = DG.Viewer.grid(anova);
  tableView.addViewer(reportGrid);
}
package/src/package.ts CHANGED
@@ -8,11 +8,13 @@ import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
8
8
  import {_initEDAAPI} from '../wasm/EDAAPI';
9
9
  import {computePCA, computePLS, computeUMAP, computeTSNE, computeSPE} from './eda-tools';
10
10
  import {addPrefixToEachColumnName, addPLSvisualization, regressionCoefficientsBarChart,
11
- scoresScatterPlot, predictedVersusReferenceScatterPlot} from './eda-ui';
11
+ scoresScatterPlot, predictedVersusReferenceScatterPlot, addOneWayAnovaVizualization} from './eda-ui';
12
12
  import {carsDataframe, testDataForBinaryClassification} from './data-generators';
13
13
  import {LINEAR, RBF, POLYNOMIAL, SIGMOID,
14
14
  getTrainedModel, getPrediction, showTrainReport, getPackedModel} from './svm';
15
15
 
16
+ import {oneWayAnova} from './stat-tools';
17
+
16
18
  export const _package = new DG.Package();
17
19
 
18
20
  //name: info
@@ -25,14 +27,14 @@ export async function init(): Promise<void> {
25
27
  await _initEDAAPI();
26
28
  }
27
29
 
28
- //top-menu: ML | Dimension Reduction | PCA...
30
+ //top-menu: ML | Dimensionality Reduction | PCA...
29
31
  //name: PCA
30
32
  //description: Principal component analysis (PCA).
31
- //input: dataframe table {category: Data}
32
- //input: column_list features {type: numerical; category: Data}
33
- //input: int components = 2 {caption: Components; category: Hyperparameters} [Number of components.]
34
- //input: bool center = false {category: Hyperparameters} [Indicating whether the variables should be shifted to be zero centered.]
35
- //input: bool scale = false {category: Hyperparameters} [Indicating whether the variables should be scaled to have unit variance.]
33
+ //input: dataframe table
34
+ //input: column_list features {type: numerical}
35
+ //input: int components = 2 {caption: Components} [Number of components.]
36
+ //input: bool center = false [Indicating whether the variables should be shifted to be zero centered.]
37
+ //input: bool scale = false [Indicating whether the variables should be scaled to have unit variance.]
36
38
  //output: dataframe result {action:join(table)}
37
39
  export async function PCA(table: DG.DataFrame, features: DG.ColumnList, components: number,
38
40
  center: boolean, scale: boolean): Promise<DG.DataFrame>
@@ -42,7 +44,7 @@ export async function PCA(table: DG.DataFrame, features: DG.ColumnList, componen
42
44
  return pcaTable;
43
45
  }
44
46
 
45
- //top-menu: ML | Dimension Reduction | UMAP...
47
+ //top-menu: ML | Dimensionality Reduction | UMAP...
46
48
  //name: UMAP
47
49
  //description: Uniform Manifold Approximation and Projection (UMAP).
48
50
  //input: dataframe table {category: Data}
@@ -59,7 +61,7 @@ export async function UMAP(table: DG.DataFrame, features: DG.ColumnList, compone
59
61
  return await computeUMAP(features, components, epochs, neighbors, minDist, spread);
60
62
  }
61
63
 
62
- //top-menu: ML | Dimension Reduction | t-SNE...
64
+ //top-menu: ML | Dimensionality Reduction | t-SNE...
63
65
  //name: t-SNE
64
66
  //description: t-distributed stochastic neighbor embedding (t-SNE).
65
67
  //input: dataframe table {category: Data}
@@ -75,7 +77,7 @@ export async function tSNE(table: DG.DataFrame, features: DG.ColumnList, compone
75
77
  return await computeTSNE(features, components, learningRate, perplexity, iterations);
76
78
  }
77
79
 
78
- //top-menu: ML | Dimension Reduction | SPE...
80
+ //top-menu: ML | Dimensionality Reduction | SPE...
79
81
  //name: SPE
80
82
  //description: Stochastic proximity embedding (SPE).
81
83
  //input: dataframe table {category: Data}
@@ -92,7 +94,7 @@ export async function SPE(table: DG.DataFrame, features: DG.ColumnList, dimensio
92
94
  return await computeSPE(features, dimension, steps, cycles, cutoff, lambda);
93
95
  }
94
96
 
95
- //top-menu: ML | Multivariate Analysis (PLS)...
97
+ //top-menu: ML | Analyze | Multivariate Analysis...
96
98
  //name: Multivariate Analysis (PLS)
97
99
  //description: Multidimensional data analysis using partial least squares (PLS) regression. It reduces the predictors to a smaller set of uncorrelated components and performs least squares regression on them.
98
100
  //input: dataframe table
@@ -318,3 +320,16 @@ export async function trainSigmoidKernelSVM(df: DG.DataFrame, predict_column: st
318
320
  export async function applySigmoidKernelSVM(df: DG.DataFrame, model: any): Promise<DG.DataFrame> {
319
321
  return await getPrediction(df, model);
320
322
  }
323
+
324
//top-menu: ML | Analysis of Variances (ANOVA)...
//name: One-way ANOVA
//description: One-way analysis of variances (ANOVA) determines whether the examined factor has a significant impact on the studied feature.
//input: dataframe table
//input: column factor {type: categorical}
//input: column feature {type: numerical}
//input: double significance = 0.05 [The significance level is a value from the interval (0, 1) specifying the criterion used for rejecting the null hypothesis.]
//input: bool validate = false [Indicates whether the normality of distribution and the equality of variances should be checked.]
export function anova(table: DG.DataFrame, factor: DG.Column, feature: DG.Column, significance: number, validate: boolean): void {
  // Compute the ANOVA report; oneWayAnova throws an Error on invalid inputs
  // (and, when `validate` is set, on failed assumption checks).
  const report = oneWayAnova(factor, feature, significance, validate);

  // Show the report next to a box plot of the feature grouped by the factor.
  addOneWayAnovaVizualization(table, factor, feature, report);
}
@@ -0,0 +1,266 @@
1
+ // Statistic tools
2
+
3
+ /* REFERENCES
4
+
5
+ [1] One-way analysis of variance, https://en.wikipedia.org/wiki/One-way_analysis_of_variance
6
+
7
+ [2] G.W. Heiman. Basic Statistics for the Behavioral Sciences, 6th ed. Wadsworth Publishing, 2010
8
+
9
+ [3] F-test of equality of variances, https://en.wikipedia.org/wiki/F-test_of_equality_of_variances
10
+
11
+ [4] S. McKillup. Statistics Explained, Cambridge University Press, 2005
12
+
13
+ */
14
+
15
+ import * as grok from 'datagrok-api/grok';
16
+ import * as ui from 'datagrok-api/ui';
17
+ import * as DG from 'datagrok-api/dg';
18
+
19
+ //@ts-ignore: no types
20
+ import * as jStat from 'jstat';
21
+
22
/** Messages of the errors thrown by the statistic tools. */
enum ERROR_MSG {
  /** The factor and values columns have different lengths. */
  NON_EQUAL_FACTORS_VALUES_SIZE = 'non-equal sizes of factor and values arrays. INPUT ERROR.',
  /** The significance level is outside the open interval (0, 1). */
  INCORRECT_SIGNIFICANCE_LEVEL = 'incorrect significance level. It must be from the interval (0, 1). INPUT ERROR.',
  /** A sample of non-positive size has been met. */
  INCORRECT_SAMPLE_SIZE = 'incorrect size of sample. DATA FACTORIZING ERROR.',
  /** The equality-of-variances check failed. */
  NON_EQUAL_VARIANCES = 'variances are not equal.',
  /** The normality check failed. */
  NON_NORMAL_DISTRIB = 'non-normal distribution.',
  /** The values column is neither int nor float. */
  UNSUPPORTED_COLUMN_TYPE = 'unsupported column type.',
  /** The categories column is not a string column. */
  INCORRECT_CATEGORIES_COL_TYPE = 'incorrect categories column type.',
  /** ANOVA requires at least two categories. */
  ANOVA_FAILED_JUST_ONE_CAT = 'ANOVA failed: there should be at least 2 categories.',
}
32
+
33
+ type SampleData = {
34
+ sum: number,
35
+ sumOfSquares: number,
36
+ size: number,
37
+ };
38
+
39
+ /** One-way ANOVA computation results. The classic notations are used (see [2], p. 290). */
40
+ type OneWayAnova = {
41
+ /** sum of squares between groups, SSbn */
42
+ ssBn: number,
43
+ /** sum of squares within groups, SSnn */
44
+ ssWn: number,
45
+ /** total sum of squares, SStot */
46
+ ssTot: number,
47
+ /** degrees of freedom between groups, DFbn */
48
+ dfBn: number,
49
+ /** degrees of freedom within groups, DFwn */
50
+ dfWn: number,
51
+ /** total degrees of freedom, DFtot */
52
+ dfTot: number,
53
+ /** mean square between groups, MSbn */
54
+ msBn: number,
55
+ /** mean square within groups, MSwn */
56
+ msWn: number,
57
+ /** Fobt, value of F-statistics, Fstat */
58
+ fStat: number,
59
+ /** p-value corresponding to F-statistics, pValue */
60
+ pValue: number,
61
+ };
62
+
63
+ /** Categorical column */
64
+ type CatCol = DG.Column<DG.COLUMN_TYPE.STRING>;
65
+
66
+ /** Numerical column */
67
+ type NumCol = DG.Column<DG.COLUMN_TYPE.FLOAT> | DG.Column<DG.COLUMN_TYPE.INT>;
68
+
69
/**
 * Build the dataframe that reports one-way ANOVA results.
 *
 * The first column holds the row labels: the three sources of variance, an empty row,
 * the hypothesis text, another empty row and the test result text. The numeric columns
 * carry values in their leading rows only; the remaining cells are null.
 */
export function getOneWayAnovaDF(anova: OneWayAnova, alpha: number, fCritical: number, hypothesis: string, testResult: string): DG.DataFrame {
  const rowLabels = ['Between groups', 'Within groups', 'Total', '', hypothesis, '', testResult];
  const rowCount = rowLabels.length;

  // Extend the leading values with nulls so that every column has `rowCount` items
  const padWithNulls = (leading: number[]): (number | null)[] => {
    const cells: (number | null)[] = [...leading];

    while (cells.length < rowCount)
      cells.push(null);

    return cells;
  };

  const sourceCol = DG.Column.fromStrings('Source of variance', rowLabels);
  const sumOfSquaresCol = DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'Sum of squares',
    padWithNulls([anova.ssBn, anova.ssWn, anova.ssTot]));
  const degreesOfFreedomCol = DG.Column.fromList(DG.COLUMN_TYPE.INT, 'Degrees of freedom',
    padWithNulls([anova.dfBn, anova.dfWn, anova.dfTot]));
  const meanSquareCol = DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'Mean square',
    padWithNulls([anova.msBn, anova.msWn]));
  const fStatCol = DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'F-statistics',
    padWithNulls([anova.fStat]));
  const pValueCol = DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'p-value',
    padWithNulls([anova.pValue]));
  const criticalCol = DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, `${alpha}-critical value`,
    padWithNulls([fCritical]));

  return DG.DataFrame.fromColumns([
    sourceCol,
    sumOfSquaresCol,
    degreesOfFreedomCol,
    meanSquareCol,
    fStatCol,
    pValueCol,
    criticalCol,
  ]);
} // getOneWayAnovaDF
81
+
82
/**
 * Check correctness of significance level.
 *
 * @param alpha significance level; it must belong to the open interval (0, 1)
 * @throws Error with the INCORRECT_SIGNIFICANCE_LEVEL message if `alpha` is outside (0, 1) or is NaN
 */
export function checkSignificanceLevel(alpha: number): void {
  // The condition is a negated "inside the interval" test: any comparison with NaN
  // is false, so NaN is rejected here, while `alpha <= 0 || alpha >= 1` would let it pass.
  if (!(alpha > 0 && alpha < 1))
    throw new Error(ERROR_MSG.INCORRECT_SIGNIFICANCE_LEVEL);
}
87
+
88
/**
 * Compute unbiased variance of a sample given by its sum, sum of squares and size.
 *
 * The applied formulas can be found in [4] (see p. 63).
 *
 * @param data sum, sum of squares and size of the sample
 * @returns the unbiased variance; 0 for a sample of size 1
 * @throws Error with the INCORRECT_SAMPLE_SIZE message if the size is not positive
 */
export function getVariance(data: SampleData): number {
  const size = data.size;

  if (size <= 0)
    throw new Error(ERROR_MSG.INCORRECT_SAMPLE_SIZE);

  // A single observation has no spread
  if (size === 1)
    return 0;

  const variance = (data.sumOfSquares - (data.sum) ** 2 / size) / (size - 1);

  // The subtraction above may produce a tiny negative number due to rounding
  // (e.g. when all the values are equal). Variance cannot be negative, so clamp it;
  // NaN is passed through unchanged.
  return (variance < 0) ? 0 : variance;
} // getVariance
101
+
102
/**
 * Check equality of variances of 2 samples. Two-tailed F-test is performed (see [3]).
 *
 * The larger variance is always put to the numerator of the F-statistic, so the result
 * does not depend on the order of the samples.
 *
 * @param xData sum, sum of squares and size of the first sample
 * @param yData sum, sum of squares and size of the second sample
 * @param alpha significance level from the interval (0, 1)
 * @returns true if the hypothesis of equal variances is not rejected
 * @throws Error if `alpha` is incorrect or a sample size is not positive
 */
function areVarsEqual(xData: SampleData, yData: SampleData, alpha: number = 0.05): boolean {
  checkSignificanceLevel(alpha);

  const xVar = getVariance(xData);
  const yVar = getVariance(yData);

  // Order the samples so that the F-statistic is >= 1; degrees of freedom follow their variances.
  const xIsLarger = (xVar >= yVar);
  const largerVar = xIsLarger ? xVar : yVar;
  const smallerVar = xIsLarger ? yVar : xVar;
  const largerDf = (xIsLarger ? xData.size : yData.size) - 1;
  const smallerDf = (xIsLarger ? yData.size : xData.size) - 1;

  // Division by zero is avoided: a zero variance is equal only to another zero variance.
  if (smallerVar === 0)
    return (largerVar === 0);

  const fStat = largerVar / smallerVar;

  // With the larger variance in the numerator only the upper tail can be hit,
  // hence alpha / 2 is used to keep the overall level of the two-tailed test at alpha.
  const fCrit = jStat.centralF.inv(1 - alpha / 2, largerDf, smallerDf);

  return (fStat < fCrit);
} // areVarsEqual
118
+
119
/**
 * Numerical data factorized with respect to a categorical column.
 *
 * For each category, the sum of the corresponding values, the sum of their squares and
 * their count are stored. These statistics are enough for the equality-of-variances check
 * and for one-way ANOVA.
 */
export class FactorizedData {
  /** Reserved for the normality check, which is not implemented yet. */
  private isNormDistrib: boolean | undefined = undefined;
  /** Names of the categories. */
  private categories: string[] = [];
  /** Sum of values for each category. */
  private sums!: Float64Array;
  /** Sum of squared values for each category. */
  private sumsOfSquares!: Float64Array;
  /** Number of values for each category. */
  private subSampleSizes!: Int32Array;
  /** Total number of values. */
  private size!: number;
  /** Number of categories. */
  private catCount!: number;

  /**
   * @param categories string column with factor levels
   * @param values int or float column with the studied feature
   * @param checkNormality reserved for the normality check (currently not used)
   * @param alpha reserved for the normality check (currently not used)
   * @throws Error if `categories` is not a string column, if the columns have different
   * lengths or if `values` is neither int nor float
   */
  constructor(categories: CatCol, values: NumCol, checkNormality: boolean = false, alpha: number = 0.05) {
    if (categories.type !== DG.COLUMN_TYPE.STRING)
      throw new Error(ERROR_MSG.INCORRECT_CATEGORIES_COL_TYPE);

    if (categories.length !== values.length)
      throw new Error(ERROR_MSG.NON_EQUAL_FACTORS_VALUES_SIZE);

    this.setStats(categories, values, checkNormality, alpha);
  }

  /** Normality check stub: always returns true (the check is not implemented yet). */
  public isNormal(): boolean | undefined {
    return true;
  }

  /**
   * Check equality of variances of factorized data.
   *
   * The sub-sample of the first category is compared with the sub-sample of each other category.
   *
   * @param alpha significance level passed to each pairwise check
   * @returns true if no pairwise check fails (always true for a single category)
   */
  public areVarsEqual(alpha: number = 0.05): boolean {
    const K = this.catCount;

    if (K === 1)
      return true;

    const first: SampleData = {sum: this.sums[0], sumOfSquares: this.sumsOfSquares[0], size: this.subSampleSizes[0]};

    for (let i = 1; i < K; ++i) {
      const current: SampleData = {sum: this.sums[i], sumOfSquares: this.sumsOfSquares[i], size: this.subSampleSizes[i]};

      if (!areVarsEqual(first, current, alpha))
        return false;
    }

    return true;
  } // areVarsEqual

  /**
   * Perform one-way ANOVA computations.
   *
   * Notations and formulas from [2] (see p. 290) are used.
   *
   * @returns sums of squares, degrees of freedom, mean squares, F-statistics and its p-value
   * @throws Error with the ANOVA_FAILED_JUST_ONE_CAT message if there are less than 2 categories
   */
  public getOneWayAnova(): OneWayAnova {
    const K = this.catCount;

    // Less than 2 categories (including empty data) leaves no "between groups" variation
    if (K < 2)
      throw new Error(ERROR_MSG.ANOVA_FAILED_JUST_ONE_CAT);

    const N = this.size;
    let sum = 0;
    let sumOfSquares = 0;
    let buf = 0; // accumulates (sum of group)^2 / (size of group)

    for (let i = 0; i < K; ++i) {
      sum += this.sums[i];
      sumOfSquares += this.sumsOfSquares[i];
      buf += this.sums[i] ** 2 / this.subSampleSizes[i];
    }

    const correction = sum ** 2 / N;

    const ssTot = sumOfSquares - correction;
    const ssBn = buf - correction;
    const ssWn = ssTot - ssBn;

    const dfBn = K - 1;
    const dfWn = N - K;
    const dfTot = N - 1;

    const msBn = ssBn / dfBn;
    // NOTE(review): dfWn is 0 when every category has exactly one value, which makes
    // msWn and the values derived from it non-finite - consider rejecting such input.
    const msWn = ssWn / dfWn;

    const fStat = msBn / msWn;

    return {
      ssBn: ssBn,
      ssWn: ssWn,
      ssTot: ssTot,
      dfBn: dfBn,
      dfWn: dfWn,
      dfTot: dfTot,
      msBn: msBn,
      msWn: msWn,
      fStat: fStat,
      // upper-tail probability of the F-distribution
      pValue: 1 - jStat.centralF.cdf(fStat, dfBn, dfWn),
    };
  } // getOneWayAnova

  /**
   * Compute sum & sums of squares with respect to factor levels.
   *
   * @throws Error with the UNSUPPORTED_COLUMN_TYPE message if `values` is neither int nor float
   */
  private setStats(categories: CatCol, values: NumCol, checkNormality: boolean = false, alpha: number = 0.05): void {
    // TODO: provide check normality feature
    const type = values.type;

    if ((type !== DG.COLUMN_TYPE.INT) && (type !== DG.COLUMN_TYPE.FLOAT))
      throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);

    const size = values.length;

    this.categories = categories.categories;
    const catCount = this.categories.length;
    this.catCount = catCount;
    this.size = size;

    // NOTE(review): raw data is used as is - presumably missing values are stored there
    // as special numbers and would distort the sums; verify against the column API.
    const vals = values.getRawData();
    // raw data of the categories column is used as the per-row index of a category
    const cats = categories.getRawData();

    // typed arrays are zero-filled on creation
    const sums = new Float64Array(catCount);
    const sumsOfSquares = new Float64Array(catCount);
    const subSampleSizes = new Int32Array(catCount);

    for (let i = 0; i < size; ++i) {
      const c = cats[i];
      sums[c] += vals[i];
      sumsOfSquares[c] += vals[i] ** 2;
      ++subSampleSizes[c];
    }

    this.sums = sums;
    this.sumsOfSquares = sumsOfSquares;
    this.subSampleSizes = subSampleSizes;
  } // setStats
} // FactorizedData
244
+
245
/**
 * Perform one-way analysis of variances.
 *
 * The values are factorized with respect to the categories, the ANOVA statistics are computed,
 * and the F-statistics is compared with the critical value of the F-distribution at level `alpha`.
 *
 * @param categores column with factor levels
 * @param values column with the studied feature
 * @param alpha significance level from the interval (0, 1)
 * @param validate if true, equality of variances and normality are checked before the analysis
 * @returns dataframe with the ANOVA table, the null hypothesis and the test result
 * @throws Error if `alpha` is incorrect, if the data cannot be factorized or analyzed,
 * or if a requested validation check fails
 */
export function oneWayAnova(categores: CatCol, values: NumCol, alpha: number = 0.05, validate: boolean = false): DG.DataFrame {
  checkSignificanceLevel(alpha);

  const groups = new FactorizedData(categores, values, validate, alpha);

  // Optional check of the ANOVA assumptions: variances first, then normality
  if (validate && !groups.areVarsEqual(alpha))
    throw new Error(ERROR_MSG.NON_EQUAL_VARIANCES);

  if (validate && !groups.isNormal())
    throw new Error(ERROR_MSG.NON_NORMAL_DISTRIB);

  const stats = groups.getOneWayAnova();
  const criticalF = jStat.centralF.inv(1 - alpha, stats.dfBn, stats.dfWn);

  // The null hypothesis is rejected when the F-statistics exceeds the critical value
  const verdict = (stats.fStat > criticalF) ? 'REJECTED.' : 'FAILED TO REJECT.';

  return getOneWayAnovaDF(
    stats,
    alpha,
    criticalF,
    `THE NULL HYPOTHESIS: the "${categores.name}" factor does not produce a significant difference in the "${values.name}" feature.`,
    `Test result: ${verdict}`,
  );
} // oneWayAnova
package/src/utils.ts CHANGED
@@ -14,7 +14,7 @@ const MAX_ELEMENTS_COUNT = 100000000;
14
14
 
15
15
  // Error messages
16
16
  const COMP_POSITVE_MES = 'components must be positive.';
17
- const COMP_EXCESS = 'components must not be greater than feautures count.';
17
+ const COMP_EXCESS = 'components must not be greater than features count.';
18
18
  const INCORERRECT_MIN_MAX_MES = 'min must be less than max.';
19
19
  const INCORERRECT_FEATURES_MES = 'features must be positive.';
20
20
  const INCORERRECT_SAMPLES_MES = 'samples must be positive.';