@datagrok/eda 1.2.1 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -4
- package/dist/111.js +1 -1
- package/dist/111.js.map +1 -1
- package/dist/153.js +1 -1
- package/dist/153.js.map +1 -1
- package/dist/234.js +1 -1
- package/dist/234.js.map +1 -1
- package/dist/260.js +1 -1
- package/dist/260.js.map +1 -1
- package/dist/348.js +1 -1
- package/dist/348.js.map +1 -1
- package/dist/377.js +1 -1
- package/dist/377.js.map +1 -1
- package/dist/412.js +1 -1
- package/dist/412.js.map +1 -1
- package/dist/531.js +1 -1
- package/dist/531.js.map +1 -1
- package/dist/583.js +1 -1
- package/dist/583.js.map +1 -1
- package/dist/603.js +1 -1
- package/dist/603.js.map +1 -1
- package/dist/656.js +1 -1
- package/dist/656.js.map +1 -1
- package/dist/682.js +1 -1
- package/dist/682.js.map +1 -1
- package/dist/705.js +1 -1
- package/dist/705.js.map +1 -1
- package/dist/727.js +1 -1
- package/dist/727.js.map +1 -1
- package/dist/763.js +1 -1
- package/dist/763.js.map +1 -1
- package/dist/778.js +1 -1
- package/dist/778.js.map +1 -1
- package/dist/783.js +1 -1
- package/dist/783.js.map +1 -1
- package/dist/793.js +1 -1
- package/dist/793.js.map +1 -1
- package/dist/91.js +1 -1
- package/dist/91.js.map +1 -1
- package/dist/950.js +1 -1
- package/dist/950.js.map +1 -1
- package/dist/980.js +1 -1
- package/dist/980.js.map +1 -1
- package/dist/990.js +1 -1
- package/dist/990.js.map +1 -1
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +11 -10
- package/src/anova/anova-tools.ts +308 -0
- package/src/anova/anova-ui.ts +258 -0
- package/src/eda-ui.ts +0 -9
- package/src/global.d.ts +13 -0
- package/src/missing-values-imputation/ui-constants.ts +2 -0
- package/src/missing-values-imputation/ui.ts +7 -7
- package/src/package-test.ts +7 -1
- package/src/package.ts +6 -12
- package/src/tests/anova-tests.ts +87 -0
- package/src/tests/linear-methods-tests.ts +1 -1
package/package.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@datagrok/eda",
|
|
3
3
|
"friendlyName": "EDA",
|
|
4
|
-
"version": "1.2.
|
|
4
|
+
"version": "1.2.3",
|
|
5
5
|
"description": "Exploratory Data Analysis Tools",
|
|
6
6
|
"dependencies": {
|
|
7
7
|
"@datagrok-libraries/math": "^1.2.0",
|
|
8
8
|
"@datagrok-libraries/ml": "^6.7.0",
|
|
9
|
-
"@datagrok-libraries/tutorials": "^1.4.
|
|
9
|
+
"@datagrok-libraries/tutorials": "^1.4.2",
|
|
10
10
|
"@datagrok-libraries/utils": "^4.3.0",
|
|
11
11
|
"@keckelt/tsne": "^1.0.2",
|
|
12
12
|
"@webgpu/types": "^0.1.40",
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
"jstat": "^1.9.6",
|
|
17
17
|
"source-map-loader": "^4.0.1",
|
|
18
18
|
"umap-js": "^1.3.3",
|
|
19
|
-
"worker-loader": "
|
|
19
|
+
"worker-loader": "^3.0.8"
|
|
20
20
|
},
|
|
21
21
|
"author": {
|
|
22
22
|
"name": "Viktor Makarichev",
|
|
@@ -25,14 +25,14 @@
|
|
|
25
25
|
"devDependencies": {
|
|
26
26
|
"@typescript-eslint/eslint-plugin": "^5.32.0",
|
|
27
27
|
"@typescript-eslint/parser": "^5.32.0",
|
|
28
|
-
"css-loader": "
|
|
28
|
+
"css-loader": "^7.1.2",
|
|
29
29
|
"eslint": "^8.21.0",
|
|
30
30
|
"eslint-config-google": "^0.14.0",
|
|
31
|
-
"style-loader": "
|
|
32
|
-
"ts-loader": "
|
|
33
|
-
"typescript": "
|
|
34
|
-
"webpack": "
|
|
35
|
-
"webpack-cli": "
|
|
31
|
+
"style-loader": "^4.0.0",
|
|
32
|
+
"ts-loader": "^9.5.1",
|
|
33
|
+
"typescript": "^5.6.3",
|
|
34
|
+
"webpack": "^5.95.0",
|
|
35
|
+
"webpack-cli": "^5.1.4"
|
|
36
36
|
},
|
|
37
37
|
"scripts": {
|
|
38
38
|
"link-all": "npm link datagrok-api @datagrok-libraries/utils @datagrok-libraries/tutorials",
|
|
@@ -40,6 +40,7 @@
|
|
|
40
40
|
"release-eda": "webpack && grok publish --release",
|
|
41
41
|
"build-eda": "webpack",
|
|
42
42
|
"build": "webpack",
|
|
43
|
+
"test": "grok test",
|
|
43
44
|
"debug-eda-dev": "webpack && grok publish dev",
|
|
44
45
|
"release-eda-dev": "webpack && grok publish dev --release",
|
|
45
46
|
"debug-eda-local": "webpack && grok publish local",
|
|
@@ -92,4 +93,4 @@
|
|
|
92
93
|
}
|
|
93
94
|
}
|
|
94
95
|
}
|
|
95
|
-
}
|
|
96
|
+
}
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
// Analysis of Variances (ANOVA): computations
|
|
2
|
+
|
|
3
|
+
/* REFERENCES
|
|
4
|
+
|
|
5
|
+
[1] One-way analysis of variance, https://en.wikipedia.org/wiki/One-way_analysis_of_variance
|
|
6
|
+
|
|
7
|
+
[2] G.W. Heiman. Basic Statistics for the Behavioral Sciences, 6th ed. Wadsworth Publishing, 2010
|
|
8
|
+
|
|
9
|
+
[3] F-test of equality of variances, https://en.wikipedia.org/wiki/F-test_of_equality_of_variances
|
|
10
|
+
|
|
11
|
+
[4] S. McKillup. Statistics Explained, Cambridge University Press, 2005
|
|
12
|
+
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import * as grok from 'datagrok-api/grok';
|
|
16
|
+
import * as ui from 'datagrok-api/ui';
|
|
17
|
+
import * as DG from 'datagrok-api/dg';
|
|
18
|
+
|
|
19
|
+
//@ts-ignore: no types
|
|
20
|
+
import * as jStat from 'jstat';
|
|
21
|
+
|
|
22
|
+
import {getNullValue} from '../missing-values-imputation/knn-imputer';
|
|
23
|
+
|
|
24
|
+
enum ERROR_MSG {
|
|
25
|
+
NON_EQUAL_FACTORS_VALUES_SIZE = 'non-equal sizes of factor and values arrays',
|
|
26
|
+
INCORRECT_SIGNIFICANCE_LEVEL = 'incorrect significance level',
|
|
27
|
+
NON_EQUAL_VARIANCES = 'non-equal variances',
|
|
28
|
+
NON_NORMAL_DISTRIB = 'non-normal distribution',
|
|
29
|
+
UNSUPPORTED_COLUMN_TYPE = 'unsupported feature column type',
|
|
30
|
+
INCORRECT_CATEGORIES_COL_TYPE = 'incorrect categories column type',
|
|
31
|
+
SINGLE_FACTOR = 'single category features',
|
|
32
|
+
CATS_EQUAL_SIZE = 'single value in each category',
|
|
33
|
+
NO_FEATURE_VARIATION = 'no feature variation',
|
|
34
|
+
NO_FEATURE_VARIATION_WITHIN_GROUPS = 'no feature variation within groups',
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
type SampleData = {
|
|
38
|
+
sum: number,
|
|
39
|
+
sumOfSquares: number,
|
|
40
|
+
size: number,
|
|
41
|
+
};
|
|
42
|
+
|
|
43
|
+
/** One-way ANOVA computation results. The classic notations are used (see [2], p. 290). */
|
|
44
|
+
type OneWayAnova = {
|
|
45
|
+
/** sum of squares between groups, SSbn */
|
|
46
|
+
ssBn: number,
|
|
47
|
+
/** sum of squares within groups, SSwn */
|
|
48
|
+
ssWn: number,
|
|
49
|
+
/** total sum of squares, SStot */
|
|
50
|
+
ssTot: number,
|
|
51
|
+
/** degrees of freedom between groups, DFbn */
|
|
52
|
+
dfBn: number,
|
|
53
|
+
/** degrees of freedom within groups, DFwn */
|
|
54
|
+
dfWn: number,
|
|
55
|
+
/** total degrees of freedom, DFtot */
|
|
56
|
+
dfTot: number,
|
|
57
|
+
/** mean square between groups, MSbn */
|
|
58
|
+
msBn: number,
|
|
59
|
+
/** mean square within groups, MSwn */
|
|
60
|
+
msWn: number,
|
|
61
|
+
/** Fobt, value of F-statistics, Fstat */
|
|
62
|
+
fStat: number,
|
|
63
|
+
/** p-value corresponding to F-statistics, pValue */
|
|
64
|
+
pValue: number,
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
/** One-way ANOVA report */
|
|
68
|
+
export type OneWayAnovaReport = {
|
|
69
|
+
anovaTable: OneWayAnova,
|
|
70
|
+
fCritical: number,
|
|
71
|
+
significance: number,
|
|
72
|
+
};
|
|
73
|
+
|
|
74
|
+
/** Categorical column */
|
|
75
|
+
type CatCol = DG.Column<DG.COLUMN_TYPE.STRING | DG.COLUMN_TYPE.BOOL>;
|
|
76
|
+
|
|
77
|
+
/** Numerical column */
|
|
78
|
+
type NumCol = DG.Column<DG.COLUMN_TYPE.FLOAT> | DG.Column<DG.COLUMN_TYPE.INT>;
|
|
79
|
+
|
|
80
|
+
/**
 * Validate a significance level.
 *
 * @param alpha significance level; it must lie strictly between 0 and 1
 * @throws Error with the INCORRECT_SIGNIFICANCE_LEVEL message when alpha is NaN
 * or lies outside the open interval (0, 1)
 */
export function checkSignificanceLevel(alpha: number) {
  // The check is written in its positive form on purpose: any comparison with NaN
  // is false, so "alpha <= 0 || alpha >= 1" would silently accept NaN.
  if (!((alpha > 0) && (alpha < 1)))
    throw new Error(ERROR_MSG.INCORRECT_SIGNIFICANCE_LEVEL);
}
|
|
85
|
+
|
|
86
|
+
/**
 * Return the unbiased (sample) variance computed from aggregated sample data.
 *
 * @param data sum, sum of squares and size of the sample
 * @returns the variance; 0 is returned for samples with at most one element
 */
export function getVariance(data: SampleData): number {
  // Computational formula for the variance, see [4], p. 63
  const {sum, sumOfSquares, size: n} = data;

  if (n <= 1)
    return 0;

  const meanCorrection = sum ** 2 / n;

  return (sumOfSquares - meanCorrection) / (n - 1);
} // getVariance
|
|
96
|
+
|
|
97
|
+
/**
 * Check equality of variances of 2 samples using the two-tailed F-test (see [3]).
 *
 * @param xData aggregated data (sum, sum of squares, size) of the first sample
 * @param yData aggregated data (sum, sum of squares, size) of the second sample
 * @param alpha significance level, must lie strictly between 0 and 1
 * @returns true if the hypothesis of equal variances is not rejected
 * @throws Error if the significance level is incorrect
 */
function areVarsEqual(xData: SampleData, yData: SampleData, alpha: number): boolean {
  checkSignificanceLevel(alpha);

  const xVar = getVariance(xData);
  const yVar = getVariance(yData);

  // The F-ratio is undefined for a zero variance: samples are treated as equal only if both are constant.
  if ((xVar === 0) || (yVar === 0))
    return (xVar === yVar);

  // Put the larger variance into the numerator, so that the result does not depend
  // on the order of the samples and a much larger variance of EITHER sample is detected.
  const xIsLarger = xVar >= yVar;
  const fStat = xIsLarger ? xVar / yVar : yVar / xVar;
  const dfNum = (xIsLarger ? xData.size : yData.size) - 1;
  const dfDen = (xIsLarger ? yData.size : xData.size) - 1;

  // Two-tailed test: alpha is split between the tails.
  const fCrit = jStat.centralF.inv(1 - alpha / 2, dfNum, dfDen);

  return (fStat < fCrit);
} // areVarsEqual
|
|
113
|
+
|
|
114
|
+
/**
 * Aggregated statistics of a numerical feature split by levels of a categorical factor.
 * For each category it stores the sum, the sum of squares and the number of non-missing values.
 */
export class FactorizedData {
  /** Sum of feature values for each category */
  private sums!: Float64Array;
  /** Sum of squared feature values for each category */
  private sumsOfSquares!: Float64Array;
  /** Number of non-missing items in each category */
  private subSampleSizes!: Int32Array;
  /** Total number of rows */
  private size!: number;
  /** Number of categories (as passed by the caller) */
  private catCount!: number;
  /** Number of rows skipped due to a missing value */
  private nullsCount = 0;

  /**
   * @param categories column with factor levels
   * @param values column with feature values
   * @param uniqueCount number of categories of the factor
   * @throws Error if the columns have different lengths, the feature has no variation
   * or the feature column type is unsupported
   */
  constructor(categories: CatCol, values: NumCol, uniqueCount: number) {
    if (categories.length !== values.length)
      throw new Error(ERROR_MSG.NON_EQUAL_FACTORS_VALUES_SIZE);

    if (values.stats.stdev > 0)
      this.setStats(categories, values, uniqueCount);
    else
      throw new Error(ERROR_MSG.NO_FEATURE_VARIATION);
  }

  /**
   * Check equality of variances of factorized data.
   * The first non-empty category is compared with each of the other non-empty ones.
   */
  public areVarsEqual(alpha: number): boolean {
    // Categories without items (e.g. all their feature values are missing) carry no information
    // about the variance; they are skipped here in the same way as in getOneWayAnova.
    const samples: SampleData[] = [];

    for (let i = 0; i < this.catCount; ++i) {
      if (this.subSampleSizes[i] !== 0)
        samples.push({sum: this.sums[i], sumOfSquares: this.sumsOfSquares[i], size: this.subSampleSizes[i]});
    }

    // Nothing to compare
    if (samples.length < 2)
      return true;

    const first = samples[0];

    for (let i = 1; i < samples.length; ++i) {
      if (!areVarsEqual(first, samples[i], alpha))
        return false;
    }

    return true;
  } // areVarsEqual

  /**
   * Perform one-way ANOVA computations.
   *
   * @throws Error if there is a single non-empty category, each category contains a single value
   * or there is no variation within groups
   */
  public getOneWayAnova(): OneWayAnova {
    // Further, notations and formulas from (see [2], p. 290) are used.

    let sum = 0;
    let sumOfSquares = 0;
    let buf = 0;
    let K = this.catCount;
    let nonEmptyCategories = K;

    for (let i = 0; i < K; ++i) {
      if (this.subSampleSizes[i] !== 0) {
        sum += this.sums[i];
        sumOfSquares += this.sumsOfSquares[i];
        buf += this.sums[i] ** 2 / this.subSampleSizes[i];
      } else
        --nonEmptyCategories;
    }

    // Only non-empty categories contribute to the degrees of freedom
    K = nonEmptyCategories;

    if (K === 1)
      throw new Error(ERROR_MSG.SINGLE_FACTOR);

    const N = this.size - this.nullsCount;
    if (N === K)
      throw new Error(ERROR_MSG.CATS_EQUAL_SIZE);

    const ssTot = sumOfSquares - sum ** 2 / N;
    const ssBn = buf - sum ** 2 / N;
    const ssWn = ssTot - ssBn;

    // F-statistics is undefined in this case
    if (ssWn === 0)
      throw new Error(ERROR_MSG.NO_FEATURE_VARIATION_WITHIN_GROUPS);

    const dfBn = K - 1;
    const dfWn = N - K;
    const dfTot = N - 1;

    const msBn = ssBn / dfBn;
    const msWn = ssWn / dfWn;

    const fStat = msBn / msWn;

    return {
      ssBn: ssBn,
      ssWn: ssWn,
      ssTot: ssTot,
      dfBn: dfBn,
      dfWn: dfWn,
      dfTot: dfTot,
      msBn: msBn,
      msWn: msWn,
      fStat: fStat,
      pValue: 1 - jStat.centralF.cdf(fStat, dfBn, dfWn),
    };
  } // getOneWayAnova

  /** Compute sum & sums of squares with respect to factor levels. */
  private setStats(categories: CatCol, features: NumCol, uniqueCount: number): void {
    const type = features.type;
    const size = features.length;
    const featuresNull = getNullValue(features);

    switch (type) {
    case DG.COLUMN_TYPE.INT:
    case DG.COLUMN_TYPE.FLOAT: {
      const catCount = uniqueCount;
      this.catCount = catCount;
      this.size = size;

      const vals = features.getRawData();
      const cats = categories.getRawData();

      const sums = new Float64Array(catCount).fill(0);
      const sumsOfSquares = new Float64Array(catCount).fill(0);
      const subSampleSizes = new Int32Array(catCount).fill(0);

      let cat: number;

      if (categories.type == DG.COLUMN_TYPE.BOOL) {
        // Raw data of a bool column is read bit by bit: each raw element packs
        // 8 * BYTES_PER_ELEMENT values, and the extracted bit (0 or 1) is the category index.
        let catIdx = 0;
        let shift = 0;
        let packed = cats[0];
        const MAX_SHIFT = 8 * cats.BYTES_PER_ELEMENT - 1;

        for (let i = 0; i < size; ++i) {
          cat = 1 & (packed >> shift);

          if (vals[i] !== featuresNull) {
            sums[cat] += vals[i];
            sumsOfSquares[cat] += vals[i] ** 2;
            ++subSampleSizes[cat];
          } else
            ++this.nullsCount;

          ++shift;

          // All bits of the current element are processed: go to the next one
          if (shift > MAX_SHIFT) {
            shift = 0;
            ++catIdx;
            packed = cats[catIdx];
          }
        }
      } else {
        // Raw values are used as category indices here; -1 is a sentinel that matches no index
        const categoriesNull = categories.stats.missingValueCount > 0 ? getNullValue(categories) : -1;

        for (let i = 0; i < size; ++i) {
          cat = cats[i];

          // Skip rows with a missing category or feature value
          if ((cat === categoriesNull) || (vals[i] === featuresNull)) {
            ++this.nullsCount;
            continue;
          }

          sums[cat] += vals[i];
          sumsOfSquares[cat] += vals[i] ** 2;
          ++subSampleSizes[cat];
        }
      }

      this.sums = sums;
      this.sumsOfSquares = sumsOfSquares;
      this.subSampleSizes = subSampleSizes;

      break;
    }

    default:
      throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
    }
  } // setStats
} // FactorizedData
|
|
283
|
+
|
|
284
|
+
/**
 * Run one-way analysis of variances for a feature factorized by a categorical column.
 *
 * @param categores column with factor levels
 * @param values column with feature values
 * @param alpha significance level, must lie strictly between 0 and 1
 * @param toValidate if true, equality of variances is checked before the analysis
 * @returns ANOVA table, critical value of F-statistics and the significance level used
 * @throws Error if the significance level is incorrect, the factor has less than 2 levels,
 * the validation fails or the computations cannot be performed
 */
export function oneWayAnova(categores: CatCol, values: NumCol, alpha: number,
  toValidate: boolean = true): OneWayAnovaReport {
  checkSignificanceLevel(alpha);

  const levelsCount = categores.stats.uniqueCount;
  if (levelsCount < 2)
    throw new Error(ERROR_MSG.SINGLE_FACTOR);

  const data = new FactorizedData(categores, values, levelsCount);

  if (toValidate && !data.areVarsEqual(alpha))
    throw new Error(ERROR_MSG.NON_EQUAL_VARIANCES);

  const anovaTable = data.getOneWayAnova();
  const fCritical = jStat.centralF.inv(1 - alpha, anovaTable.dfBn, anovaTable.dfWn);

  return {anovaTable, fCritical, significance: alpha};
} // oneWayAnova
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
// Analysis of Variances (ANOVA): UI
|
|
2
|
+
|
|
3
|
+
import * as grok from 'datagrok-api/grok';
|
|
4
|
+
import * as ui from 'datagrok-api/ui';
|
|
5
|
+
import * as DG from 'datagrok-api/dg';
|
|
6
|
+
|
|
7
|
+
import {oneWayAnova, OneWayAnovaReport} from './anova-tools';
|
|
8
|
+
|
|
9
|
+
const FEATURE_TYPES = [DG.COLUMN_TYPE.INT, DG.COLUMN_TYPE.FLOAT] as string[];
|
|
10
|
+
const FACTOR_TYPES = [DG.COLUMN_TYPE.STRING, DG.COLUMN_TYPE.BOOL] as string[];
|
|
11
|
+
|
|
12
|
+
const ANOVA_HELP_URL = '/help/explore/anova';
|
|
13
|
+
|
|
14
|
+
/** Significance const */
|
|
15
|
+
enum SIGNIFICANCE {
|
|
16
|
+
DEFAULT = 0.05,
|
|
17
|
+
MIN = 0.01,
|
|
18
|
+
MAX = 0.99,
|
|
19
|
+
INFIMUM = 0,
|
|
20
|
+
SUPREMUM = 1,
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
/** Default names */
|
|
24
|
+
enum DEFAULT {
|
|
25
|
+
FACTOR = 'race',
|
|
26
|
+
FEATURE = 'age',
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
/**
 * Show one-way ANOVA results: a box plot docked to the table view, the F-test summary
 * docked below it and a grid with the ANOVA table.
 *
 * @param df analyzed dataframe
 * @param factorsName name of the factor column
 * @param featuresName name of the feature column
 * @param report one-way ANOVA report
 */
function addVizualization(df: DG.DataFrame, factorsName: string, featuresName: string, report: OneWayAnovaReport) {
  const fStat = report.anovaTable.fStat;
  const fCritical = report.fCritical;
  const isSignificant = fStat > fCritical;

  let shortConclusion: string;
  if (isSignificant)
    shortConclusion = `"${factorsName}" affects the "${featuresName}"`;
  else
    shortConclusion = `"${factorsName}" doesn't affect the "${featuresName}"`;

  // Box plot of the feature by the factor levels
  const view = grok.shell.getTableView(df.name);
  const boxPlotOptions = {
    categoryColumnNames: [factorsName],
    valueColumnName: featuresName,
    showPValue: false,
    showStatistics: false,
    description: shortConclusion,
    showColorSelector: false,
  };
  const boxPlot = DG.Viewer.boxPlot(df, boxPlotOptions);
  const boxPlotNode = view.dockManager.dock(boxPlot.root, DG.DOCK_TYPE.RIGHT, null, 'ANOVA');

  // Null hypothesis
  const hypoMd = ui.markdown(`**H0:** the "${factorsName}"
factor does not produce a significant difference in the "${featuresName}" feature.`);
  ui.tooltip.bind(hypoMd, 'Null hypothesis');

  // Test result
  const verdict = isSignificant ? 'means differ significantly.' : 'means do not differ significantly.';
  const testMd = ui.markdown(`**Test result:** ${verdict}`);

  // Explanation of the test result shown in the tooltip
  const fStatStr = fStat.toFixed(2);
  const fCriticalStr = fCritical.toFixed(2);
  let tooltipDiv: HTMLElement;

  if (isSignificant) {
    tooltipDiv = ui.divV([
      ui.p(`Reject the null hypothesis, since F > F-critical:
${fStatStr} > ${fCriticalStr}.`),
      ui.h2('There is a significant difference among sample averages.'),
    ]);
  } else {
    tooltipDiv = ui.divV([
      ui.p(`Fail to reject the null hypothesis, since F < F-critical:
${fStatStr} < ${fCriticalStr}.`),
      ui.h2('There is no significant difference among sample averages.'),
    ]);
  }

  ui.tooltip.bind(testMd, () => tooltipDiv);

  const learnMoreLink = ui.link(
    'Learn more',
    () => window.open('https://en.wikipedia.org/wiki/F-test', '_blank'),
    'Click to open in a new tab',
  );

  const divResult = ui.divV([hypoMd, testMd, learnMoreLink]);
  divResult.style.marginLeft = '20px';

  // F-test summary goes below the box plot, ANOVA table fills the same node
  const hypoNode = grok.shell.dockManager.dock(divResult, DG.DOCK_TYPE.DOWN, boxPlotNode, 'F-test', 0.3);
  const reportViewer = getAnovaGrid(report);
  grok.shell.dockManager.dock(reportViewer.root, DG.DOCK_TYPE.FILL, hypoNode, 'Analysis');
}
|
|
86
|
+
|
|
87
|
+
/**
 * Create a grid with one-way ANOVA results.
 * Column headers get tooltips explaining the notation.
 *
 * @param report one-way ANOVA report
 * @returns grid viewer with the ANOVA table
 */
function getAnovaGrid(report: OneWayAnovaReport): DG.Grid {
  const anova = report.anovaTable;

  // Cells that have no meaning for a given source of variance are left empty (null)
  const grid = DG.Viewer.grid(DG.DataFrame.fromColumns([
    DG.Column.fromStrings('Source of variance', ['Between groups', 'Within groups', 'Total']),
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'SS', [anova.ssBn, anova.ssWn, anova.ssTot]),
    DG.Column.fromList(DG.COLUMN_TYPE.INT, 'DF', [anova.dfBn, anova.dfWn, anova.dfTot]),
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'MS', [anova.msBn, anova.msWn, null]),
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'F', [anova.fStat, null, null]),
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'F-critical', [report.fCritical, null, null]),
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'p-value', [anova.pValue, null, null]),
  ]));

  const tooltip = new Map<string, string>([
    ['Source of variance', 'List of the explored variation sources'],
    ['SS', 'Sum of squares (SS)'],
    ['DF', 'Degrees of freedom (DF)'],
    ['MS', 'Mean square (MS)'],
    ['F', 'F-statistics (F)'],
    ['F-critical', `${report.significance}-critical value of F-statistics (F)`],
    ['p-value', `Probability to obtain F-statistics (F) greater than the actual observation.`],
  ]);

  grid.onCellTooltip(function(cell, x, y) {
    if (!cell.isColHeader)
      return;

    // A header cell may have no table column behind it, and a column may have no description:
    // in both cases the default tooltip handling is kept instead of throwing.
    const name = cell.tableColumn?.name;
    const text = (name !== undefined) ? tooltip.get(name) : undefined;

    if (text === undefined)
      return;

    ui.tooltip.show(ui.divV([ui.p(text)]), x, y);
    return true;
  });

  grid.helpUrl = ANOVA_HELP_URL;

  return grid;
} // getAnovaGrid
|
|
122
|
+
|
|
123
|
+
/**
 * Build a warning element explaining why ANOVA cannot be performed.
 *
 * @param msg the reason of the failure
 * @returns div with the message and a link to the description of ANOVA assumptions
 */
function getWarning(msg: string): HTMLElement {
  const explanation = ui.markdown(`ANOVA cannot be performed:

${msg}`);

  const learnMoreLink = ui.link(
    'Learn more',
    () => window.open('https://en.wikipedia.org/wiki/Analysis_of_variance#Assumptions', '_blank'),
    'Click to open in a new tab',
  );

  return ui.divV([explanation, learnMoreLink]);
}
|
|
135
|
+
|
|
136
|
+
/**
 * Run one-way analysis of variances for the current dataframe.
 *
 * Shows a dialog for selecting the factor column, the feature column and the significance level.
 * On "Run", ANOVA is performed and its results are visualized; if the analysis cannot be
 * performed, a warning is shown and a box plot is added to the view.
 */
export function runOneWayAnova(): void {
  /** current dataframe */
  const df: DG.DataFrame | null = grok.shell.t;

  if (df === null) {
    grok.shell.warning('No dataframe is opened');
    return;
  }

  // Collect names of columns that can be used as factors & features
  const columns = df.columns;
  const factorColNames = [] as string[];
  const featureColNames = [] as string[];

  for (const col of columns) {
    if (FEATURE_TYPES.includes(col.type))
      featureColNames.push(col.name);
    else if (FACTOR_TYPES.includes(col.type))
      factorColNames.push(col.name);
  }

  const factorColsCount = factorColNames.length;
  if (factorColsCount < 1) {
    grok.shell.warning(ui.markdown(`No acceptable factor columns:

- type: ${FACTOR_TYPES.join(', ')}
- at least two categories`,
    ));
    return;
  }

  // The default factor column is used only if it has an acceptable type:
  // a column with the default name may be, for instance, numerical.
  const defaultFactor = df.col(DEFAULT.FACTOR);
  let factor: DG.Column | null =
    ((defaultFactor !== null) && FACTOR_TYPES.includes(defaultFactor.type)) ? defaultFactor : null;

  if (factor === null) {
    // Otherwise, take the acceptable column with the least number of categories
    let minIdx = 0;
    let minCount = columns.byName(factorColNames[0]).categories.length;
    let current: number;

    for (let i = 1; i < factorColsCount; ++i) {
      current = columns.byName(factorColNames[i]).categories.length;
      if (current < minCount) {
        minCount = current;
        minIdx = i;
      }
    }

    factor = columns.byName(factorColNames[minIdx]);
  }

  if (featureColNames.length < 1) {
    grok.shell.warning(ui.markdown(`No acceptable feature columns:

- type: ${FEATURE_TYPES.join(', ')}`,
    ));
    return;
  }

  const factorInput = ui.input.column('Category', {
    table: df,
    value: factor,
    tooltipText: 'Column with factor values',
    onValueChanged: (col) => factor = col,
    filter: (col: DG.Column) => factorColNames.includes(col.name),
    nullable: false,
  });

  // The default feature column is used only if it has an acceptable type
  const defaultFeature = df.col(DEFAULT.FEATURE);
  let feature: DG.Column | null =
    ((defaultFeature !== null) && FEATURE_TYPES.includes(defaultFeature.type)) ? defaultFeature : null;

  if (feature === null)
    feature = columns.byName(featureColNames[0]);

  const featureInput = ui.input.column('Feature', {
    table: df,
    value: feature,
    tooltipText: 'Column with feature values',
    onValueChanged: (col) => feature = col,
    filter: (col: DG.Column) => featureColNames.includes(col.name),
    nullable: false,
  });

  let significance = SIGNIFICANCE.DEFAULT;
  const signInput = ui.input.float('Alpha', {
    min: SIGNIFICANCE.MIN,
    max: SIGNIFICANCE.MAX,
    value: significance,
    nullable: false,
    tooltipText: 'Significance level',
    onValueChanged: (value) => {
      significance = value;
      // runBtn is declared below; the callback runs only after the dialog is built
      runBtn.disabled = (significance <= SIGNIFICANCE.INFIMUM) || (significance >= SIGNIFICANCE.SUPREMUM);
    },
  });

  const dlg = ui.dialog({title: 'ANOVA', helpUrl: ANOVA_HELP_URL});
  const view = grok.shell.getTableView(df.name);
  view.root.appendChild(dlg.root);
  dlg.addButton('Run', () => {
    dlg.close();

    try {
      const res = oneWayAnova(factor!, feature!, significance);
      addVizualization(df, factor!.name, feature!.name, res);
    } catch (error) {
      if (error instanceof Error) {
        // ANOVA is not applicable: explain the reason and show the data anyway
        grok.shell.warning(getWarning(error.message));

        view.addViewer(DG.VIEWER.BOX_PLOT, {
          categoryColumnNames: [factor!.name],
          valueColumnName: feature!.name,
          showStatistics: false,
          showPValue: false,
        });
      } else
        grok.shell.error('ANOVA fails: the platform issue');
    }
  }, undefined, 'Perform analysis of variances');

  const runBtn = dlg.getButton('Run');

  dlg.add(factorInput)
    .add(featureInput)
    .add(signInput)
    .show();
} // runOneWayAnova
|
package/src/eda-ui.ts
CHANGED
|
@@ -131,12 +131,3 @@ export function addPLSvisualization(
|
|
|
131
131
|
// 4. Scores Scatter Plot
|
|
132
132
|
view.addViewer(scoresScatterPlot(samplesNames, plsOutput[2], plsOutput[3]));
|
|
133
133
|
}
|
|
134
|
-
|
|
135
|
-
// Add one-way ANOVA results
|
|
136
|
-
export function addOneWayAnovaVizualization(
|
|
137
|
-
table: DG.DataFrame, factors: DG.Column, values: DG.Column, anova: DG.DataFrame,
|
|
138
|
-
) {
|
|
139
|
-
const view = grok.shell.getTableView(table.name);
|
|
140
|
-
view.addViewer(DG.Viewer.boxPlot(DG.DataFrame.fromColumns([factors, values])));
|
|
141
|
-
view.addViewer(DG.Viewer.grid(anova));
|
|
142
|
-
}
|
package/src/global.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import * as grokNamespace from 'datagrok-api/grok';
|
|
2
|
+
import * as uiNamespace from 'datagrok-api/ui';
|
|
3
|
+
import * as DGNamespace from 'datagrok-api/dg';
|
|
4
|
+
import * as rxjsNamespace from 'rxjs';
|
|
5
|
+
import $Namespace from 'cash-dom';
|
|
6
|
+
|
|
7
|
+
/** Typings of the global objects declared for the package code. */
declare global {
  const grok: typeof grokNamespace;
  const ui: typeof uiNamespace;
  const DG: typeof DGNamespace;
  // NOTE(review): the name was misspelled as "rjxs"; the global is presumably "rxjs" - confirm against the webpack externals
  const rxjs: typeof rxjsNamespace;
  const $: typeof $Namespace;
}
|
|
@@ -2,7 +2,7 @@ import * as grok from 'datagrok-api/grok';
|
|
|
2
2
|
import * as ui from 'datagrok-api/ui';
|
|
3
3
|
import * as DG from 'datagrok-api/dg';
|
|
4
4
|
|
|
5
|
-
import {TITLE, KNN_IMPUTER, ERROR_MSG, HINT} from './ui-constants';
|
|
5
|
+
import {TITLE, KNN_IMPUTER, ERROR_MSG, HINT, MAX_INPUT_NAME_LENGTH} from './ui-constants';
|
|
6
6
|
import {SUPPORTED_COLUMN_TYPES, METRIC_TYPE, DISTANCE_TYPE, MetricInfo, DEFAULT, MIN_NEIGHBORS,
|
|
7
7
|
impute, getMissingValsIndices, areThereFails, imputeFailed} from './knn-imputer';
|
|
8
8
|
|
|
@@ -190,7 +190,7 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
190
190
|
|
|
191
191
|
// Metrics components
|
|
192
192
|
const featuresMetrics = new Map<string, MetricInfo>();
|
|
193
|
-
const metricInfoInputs = new Map<string,
|
|
193
|
+
const metricInfoInputs = new Map<string, HTMLElement>();
|
|
194
194
|
const metricsDiv = ui.divV([]);
|
|
195
195
|
metricsDiv.style.overflow = 'auto';
|
|
196
196
|
|
|
@@ -214,7 +214,7 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
214
214
|
|
|
215
215
|
// The following should provide a slider (see th bug https://reddata.atlassian.net/browse/GROK-14431)
|
|
216
216
|
const prop = DG.Property.fromOptions({
|
|
217
|
-
'name': name,
|
|
217
|
+
'name': name.length < MAX_INPUT_NAME_LENGTH ? name : name.slice(0, MAX_INPUT_NAME_LENGTH).concat('...'),
|
|
218
218
|
'inputType': 'Float',
|
|
219
219
|
'min': 0,
|
|
220
220
|
'max': 10,
|
|
@@ -229,11 +229,11 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
229
229
|
distInfo.weight = value ?? settings.defaultWeight;
|
|
230
230
|
featuresMetrics.set(name, distInfo);
|
|
231
231
|
});
|
|
232
|
-
|
|
232
|
+
ui.tooltip.bind(weightInput.captionLabel, name);
|
|
233
|
+
ui.tooltip.bind(weightInput.input, HINT.WEIGHT);
|
|
233
234
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
metricsDiv.append(div);
|
|
235
|
+
metricInfoInputs.set(name, weightInput.root);
|
|
236
|
+
metricsDiv.append(weightInput.root);
|
|
237
237
|
});
|
|
238
238
|
|
|
239
239
|
// The main dialog
|