@datagrok/eda 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +1 -1
- package/src/anova/anova-tools.ts +312 -0
- package/src/anova/anova-ui.ts +258 -0
- package/src/eda-ui.ts +0 -9
- package/src/package-test.ts +7 -1
- package/src/package.ts +5 -11
- package/src/tests/anova-tests.ts +87 -0
package/package.json
CHANGED
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
// Analysis of Variances (ANOVA): computations
|
|
2
|
+
|
|
3
|
+
/* REFERENCES
|
|
4
|
+
|
|
5
|
+
[1] One-way analysis of variance, https://en.wikipedia.org/wiki/One-way_analysis_of_variance
|
|
6
|
+
|
|
7
|
+
[2] G.W. Heiman. Basic Statistics for the Behavioral Sciences, 6th ed. Wadsworth Publishing, 2010
|
|
8
|
+
|
|
9
|
+
[3] F-test of equality of variances, https://en.wikipedia.org/wiki/F-test_of_equality_of_variances
|
|
10
|
+
|
|
11
|
+
[4] S. McKillup. Statistics Explained, Cambridge University Press, 2005
|
|
12
|
+
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import * as grok from 'datagrok-api/grok';
|
|
16
|
+
import * as ui from 'datagrok-api/ui';
|
|
17
|
+
import * as DG from 'datagrok-api/dg';
|
|
18
|
+
|
|
19
|
+
//@ts-ignore: no types
|
|
20
|
+
import * as jStat from 'jstat';
|
|
21
|
+
|
|
22
|
+
import {getNullValue} from '../missing-values-imputation/knn-imputer';
|
|
23
|
+
|
|
24
|
+
/** Messages of the errors raised by the ANOVA computations */
enum ERROR_MSG {
  /** categories & values columns have different lengths */
  NON_EQUAL_FACTORS_VALUES_SIZE = 'non-equal sizes of factor and values arrays',

  /** significance level is outside the open interval (0, 1) */
  INCORRECT_SIGNIFICANCE_LEVEL = 'incorrect significance level',

  /** the check of variances equality has failed */
  NON_EQUAL_VARIANCES = 'non-equal variances',

  NON_NORMAL_DISTRIB = 'non-normal distribution',

  /** the feature column is neither of the INT nor of the FLOAT type */
  UNSUPPORTED_COLUMN_TYPE = 'unsupported feature column type',

  INCORRECT_CATEGORIES_COL_TYPE = 'incorrect categories column type',

  /** there are less than two categories */
  SINGLE_FACTOR = 'single category features',

  /** the number of observations is equal to the number of categories */
  CATS_EQUAL_SIZE = 'single value in each category',

  /** standard deviation of the feature column is not positive */
  NO_FEATURE_VARIATION = 'no feature variation',

  /** sum of squares within groups is zero */
  NO_FEATURE_VARIATION_WITHIN_GROUPS = 'no feature variation within groups',
}
|
|
36
|
+
|
|
37
|
+
/** Aggregates of a sample that are sufficient for computing its variance */
type SampleData = {
  /** sum of the sample elements */
  sum: number;
  /** sum of squares of the sample elements */
  sumOfSquares: number;
  /** number of the sample elements */
  size: number;
};
|
|
42
|
+
|
|
43
|
+
/** Results of one-way ANOVA computations; the classic notations from [2] (p. 290) are applied. */
type OneWayAnova = {
  /** SSbn: sum of squares between groups */
  ssBn: number;
  /** SSwn: sum of squares within groups */
  ssWn: number;
  /** SStot: total sum of squares */
  ssTot: number;
  /** DFbn: degrees of freedom between groups */
  dfBn: number;
  /** DFwn: degrees of freedom within groups */
  dfWn: number;
  /** DFtot: total degrees of freedom */
  dfTot: number;
  /** MSbn: mean square between groups */
  msBn: number;
  /** MSwn: mean square within groups */
  msWn: number;
  /** Fobt: the obtained value of F-statistics */
  fStat: number;
  /** p-value corresponding to the obtained F-statistics */
  pValue: number;
};
|
|
66
|
+
|
|
67
|
+
/** Report of one-way ANOVA */
export type OneWayAnovaReport = {
  /** table with the ANOVA computation results */
  anovaTable: OneWayAnova;
  /** critical value of F-statistics */
  fCritical: number;
  /** significance level (alpha) */
  significance: number;
};
|
|
73
|
+
|
|
74
|
+
/** Column of categories (factor levels): string or boolean values */
type CatCol = DG.Column<DG.COLUMN_TYPE.STRING | DG.COLUMN_TYPE.BOOL>;

/** Column of numerical values: floats or integers */
type NumCol = DG.Column<DG.COLUMN_TYPE.FLOAT> | DG.Column<DG.COLUMN_TYPE.INT>;
|
|
79
|
+
|
|
80
|
+
/** Check correctness of significance level.
 * @param alpha significance level, it must belong to the open interval (0, 1)
 * @throws Error if alpha is outside the interval (0, 1) or is NaN */
export function checkSignificanceLevel(alpha: number): void {
  // The negated form also rejects NaN: any comparison with NaN is false
  if (!((alpha > 0) && (alpha < 1)))
    throw new Error(ERROR_MSG.INCORRECT_SIGNIFICANCE_LEVEL);
}
|
|
85
|
+
|
|
86
|
+
/** Compute unbiased variance.
 * @param data sum, sum of squares & size of the sample
 * @returns unbiased variance of the sample; 0 if the sample has less than two elements */
export function getVariance(data: SampleData): number {
  // The applied formulas can be found in [4] (see p. 63)
  const size = data.size;

  if (size <= 1)
    return 0;

  const variance = (data.sumOfSquares - (data.sum) ** 2 / size) / (size - 1);

  // Rounding errors of the subtraction can produce a tiny negative value, but variance is never negative
  return Math.max(variance, 0);
} // getVariance
|
|
96
|
+
|
|
97
|
+
/** Check equality of variances of 2 samples. Two-tailed F-test is performed.
 * @param xData aggregates of the first sample
 * @param yData aggregates of the second sample
 * @param alpha significance level
 * @returns true if the hypothesis of equal variances is not rejected
 * @throws Error if the significance level is incorrect */
function areVarsEqual(xData: SampleData, yData: SampleData, alpha: number): boolean {
  // The applied approach can be found in [3]
  checkSignificanceLevel(alpha);

  const xVar = getVariance(xData);
  const yVar = getVariance(yData);

  // F-statistics is undefined for zero variances: such samples are equal only if both variances are zero
  if ((xVar === 0) || (yVar === 0))
    return (xVar === yVar);

  // The larger variance is put to the numerator. Otherwise, F < 1 whenever the first sample
  // has the smaller variance, and the test passes regardless of how different the variances are.
  const xIsLarger = (xVar >= yVar);
  const fStat = xIsLarger ? xVar / yVar : yVar / xVar;
  const dfNumerator = (xIsLarger ? xData.size : yData.size) - 1;
  const dfDenominator = (xIsLarger ? yData.size : xData.size) - 1;

  // Two-tailed test: alpha is split between the tails
  const fCrit = jStat.centralF.inv(1 - alpha / 2, dfNumerator, dfDenominator);

  return (fStat < fCrit);
} // areVarsEqual
|
|
113
|
+
|
|
114
|
+
/** Feature values aggregated with respect to categories (factor levels):
 * sums, sums of squares & sizes of the corresponding sub-samples. */
export class FactorizedData {
  /** sums of the feature values for each category */
  private sums!: Float64Array;
  /** sums of squares of the feature values for each category */
  private sumsOfSquares!: Float64Array;
  /** numbers of the processed feature values for each category */
  private subSampleSizes!: Int32Array;
  /** length of the feature column */
  private size!: number;
  /** number of categories */
  private catCount!: number;
  /** number of the skipped rows (the ones with missing values) */
  private nullsCount = 0;

  /** Aggregate feature values with respect to categories.
   * @param categories column with categories
   * @param values column with feature values
   * @param uniqueCount number of categories
   * @throws Error if the columns have different lengths, if standard deviation of the values
   * is not positive, or if the type of the values column is not supported */
  constructor(categories: CatCol, values: NumCol, uniqueCount: number) {
    if (categories.length !== values.length)
      throw new Error(ERROR_MSG.NON_EQUAL_FACTORS_VALUES_SIZE);

    if (values.stats.stdev > 0)
      this.setStats(categories, values, uniqueCount);
    else
      throw new Error(ERROR_MSG.NO_FEATURE_VARIATION);
  }

  /** Check equality of variances of factorized data.
   * The first category with observations is compared with each of the other ones.
   * @param alpha significance level
   * @returns true if no pair of the compared sub-samples fails the check */
  public areVarsEqual(alpha: number): boolean {
    const K = this.catCount;

    if (K === 1)
      return true;

    let reference: SampleData | null = null;

    for (let i = 0; i < K; ++i) {
      // Categories without observations are skipped (as in getOneWayAnova): otherwise,
      // they are treated as zero-variance samples and make the check fail
      if (this.subSampleSizes[i] === 0)
        continue;

      const current: SampleData = {
        sum: this.sums[i],
        sumOfSquares: this.sumsOfSquares[i],
        size: this.subSampleSizes[i],
      };

      if (reference === null)
        reference = current;
      else if (!areVarsEqual(reference, current, alpha))
        return false;
    }

    return true;
  } // areVarsEqual

  /** Perform one-way ANOVA computations.
   * @returns sums of squares, degrees of freedom, mean squares, F-statistics & p-value
   * @throws Error if there are less than two categories with observations, if the number of observations
   * is equal to the number of such categories, or if the sum of squares within groups is zero */
  public getOneWayAnova(): OneWayAnova {
    // Further, notations and formulas from (see [2], p. 290) are used.

    let sum = 0;
    let sumOfSquares = 0;
    let buf = 0; // sum of (sum of the group)^2 / (size of the group)
    let K = 0; // number of categories with observations

    for (let i = 0; i < this.catCount; ++i) {
      if (this.subSampleSizes[i] !== 0) {
        sum += this.sums[i];
        sumOfSquares += this.sumsOfSquares[i];
        buf += this.sums[i] ** 2 / this.subSampleSizes[i];
        ++K;
      }
    }

    // At least two groups are required (the case of no groups with observations is covered too)
    if (K < 2)
      throw new Error(ERROR_MSG.SINGLE_FACTOR);

    const N = this.size - this.nullsCount;
    if (N === K)
      throw new Error(ERROR_MSG.CATS_EQUAL_SIZE);

    const ssTot = sumOfSquares - sum ** 2 / N;
    const ssBn = buf - sum ** 2 / N;
    const ssWn = ssTot - ssBn;

    // F-statistics is undefined in this case: mean square within groups is zero
    if (ssWn === 0)
      throw new Error(ERROR_MSG.NO_FEATURE_VARIATION_WITHIN_GROUPS);

    const dfBn = K - 1;
    const dfWn = N - K;
    const dfTot = N - 1;

    const msBn = ssBn / dfBn;
    const msWn = ssWn / dfWn;

    const fStat = msBn / msWn;

    return {
      ssBn: ssBn,
      ssWn: ssWn,
      ssTot: ssTot,
      dfBn: dfBn,
      dfWn: dfWn,
      dfTot: dfTot,
      msBn: msBn,
      msWn: msWn,
      fStat: fStat,
      pValue: 1 - jStat.centralF.cdf(fStat, dfBn, dfWn),
    };
  } // getOneWayAnova

  /** Compute sum & sums of squares with respect to factor levels.
   * @param categories column with categories
   * @param features column with feature values
   * @param uniqueCount number of categories
   * @throws Error if the features column is neither of the INT nor of the FLOAT type */
  private setStats(categories: CatCol, features: NumCol, uniqueCount: number): void {
    const type = features.type;
    const size = features.length;
    const featuresNull = getNullValue(features);

    switch (type) {
    case DG.COLUMN_TYPE.INT:
    case DG.COLUMN_TYPE.FLOAT: {
      const catCount = uniqueCount;
      this.catCount = catCount;
      this.size = size;

      const vals = features.getRawData();
      const cats = categories.getRawData();

      const sums = new Float64Array(catCount).fill(0);
      const sumsOfSquares = new Float64Array(catCount).fill(0);
      const subSampleSizes = new Int32Array(catCount).fill(0);

      let cat: number;

      if (categories.type === DG.COLUMN_TYPE.BOOL) {
        // Raw data is processed as packed bits: each element of cats holds
        // 8 * BYTES_PER_ELEMENT values, the i-th row is stored in the bit (i mod that number)
        let catIdx = 0;
        let shift = 0;
        let packed = cats[0];
        const MAX_SHIFT = 8 * cats.BYTES_PER_ELEMENT - 1;

        for (let i = 0; i < size; ++i) {
          cat = 1 & (packed >> shift);

          if (vals[i] !== featuresNull) {
            sums[cat] += vals[i];
            sumsOfSquares[cat] += vals[i] ** 2;
            ++subSampleSizes[cat];
          } else
            ++this.nullsCount;

          ++shift;

          // all bits of the current element are processed: go to the next one
          if (shift > MAX_SHIFT) {
            shift = 0;
            ++catIdx;
            packed = cats[catIdx];
          }
        }
      } else {
        // Raw values are used as indices of categories;
        // -1 is applied as null if the column has no missing values
        const categoriesNull = categories.stats.missingValueCount > 0 ? getNullValue(categories) : -1;

        for (let i = 0; i < size; ++i) {
          cat = cats[i];

          // rows with a missing category or a missing feature value are skipped
          if ((cat === categoriesNull) || (vals[i] === featuresNull)) {
            ++this.nullsCount;
            continue;
          }

          sums[cat] += vals[i];
          sumsOfSquares[cat] += vals[i] ** 2;
          ++subSampleSizes[cat];
        }
      }

      this.sums = sums;
      this.sumsOfSquares = sumsOfSquares;
      this.subSampleSizes = subSampleSizes;

      break;
    }

    default:
      throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
    }
  } // setStats
} // FactorizedData
|
|
287
|
+
|
|
288
|
+
/** Run one-way analysis of variances.
 * @param categores column with categories (factor levels)
 * @param values column with feature values
 * @param alpha significance level
 * @param toValidate if true, equality of variances is checked before the computations
 * @returns ANOVA table, critical value of F-statistics & the applied significance level
 * @throws Error if the significance level is incorrect, if there are less than two categories,
 * or if validation is requested and the check of variances equality fails */
export function oneWayAnova(categores: CatCol, values: NumCol, alpha: number,
  toValidate: boolean = true): OneWayAnovaReport {
  checkSignificanceLevel(alpha);

  const levelsCount = categores.stats.uniqueCount;
  if (levelsCount < 2)
    throw new Error(ERROR_MSG.SINGLE_FACTOR);

  const data = new FactorizedData(categores, values, levelsCount);

  if (toValidate && !data.areVarsEqual(alpha))
    throw new Error(ERROR_MSG.NON_EQUAL_VARIANCES);

  const anovaTable = data.getOneWayAnova();
  const fCritical = jStat.centralF.inv(1 - alpha, anovaTable.dfBn, anovaTable.dfWn);

  return {
    anovaTable: anovaTable,
    fCritical: fCritical,
    significance: alpha,
  };
} // oneWayAnova
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
// Analysis of Variances (ANOVA): UI
|
|
2
|
+
|
|
3
|
+
import * as grok from 'datagrok-api/grok';
|
|
4
|
+
import * as ui from 'datagrok-api/ui';
|
|
5
|
+
import * as DG from 'datagrok-api/dg';
|
|
6
|
+
|
|
7
|
+
import {oneWayAnova, OneWayAnovaReport} from './anova-tools';
|
|
8
|
+
|
|
9
|
+
/** Types of columns that are accepted as features */
const FEATURE_TYPES = [DG.COLUMN_TYPE.INT, DG.COLUMN_TYPE.FLOAT] as string[];

/** Types of columns that are accepted as factors */
const FACTOR_TYPES = [DG.COLUMN_TYPE.STRING, DG.COLUMN_TYPE.BOOL] as string[];

/** Help link of the ANOVA dialog & results grid */
const ANOVA_HELP_URL = '/help/explore/anova';
|
|
13
|
+
|
|
14
|
+
/** Constants of the significance level input */
enum SIGNIFICANCE {
  /** initial value of the input */
  DEFAULT = 0.05,
  /** min value of the input */
  MIN = 0.01,
  /** max value of the input */
  MAX = 0.99,
  /** values less than or equal to this bound disable the run */
  INFIMUM = 0,
  /** values greater than or equal to this bound disable the run */
  SUPREMUM = 1,
}
|
|
22
|
+
|
|
23
|
+
/** Names of the columns that are tried first as the initial inputs */
enum DEFAULT {
  /** name of the factor column */
  FACTOR = 'race',
  /** name of the feature column */
  FEATURE = 'age',
}
|
|
28
|
+
|
|
29
|
+
/** Show one-way ANOVA results: a box plot docked to the table view,
 * the F-test conclusion below it & the grid with the ANOVA table.
 * @param df analyzed dataframe
 * @param factorsName name of the column with factors
 * @param featuresName name of the column with features
 * @param report results of one-way ANOVA */
function addVizualization(df: DG.DataFrame, factorsName: string, featuresName: string, report: OneWayAnovaReport) {
  // the null hypothesis is rejected if F-statistics exceeds the critical value
  const isRejected = report.anovaTable.fStat > report.fCritical;

  let shortConclusion: string;
  if (isRejected)
    shortConclusion = `"${factorsName}" affects the "${featuresName}"`;
  else
    shortConclusion = `"${factorsName}" doesn't affect the "${featuresName}"`;

  // Box plot
  const tableView = grok.shell.getTableView(df.name);
  const boxPlotOptions = {
    categoryColumnNames: [factorsName],
    valueColumnName: featuresName,
    showPValue: false,
    showStatistics: false,
    description: shortConclusion,
    showColorSelector: false,
  };
  const boxPlot = DG.Viewer.boxPlot(df, boxPlotOptions);
  const boxPlotNode = tableView.dockManager.dock(boxPlot.root, DG.DOCK_TYPE.RIGHT, null, 'ANOVA');

  // Null hypothesis
  const hypoMd = ui.markdown(`**H0:** the "${factorsName}"
factor does not produce a significant difference in the "${featuresName}" feature.`);
  ui.tooltip.bind(hypoMd, 'Null hypothesis');

  // Test result
  const verdict = isRejected ? 'means differ significantly.' : 'means do not differ significantly.';
  const testMd = ui.markdown(`**Test result:** ${verdict}`);

  // Explanation of the test result
  let tooltipDiv: HTMLElement;
  if (isRejected) {
    tooltipDiv = ui.divV([
      ui.p(`Reject the null hypothesis, since F > F-critical:
${report.anovaTable.fStat.toFixed(2)} > ${report.fCritical.toFixed(2)}.`),
      ui.h2('There is a significant difference among sample averages.'),
    ]);
  } else {
    tooltipDiv = ui.divV([
      ui.p(`Fail to reject the null hypothesis, since F < F-critical:
${report.anovaTable.fStat.toFixed(2)} < ${report.fCritical.toFixed(2)}.`),
      ui.h2('There is no significant difference among sample averages.'),
    ]);
  }

  ui.tooltip.bind(testMd, () => tooltipDiv);

  // F-test panel: it is docked below the box plot
  const learnMore = ui.link('Learn more',
    () => window.open('https://en.wikipedia.org/wiki/F-test', '_blank'),
    'Click to open in a new tab',
  );
  const divResult = ui.divV([hypoMd, testMd, learnMore]);
  divResult.style.marginLeft = '20px';

  const hypoNode = grok.shell.dockManager.dock(divResult, DG.DOCK_TYPE.DOWN, boxPlotNode, 'F-test', 0.3);

  // ANOVA table: it fills the node of the F-test panel
  const reportViewer = getAnovaGrid(report);
  grok.shell.dockManager.dock(reportViewer.root, DG.DOCK_TYPE.FILL, hypoNode, 'Analysis');
}
|
|
86
|
+
|
|
87
|
+
/** Create grid with one-way ANOVA results.
 * @param report results of one-way ANOVA
 * @returns grid with the ANOVA table: headers of its columns have explanatory tooltips */
function getAnovaGrid(report: OneWayAnovaReport): DG.Grid {
  const anova = report.anovaTable;

  // Rows: between groups, within groups & total; nulls mark the values that are not defined for a row
  const grid = DG.Viewer.grid(DG.DataFrame.fromColumns([
    DG.Column.fromStrings('Source of variance', ['Between groups', 'Within groups', 'Total']),
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'SS', [anova.ssBn, anova.ssWn, anova.ssTot]),
    DG.Column.fromList(DG.COLUMN_TYPE.INT, 'DF', [anova.dfBn, anova.dfWn, anova.dfTot]),
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'MS', [anova.msBn, anova.msWn, null]),
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'F', [anova.fStat, null, null]),
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'F-critical', [report.fCritical, null, null]),
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'p-value', [anova.pValue, null, null]),
  ]));

  // Tooltips of the column headers: column name -> text
  const tooltip = new Map([
    ['Source of variance', 'List of the explored variation sources'],
    ['SS', 'Sum of squares (SS)'],
    ['DF', 'Degrees of freedom (DF)'],
    ['MS', 'Mean square (MS)'],
    ['F', 'F-statistics (F)'],
    ['F-critical', `${report.significance}-critical value of F-statistics (F)`],
    ['p-value', `Probability to obtain F-statistics (F) greater than the actual observation.`],
  ]);

  grid.onCellTooltip(function(cell, x, y) {
    if (cell.isColHeader) {
      // NOTE(review): presumably, a header cell may have no table column (e.g., the rows header);
      // such cells & columns without a text are skipped instead of failing on a missing value
      const column = cell.tableColumn;
      const text = (column != null) ? tooltip.get(column.name) : undefined;

      if (text !== undefined) {
        ui.tooltip.show(ui.divV([ui.p(text)]), x, y);
        return true;
      }
    }
  });

  grid.helpUrl = ANOVA_HELP_URL;

  return grid;
} // getAnovaGrid
|
|
122
|
+
|
|
123
|
+
/** Create warning shown when ANOVA cannot be performed.
 * @param msg the reason why ANOVA fails
 * @returns div with the message & the link to the ANOVA assumptions */
function getWarning(msg: string): HTMLElement {
  const explanation = ui.markdown(`ANOVA cannot be performed:

${msg}`);

  const learnMore = ui.link('Learn more',
    () => window.open('https://en.wikipedia.org/wiki/Analysis_of_variance#Assumptions', '_blank'),
    'Click to open in a new tab',
  );

  return ui.divV([explanation, learnMore]);
}
|
|
135
|
+
|
|
136
|
+
/** Run one-way analysis of variances: show the dialog for specifying a factor, a feature
 * & a significance level for the current dataframe; perform ANOVA & show results on run.
 * A warning is shown if no dataframe is opened, or if it has no acceptable factor or feature columns. */
export function runOneWayAnova(): void {
  /** current dataframe */
  const df: DG.DataFrame | null = grok.shell.t;

  if (df === null) {
    grok.shell.warning('No dataframe is opened');
    return;
  }

  // Collect names of the columns that can be used as factors & features
  const columns = df.columns;
  const factorColNames = [] as string[];
  const featureColNames = [] as string[];

  for (const col of columns) {
    if (FEATURE_TYPES.includes(col.type))
      featureColNames.push(col.name);
    else if (FACTOR_TYPES.includes(col.type))
      factorColNames.push(col.name);
  }

  const factorColsCount = factorColNames.length;
  if (factorColsCount < 1) {
    grok.shell.warning(ui.markdown(`No acceptable factor columns:

- type: ${FACTOR_TYPES.join(', ')}
- at least two categories`,
    ));
    return;
  }

  // The default column is used only if it is an acceptable factor:
  // otherwise, a column rejected by the input filter would be analyzed
  let factor = df.col(DEFAULT.FACTOR);
  if ((factor !== null) && !factorColNames.includes(factor.name))
    factor = null;

  // Otherwise, the factor column with the least number of categories is used
  if (factor === null) {
    let minIdx = 0;
    let minCount = columns.byName(factorColNames[0]).categories.length;
    let current: number;

    for (let i = 1; i < factorColsCount; ++i) {
      current = columns.byName(factorColNames[i]).categories.length;
      if (current < minCount) {
        minCount = current;
        minIdx = i;
      }
    }

    factor = columns.byName(factorColNames[minIdx]);
  }

  if (featureColNames.length < 1) {
    grok.shell.warning(ui.markdown(`No acceptable feature columns:

- type: ${FEATURE_TYPES.join(', ')}`,
    ));
    return;
  }

  const factorInput = ui.input.column('Category', {
    table: df,
    value: factor,
    tooltipText: 'Column with factor values',
    onValueChanged: (col) => factor = col,
    filter: (col: DG.Column) => factorColNames.includes(col.name),
    nullable: false,
  });

  // The default column is used only if it is an acceptable feature; otherwise, the first acceptable one
  let feature = df.col(DEFAULT.FEATURE);
  if ((feature === null) || !featureColNames.includes(feature.name))
    feature = columns.byName(featureColNames[0]);

  const featureInput = ui.input.column('Feature', {
    table: df,
    value: feature,
    tooltipText: 'Column with feature values',
    onValueChanged: (col) => feature = col,
    filter: (col: DG.Column) => featureColNames.includes(col.name),
    nullable: false,
  });

  let significance = SIGNIFICANCE.DEFAULT;
  const signInput = ui.input.float('Alpha', {
    min: SIGNIFICANCE.MIN,
    max: SIGNIFICANCE.MAX,
    value: significance,
    nullable: false,
    tooltipText: 'Significance level',
    onValueChanged: (value) => {
      significance = value;
      // the run is disabled for the values outside the interval (0, 1)
      runBtn.disabled = (significance <= SIGNIFICANCE.INFIMUM) || (significance >= SIGNIFICANCE.SUPREMUM);
    },
  });

  const dlg = ui.dialog({title: 'ANOVA', helpUrl: ANOVA_HELP_URL});
  const view = grok.shell.getTableView(df.name);
  view.root.appendChild(dlg.root);
  dlg.addButton('Run', () => {
    dlg.close();

    try {
      const res = oneWayAnova(factor!, feature!, significance);
      addVizualization(df, factor!.name, feature!.name, res);
    } catch (error) {
      if (error instanceof Error) {
        // ANOVA is not applicable: the reason is shown, and just a box plot is added
        grok.shell.warning(getWarning(error.message));

        view.addViewer(DG.VIEWER.BOX_PLOT, {
          categoryColumnNames: [factor!.name],
          valueColumnName: feature!.name,
          showStatistics: false,
          showPValue: false,
        });
      } else
        grok.shell.error('ANOVA fails: the platform issue');
    }
  }, undefined, 'Perform analysis of variances');

  const runBtn = dlg.getButton('Run');

  dlg.add(factorInput)
    .add(featureInput)
    .add(signInput)
    .show();
} // runOneWayAnova
|
package/src/eda-ui.ts
CHANGED
|
@@ -131,12 +131,3 @@ export function addPLSvisualization(
|
|
|
131
131
|
// 4. Scores Scatter Plot
|
|
132
132
|
view.addViewer(scoresScatterPlot(samplesNames, plsOutput[2], plsOutput[3]));
|
|
133
133
|
}
|
|
134
|
-
|
|
135
|
-
// Add one-way ANOVA results
|
|
136
|
-
export function addOneWayAnovaVizualization(
|
|
137
|
-
table: DG.DataFrame, factors: DG.Column, values: DG.Column, anova: DG.DataFrame,
|
|
138
|
-
) {
|
|
139
|
-
const view = grok.shell.getTableView(table.name);
|
|
140
|
-
view.addViewer(DG.Viewer.boxPlot(DG.DataFrame.fromColumns([factors, values])));
|
|
141
|
-
view.addViewer(DG.Viewer.grid(anova));
|
|
142
|
-
}
|
package/src/package-test.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import * as DG from 'datagrok-api/dg';
|
|
2
|
-
import {runTests, tests, TestContext} from '@datagrok-libraries/utils/src/test';
|
|
2
|
+
import {runTests, tests, TestContext, initAutoTests as initTests} from '@datagrok-libraries/utils/src/test';
|
|
3
3
|
import './tests/dim-reduction-tests';
|
|
4
4
|
import './tests/linear-methods-tests';
|
|
5
5
|
import './tests/classifiers-tests';
|
|
6
6
|
import './tests/mis-vals-imputation-tests';
|
|
7
|
+
import './tests/anova-tests';
|
|
7
8
|
export const _package = new DG.Package();
|
|
8
9
|
export {tests};
|
|
9
10
|
|
|
@@ -16,3 +17,8 @@ export async function test(category: string, test: string, testContext: TestCont
|
|
|
16
17
|
const data = await runTests({category, test, testContext});
|
|
17
18
|
return DG.DataFrame.fromObjects(data)!;
|
|
18
19
|
}
|
|
20
|
+
|
|
21
|
+
//name: initAutoTests
export async function initAutoTests() {
  // The 'package-test.js' module of this package is passed to the tests initializer
  // that is imported from '@datagrok-libraries/utils/src/test' (as initTests)
  const testsModule = _package.getModule('package-test.js');
  await initTests(_package, testsModule);
}
|
package/src/package.ts
CHANGED
|
@@ -7,14 +7,14 @@ import * as DG from 'datagrok-api/dg';
|
|
|
7
7
|
|
|
8
8
|
import {_initEDAAPI} from '../wasm/EDAAPI';
|
|
9
9
|
import {computePCA} from './eda-tools';
|
|
10
|
-
import {addPrefixToEachColumnName
|
|
10
|
+
import {addPrefixToEachColumnName} from './eda-ui';
|
|
11
11
|
import {LINEAR, RBF, POLYNOMIAL, SIGMOID,
|
|
12
12
|
getTrainedModel, getPrediction, isApplicableSVM, isInteractiveSVM, showTrainReport, getPackedModel} from './svm';
|
|
13
13
|
|
|
14
14
|
import {PLS_ANALYSIS} from './pls/pls-constants';
|
|
15
15
|
import {runMVA, runDemoMVA, getPlsAnalysis, PlsOutput} from './pls/pls-tools';
|
|
16
|
+
import {runOneWayAnova} from './anova/anova-ui';
|
|
16
17
|
|
|
17
|
-
import {oneWayAnova} from './stat-tools';
|
|
18
18
|
import {getDbscanWorker} from '@datagrok-libraries/math';
|
|
19
19
|
|
|
20
20
|
import {DistanceAggregationMethod, DistanceAggregationMethods} from '@datagrok-libraries/ml/src/distance-matrix/types';
|
|
@@ -547,15 +547,9 @@ export async function visualizeSigmoidKernelSVM(df: DG.DataFrame, targetColumn:
|
|
|
547
547
|
|
|
548
548
|
//top-menu: ML | Analyze | ANOVA...
|
|
549
549
|
//name: ANOVA
|
|
550
|
-
//description: One-way analysis of variances (ANOVA) determines whether the examined factor has a significant impact on the
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
//input: column feature {type: numerical}
|
|
554
|
-
//input: double significance = 0.05 [The significance level is a value from the interval (0, 1) specifying the criterion used for rejecting the null hypothesis.]
|
|
555
|
-
//input: bool validate = false [Indicates whether the normality of distribution and an eqaulity of varainces should be checked.]
|
|
556
|
-
export function anova(table: DG.DataFrame, factor: DG.Column, feature: DG.Column, significance: number, validate: boolean) {
|
|
557
|
-
const res = oneWayAnova(factor, feature, significance, validate);
|
|
558
|
-
addOneWayAnovaVizualization(table, factor, feature, res);
|
|
550
|
+
//description: One-way analysis of variances (ANOVA) determines whether the examined factor has a significant impact on the explored feature.
|
|
551
|
+
export function anova(): void {
  // The whole job is delegated to runOneWayAnova imported from './anova/anova-ui'
  return runOneWayAnova();
}
|
|
560
554
|
|
|
561
555
|
//top-menu: ML | Missing Values Imputation ...
|