@datagrok/eda 1.1.9 → 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintignore +1 -0
- package/.eslintrc.json +45 -0
- package/CHANGELOG.md +21 -13
- package/README.md +2 -0
- package/dist/100.js +2 -2
- package/dist/42.js +2 -0
- package/dist/729.js +1 -1
- package/dist/935.js +3 -0
- package/dist/package-test.js +2 -2
- package/dist/package.js +2 -2
- package/package.json +7 -3
- package/src/data-generators.ts +13 -13
- package/src/eda-tools.ts +42 -42
- package/src/eda-ui.ts +65 -58
- package/src/missing-values-imputation/knn-imputer.ts +468 -0
- package/src/missing-values-imputation/ui-constants.ts +64 -0
- package/src/missing-values-imputation/ui.ts +246 -0
- package/src/package-test.ts +2 -2
- package/src/package.ts +61 -60
- package/src/stat-tools.ts +72 -61
- package/src/svm.ts +144 -151
- package/src/utils.ts +13 -17
- package/src/workers/tsne-worker.ts +6 -6
- package/src/workers/umap-worker.ts +3 -3
- package/dist/943.js +0 -3
- /package/dist/{943.js.LICENSE.txt → 935.js.LICENSE.txt} +0 -0
package/src/stat-tools.ts
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
// Statistic tools
|
|
2
2
|
|
|
3
3
|
/* REFERENCES
|
|
4
|
-
|
|
4
|
+
|
|
5
5
|
[1] One-way analysis of variance, https://en.wikipedia.org/wiki/One-way_analysis_of_variance
|
|
6
6
|
|
|
7
7
|
[2] G.W. Heiman. Basic Statistics for the Behavioral Sciences, 6th ed. Wadsworth Publishing, 2010
|
|
8
|
-
|
|
8
|
+
|
|
9
9
|
[3] F-test of equality of variances, https://en.wikipedia.org/wiki/F-test_of_equality_of_variances
|
|
10
10
|
|
|
11
11
|
[4] S. McKillup. Statistics Explained, Cambridge University Press, 2005
|
|
@@ -40,9 +40,9 @@ type SampleData = {
|
|
|
40
40
|
type OneWayAnova = {
|
|
41
41
|
/** sum of squares between groups, SSbn */
|
|
42
42
|
ssBn: number,
|
|
43
|
-
/** sum of squares within groups, SSnn */
|
|
43
|
+
/** sum of squares within groups, SSnn */
|
|
44
44
|
ssWn: number,
|
|
45
|
-
/** total sum of squares, SStot */
|
|
45
|
+
/** total sum of squares, SStot */
|
|
46
46
|
ssTot: number,
|
|
47
47
|
/** degrees of freedom between groups, DFbn */
|
|
48
48
|
dfBn: number,
|
|
@@ -67,15 +67,21 @@ type CatCol = DG.Column<DG.COLUMN_TYPE.STRING>;
|
|
|
67
67
|
type NumCol = DG.Column<DG.COLUMN_TYPE.FLOAT> | DG.Column<DG.COLUMN_TYPE.INT>;
|
|
68
68
|
|
|
69
69
|
/** Create dataframe with one-way ANOVA results. */
|
|
70
|
-
export function getOneWayAnovaDF(
|
|
70
|
+
export function getOneWayAnovaDF(
|
|
71
|
+
anova: OneWayAnova, alpha: number, fCritical: number, hypothesis: string, testResult: string,
|
|
72
|
+
): DG.DataFrame {
|
|
71
73
|
return DG.DataFrame.fromColumns([
|
|
72
|
-
DG.Column.fromStrings('Source of variance',
|
|
73
|
-
|
|
74
|
-
DG.Column.fromList(DG.COLUMN_TYPE.
|
|
74
|
+
DG.Column.fromStrings('Source of variance',
|
|
75
|
+
['Between groups', 'Within groups', 'Total', '', hypothesis, '', testResult]),
|
|
76
|
+
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'Sum of squares',
|
|
77
|
+
[anova.ssBn, anova.ssWn, anova.ssTot, null, null, null, null]),
|
|
78
|
+
DG.Column.fromList(DG.COLUMN_TYPE.INT, 'Degrees of freedom',
|
|
79
|
+
[anova.dfBn, anova.dfWn, anova.dfTot, null, null, null, null]),
|
|
75
80
|
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'Mean square', [anova.msBn, anova.msWn, null, null, null, null, null]),
|
|
76
81
|
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'F-statistics', [anova.fStat, null, null, null, null, null, null]),
|
|
77
82
|
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'p-value', [anova.pValue, null, null, null, null, null, null]),
|
|
78
|
-
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT,
|
|
83
|
+
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT,
|
|
84
|
+
`${alpha}-critical value`, [fCritical, null, null, null, null, null, null]),
|
|
79
85
|
]);
|
|
80
86
|
} // getOneWayAnovaDF
|
|
81
87
|
|
|
@@ -95,7 +101,7 @@ export function getVariance(data: SampleData): number {
|
|
|
95
101
|
|
|
96
102
|
if (size === 1)
|
|
97
103
|
return 0;
|
|
98
|
-
|
|
104
|
+
|
|
99
105
|
return (data.sumOfSquares - (data.sum) ** 2 / size) / (size - 1);
|
|
100
106
|
} // getVariance
|
|
101
107
|
|
|
@@ -103,7 +109,7 @@ export function getVariance(data: SampleData): number {
|
|
|
103
109
|
function areVarsEqual(xData: SampleData, yData: SampleData, alpha: number = 0.05): boolean {
|
|
104
110
|
// The applied approach can be found in [3]
|
|
105
111
|
checkSignificanceLevel(alpha);
|
|
106
|
-
|
|
112
|
+
|
|
107
113
|
const xVar = getVariance(xData);
|
|
108
114
|
const yVar = getVariance(yData);
|
|
109
115
|
|
|
@@ -118,7 +124,7 @@ function areVarsEqual(xData: SampleData, yData: SampleData, alpha: number = 0.05
|
|
|
118
124
|
|
|
119
125
|
export class FactorizedData {
|
|
120
126
|
private isNormDistrib: boolean | undefined = undefined;
|
|
121
|
-
private categories: string[] = [];
|
|
127
|
+
private categories: string[] = [];
|
|
122
128
|
private sums!: Float64Array;
|
|
123
129
|
private sumsOfSquares!: Float64Array;
|
|
124
130
|
private subSampleSizes!: Int32Array;
|
|
@@ -130,7 +136,7 @@ export class FactorizedData {
|
|
|
130
136
|
throw new Error();
|
|
131
137
|
|
|
132
138
|
if (categories.length !== values.length)
|
|
133
|
-
throw new Error(ERROR_MSG.NON_EQUAL_FACTORS_VALUES_SIZE);
|
|
139
|
+
throw new Error(ERROR_MSG.NON_EQUAL_FACTORS_VALUES_SIZE);
|
|
134
140
|
|
|
135
141
|
this.setStats(categories, values, checkNormality, alpha);
|
|
136
142
|
}
|
|
@@ -148,11 +154,13 @@ export class FactorizedData {
|
|
|
148
154
|
|
|
149
155
|
const first: SampleData = {sum: this.sums[0], sumOfSquares: this.sumsOfSquares[0], size: this.subSampleSizes[0]};
|
|
150
156
|
|
|
151
|
-
for (let i = 1; i < K; ++i)
|
|
152
|
-
if(!areVarsEqual(first, {sum: this.sums[i], sumOfSquares: this.sumsOfSquares[i],
|
|
157
|
+
for (let i = 1; i < K; ++i) {
|
|
158
|
+
if (!areVarsEqual(first, {sum: this.sums[i], sumOfSquares: this.sumsOfSquares[i],
|
|
159
|
+
size: this.subSampleSizes[i]}, alpha))
|
|
153
160
|
return false;
|
|
161
|
+
}
|
|
154
162
|
|
|
155
|
-
return true;
|
|
163
|
+
return true;
|
|
156
164
|
} // areVarsEqual
|
|
157
165
|
|
|
158
166
|
/** Perform one-way ANOVA computations. */
|
|
@@ -163,18 +171,18 @@ export class FactorizedData {
|
|
|
163
171
|
|
|
164
172
|
if (K === 1)
|
|
165
173
|
throw new Error(ERROR_MSG.ANOVA_FAILED_JUST_ONE_CAT);
|
|
166
|
-
|
|
174
|
+
|
|
167
175
|
let sum = 0;
|
|
168
176
|
let sumOfSquares = 0;
|
|
169
|
-
|
|
177
|
+
const N = this.size;
|
|
170
178
|
let buf = 0;
|
|
171
179
|
|
|
172
180
|
for (let i = 0; i < K; ++i) {
|
|
173
181
|
sum += this.sums[i];
|
|
174
182
|
sumOfSquares += this.sumsOfSquares[i];
|
|
175
|
-
buf += this.sums[i] ** 2 / this.subSampleSizes[i];
|
|
183
|
+
buf += this.sums[i] ** 2 / this.subSampleSizes[i];
|
|
176
184
|
}
|
|
177
|
-
|
|
185
|
+
|
|
178
186
|
const ssTot = sumOfSquares - sum ** 2 / N;
|
|
179
187
|
const ssBn = buf - sum ** 2 / N;
|
|
180
188
|
const ssWn = ssTot - ssBn;
|
|
@@ -182,12 +190,12 @@ export class FactorizedData {
|
|
|
182
190
|
const dfBn = K - 1;
|
|
183
191
|
const dfWn = N - K;
|
|
184
192
|
const dfTot = N - 1;
|
|
185
|
-
|
|
193
|
+
|
|
186
194
|
const msBn = ssBn / dfBn;
|
|
187
195
|
const msWn = ssWn / dfWn;
|
|
188
196
|
|
|
189
197
|
const fStat = msBn / msWn;
|
|
190
|
-
|
|
198
|
+
|
|
191
199
|
return {
|
|
192
200
|
ssBn: ssBn,
|
|
193
201
|
ssWn: ssWn,
|
|
@@ -197,61 +205,63 @@ export class FactorizedData {
|
|
|
197
205
|
dfTot: dfTot,
|
|
198
206
|
msBn: msBn,
|
|
199
207
|
msWn: msWn,
|
|
200
|
-
|
|
201
|
-
pValue: 1 - jStat.centralF.cdf(fStat, dfBn, dfWn)
|
|
208
|
+
fStat: fStat,
|
|
209
|
+
pValue: 1 - jStat.centralF.cdf(fStat, dfBn, dfWn),
|
|
202
210
|
};
|
|
203
211
|
} // getOneWayAnova
|
|
204
212
|
|
|
205
213
|
/** Compute sum & sums of squares with respect to factor levels. */
|
|
206
|
-
private setStats(categories: CatCol, values: NumCol,
|
|
207
|
-
// TODO: provide check normality feature
|
|
214
|
+
private setStats(categories: CatCol, values: NumCol, _checkNormality: boolean = false, _alpha: number = 0.05): void {
|
|
215
|
+
// TODO: provide check normality feature
|
|
208
216
|
const type = values.type;
|
|
209
217
|
const size = values.length;
|
|
210
218
|
|
|
211
219
|
switch (type) {
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
}
|
|
220
|
+
case DG.COLUMN_TYPE.INT:
|
|
221
|
+
case DG.COLUMN_TYPE.FLOAT:
|
|
222
|
+
this.categories = categories.categories;
|
|
223
|
+
const catCount = this.categories.length;
|
|
224
|
+
this.catCount = catCount;
|
|
225
|
+
this.size = size;
|
|
226
|
+
|
|
227
|
+
const vals = values.getRawData();
|
|
228
|
+
const cats = categories.getRawData();
|
|
229
|
+
|
|
230
|
+
const sums = new Float64Array(catCount).fill(0);
|
|
231
|
+
const sumsOfSquares = new Float64Array(catCount).fill(0);
|
|
232
|
+
const subSampleSizes = new Int32Array(catCount).fill(0);
|
|
233
|
+
|
|
234
|
+
for (let i = 0; i < size; ++i) {
|
|
235
|
+
const c = cats[i];
|
|
236
|
+
sums[c] += vals[i];
|
|
237
|
+
sumsOfSquares[c] += vals[i] ** 2;
|
|
238
|
+
++subSampleSizes[c];
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
this.sums = sums;
|
|
242
|
+
this.sumsOfSquares = sumsOfSquares;
|
|
243
|
+
this.subSampleSizes = subSampleSizes;
|
|
244
|
+
|
|
245
|
+
break;
|
|
246
|
+
|
|
247
|
+
default:
|
|
248
|
+
throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
|
|
249
|
+
}
|
|
242
250
|
} // setStats
|
|
243
251
|
} // FactorizedData
|
|
244
252
|
|
|
245
253
|
/** Perform one-way analysis of variances. */
|
|
246
|
-
export function oneWayAnova(
|
|
247
|
-
|
|
254
|
+
export function oneWayAnova(
|
|
255
|
+
categores: CatCol, values: NumCol, alpha: number = 0.05, validate: boolean = false,
|
|
256
|
+
): DG.DataFrame {
|
|
257
|
+
checkSignificanceLevel(alpha);
|
|
248
258
|
|
|
249
259
|
const factorized = new FactorizedData(categores, values, validate, alpha);
|
|
250
260
|
|
|
251
261
|
if (validate) {
|
|
252
|
-
if(!factorized.areVarsEqual(alpha))
|
|
262
|
+
if (!factorized.areVarsEqual(alpha))
|
|
253
263
|
throw new Error(ERROR_MSG.NON_EQUAL_VARIANCES);
|
|
254
|
-
|
|
264
|
+
|
|
255
265
|
if (!factorized.isNormal())
|
|
256
266
|
throw new Error(ERROR_MSG.NON_NORMAL_DISTRIB);
|
|
257
267
|
}
|
|
@@ -259,7 +269,8 @@ export function oneWayAnova(categores: CatCol, values: NumCol, alpha: number = 0
|
|
|
259
269
|
const anova = factorized.getOneWayAnova();
|
|
260
270
|
const fCrit = jStat.centralF.inv(1 - alpha, anova.dfBn, anova.dfWn);
|
|
261
271
|
|
|
262
|
-
const hypothesis = `THE NULL HYPOTHESIS: the "${categores.name}"
|
|
272
|
+
const hypothesis = `THE NULL HYPOTHESIS: the "${categores.name}"
|
|
273
|
+
factor does not produce a significant difference in the "${values.name}" feature.`;
|
|
263
274
|
const testResult = `Test result: ${(anova.fStat > fCrit) ? 'REJECTED.' : 'FAILED TO REJECT.'}`;
|
|
264
275
|
|
|
265
276
|
return getOneWayAnovaDF(anova, alpha, fCrit, hypothesis, testResult);
|