@datagrok/eda 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +1 -1
- package/src/anova/anova-tools.ts +312 -0
- package/src/anova/anova-ui.ts +258 -0
- package/src/eda-ui.ts +0 -9
- package/src/missing-values-imputation/knn-imputer.ts +100 -91
- package/src/missing-values-imputation/ui-constants.ts +2 -2
- package/src/missing-values-imputation/ui.ts +66 -44
- package/src/package-test.ts +8 -1
- package/src/package.ts +5 -11
- package/src/tests/anova-tests.ts +87 -0
- package/src/tests/mis-vals-imputation-tests.ts +58 -0
- package/src/tests/utils.ts +75 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
// Tests for ANOVA
|
|
2
|
+
|
|
3
|
+
import * as grok from 'datagrok-api/grok';
|
|
4
|
+
import * as ui from 'datagrok-api/ui';
|
|
5
|
+
import * as DG from 'datagrok-api/dg';
|
|
6
|
+
import {_package} from '../package-test';
|
|
7
|
+
|
|
8
|
+
import {category, expect, test} from '@datagrok-libraries/utils/src/test';
|
|
9
|
+
|
|
10
|
+
import {oneWayAnova, FactorizedData} from '../anova/anova-tools';
|
|
11
|
+
|
|
12
|
+
/** Size of the performance-test dataframe, in millions of rows */
const ROWS_M = 1;
/** One million: multiplier that turns ROWS_M into a row count */
const M = 1000000;
/** Timeout passed in the test options (presumably milliseconds - confirm against the test framework) */
const TIMEOUT = 4000;
/** Significance level passed to the ANOVA routines */
const ALPHA = 0.05;
/** Name of the demog column used as the categorical factor */
const CATEGORIES = 'race';
/** Name of the demog column used as the numeric feature */
const FEATURES = 'height';
/** Forwarded as the last argument of oneWayAnova (presumably toggles input validation - confirm) */
const TO_VALIDATE = false;
/** Maximum absolute deviation tolerated between computed and expected values */
const ERR = 0.01;

/** Validation features: 15 observations, positionally matched with CATEGORIES_COL */
const FEATURES_COL = DG.Column.fromList(DG.COLUMN_TYPE.INT, 'features', [
  9, 12, 4, 8, 7, 4, 6, 8, 2, 10, 1, 3, 4, 5, 2,
]);

/** Validation categories: three groups ('A', 'B', 'C') of five observations each */
const CATEGORIES_COL = DG.Column.fromStrings('categories', [
  'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'C',
]);
|
|
30
|
+
|
|
31
|
+
/**
 * Expected one-way ANOVA results for the validation data (3 groups of 5 observations).
 *
 * Suffixes: BN - between groups, WN - within groups, TOT - total.
 * Fractional values are rounded to three decimals, so they must be compared with a tolerance.
 *
 * A plain `as const` object is used instead of a numeric enum: an enum would also emit
 * reverse-mapping keys (e.g. "63.333") into the runtime object.
 */
const EXPECTED = {
  // degrees of freedom
  DF_BN: 2,
  DF_TOT: 14,
  DF_WN: 12,
  // sums of squares
  SS_BN: 63.333,
  SS_TOT: 147.333,
  SS_WN: 84,
  // mean squares
  MS_BN: 31.666,
  MS_WN: 7,
  // F-statistics, F-critical & p-value
  F_STAT: 4.523,
  F_CRIT: 3.885,
  P_VAL: 0.034,
} as const;
|
|
45
|
+
|
|
46
|
+
category('ANOVA', () => {
  // Runs variance-equality check and one-way ANOVA on the demog demo table; only execution is checked
  test(`Performance: ${ROWS_M}M rows demog`, async () => {
    const df = grok.data.demo.demog(ROWS_M * M);
    const categories = df.col(CATEGORIES);
    const features = df.col(FEATURES);

    // Fail with an explicit message rather than a null dereference if a column is absent
    if (categories == null || features == null)
      throw new Error(`Demog table has no '${CATEGORIES}' or '${FEATURES}' column`);

    const factorized = new FactorizedData(categories, features, categories.stats.uniqueCount);
    factorized.areVarsEqual(ALPHA);

    oneWayAnova(categories, features, ALPHA, TO_VALIDATE);
  }, {timeout: TIMEOUT, benchmark: true});

  // Compares the ANOVA table computed for the validation columns with the EXPECTED values
  test(`Correctness`, async () => {
    const analysis = oneWayAnova(CATEGORIES_COL, FEATURES_COL, ALPHA, TO_VALIDATE);
    const anova = analysis.anovaTable;

    // check degrees of freedom (df-s): compared exactly
    expect(anova.dfBn, EXPECTED.DF_BN, 'Incorrect degrees of freedom: dfBn');
    expect(anova.dfTot, EXPECTED.DF_TOT, 'Incorrect degrees of freedom: dfTot');
    expect(anova.dfWn, EXPECTED.DF_WN, 'Incorrect degrees of freedom: dfWn');

    // the remaining values are compared up to the absolute tolerance ERR
    const eq = (x: number, y: number) => Math.abs(x - y) < ERR;

    // check sum of squares (ss-s)
    expect(eq(anova.ssBn, EXPECTED.SS_BN), true, 'Incorrect sum of squares: ssBn');
    expect(eq(anova.ssTot, EXPECTED.SS_TOT), true, 'Incorrect sum of squares: ssTot');
    expect(eq(anova.ssWn, EXPECTED.SS_WN), true, 'Incorrect sum of squares: ssWn');

    // check mean squares (ms-s)
    expect(eq(anova.msBn, EXPECTED.MS_BN), true, 'Incorrect mean squares: msBn');
    expect(eq(anova.msWn, EXPECTED.MS_WN), true, 'Incorrect mean squares: msWn');

    // check F-statistics
    expect(eq(anova.fStat, EXPECTED.F_STAT), true, 'Incorrect F-statistics value');

    // check p-value
    expect(eq(anova.pValue, EXPECTED.P_VAL), true, 'Incorrect p-value');

    // check F-critical
    expect(eq(analysis.fCritical, EXPECTED.F_CRIT), true, 'Incorrect F-critical');
  }, {timeout: TIMEOUT, benchmark: true});
});
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
// Tests for missing values imputation
|
|
2
|
+
|
|
3
|
+
import * as grok from 'datagrok-api/grok';
|
|
4
|
+
import * as ui from 'datagrok-api/ui';
|
|
5
|
+
import * as DG from 'datagrok-api/dg';
|
|
6
|
+
import {_package} from '../package-test';
|
|
7
|
+
|
|
8
|
+
import {category, expect, test} from '@datagrok-libraries/utils/src/test';
|
|
9
|
+
|
|
10
|
+
import {MetricInfo, DISTANCE_TYPE, impute} from '../missing-values-imputation/knn-imputer';
|
|
11
|
+
import {getFeatureInputSettings} from '../missing-values-imputation/ui';
|
|
12
|
+
import {dataWithMissingVals} from './utils';
|
|
13
|
+
|
|
14
|
+
/** One thousand: multiplier that turns ROWS_K into a row count */
const K = 1000;
/** Size of the generated dataframe, in thousands of rows */
const ROWS_K = 100;

/** Number of generated integer columns */
const INT_COLS = 5;
/** Number of generated float columns */
const FLOAT_COLS = 5;
/** Number of generated string columns */
const STRING_COLS = 5;
/** Total number of generated columns */
const TOTAL_COLS = INT_COLS + FLOAT_COLS + STRING_COLS;

/** Missing values requested per generated column */
const MIS_VALS_COUNT = 5;
/** Neighbors count passed to the kNN imputer */
const NEIGHBORS = 5;
/** Timeout passed in the test options */
const TIMEOUT = 10000;
|
|
23
|
+
|
|
24
|
+
/** Registers a kNN imputation test for the given distance: generates data with missing values,
 *  runs the imputer with default per-column settings and expects no failed imputations. */
const testKNN = (dist: DISTANCE_TYPE) => {
  const title = `${dist} dist, ${ROWS_K}K rows, ${TOTAL_COLS} cols, ${MIS_VALS_COUNT * TOTAL_COLS} missing vals`;

  test(title, async () => {
    // Generated dataframe & positions of its missing values
    const {df, misValsIds} = dataWithMissingVals(ROWS_K * K, INT_COLS, FLOAT_COLS, STRING_COLS, MIS_VALS_COUNT);

    // Every column is an imputation target
    const targetColNames = df.columns.names();

    // Default weight & metric for each column, chosen by its type
    const featuresMetrics = new Map<string, MetricInfo>();
    for (const col of df.columns) {
      const {defaultWeight, defaultMetric} = getFeatureInputSettings(col.type as DG.COLUMN_TYPE);
      featuresMetrics.set(col.name, {weight: defaultWeight, type: defaultMetric});
    }

    // Run the imputer; the result holds the indices it failed to impute
    const failedToImput = impute(df, targetColNames, featuresMetrics, misValsIds, dist, NEIGHBORS, true);

    // Total number of failures over all entries
    let fails = 0;
    failedToImput.forEach((inds) => {
      fails += inds.length;
    });

    expect(fails, 0, `Failed to impute ${fails} missing values`);
  }, {timeout: TIMEOUT, benchmark: true});
};
|
|
54
|
+
|
|
55
|
+
// One kNN imputation test per supported distance, registered in a fixed order
category(`Missing values imputation`, () => {
  for (const dist of [DISTANCE_TYPE.EUCLIDEAN, DISTANCE_TYPE.MANHATTAN])
    testKNN(dist);
});
|
package/src/tests/utils.ts
CHANGED
|
@@ -6,6 +6,10 @@ const TRESHOLD = 0.5;
|
|
|
6
6
|
const SHIFT = 1;
|
|
7
7
|
const LIMIT = 2;
|
|
8
8
|
|
|
9
|
+
/** Generated integer values are Math.floor(Math.random() * MAX_INT), i.e. lie in [0, MAX_INT) */
const MAX_INT = 10;

/** Generated float values are Math.random() * MAX_FLOAT, i.e. lie in [0, MAX_FLOAT) */
const MAX_FLOAT = 10;

/** Pool of values for generated string columns */
const CATEGORIES = [
  'Alpha',
  'Beta',
  'Gamma',
  'Delta',
];
|
|
12
|
+
|
|
9
13
|
/** Check lengths of columns */
|
|
10
14
|
function checkLen(target: DG.Column, prediction: DG.Column): void {
|
|
11
15
|
if (target.length !== prediction.length)
|
|
@@ -119,3 +123,74 @@ export function accuracy(target: DG.Column, prediction: DG.Column): number {
|
|
|
119
123
|
|
|
120
124
|
return correctPredictions / rows;
|
|
121
125
|
}
|
|
126
|
+
|
|
127
|
+
/** Returns `count` distinct random row indices from [0, rows); `count` is clamped to [0, rows]. */
function pickDistinctRowIndices(rows: number, count: number): number[] {
  const total = Math.max(0, Math.floor(rows));
  const wanted = Math.min(Math.max(0, Math.floor(count)), total);

  // Dense request: a partial Fisher-Yates shuffle avoids long rejection runs
  if (wanted * 2 > total) {
    const pool = Array.from({length: total}, (_, i) => i);

    for (let i = 0; i < wanted; ++i) {
      const j = i + Math.floor(Math.random() * (total - i));
      const tmp = pool[i];
      pool[i] = pool[j];
      pool[j] = tmp;
    }

    return pool.slice(0, wanted);
  }

  // Sparse request: redraw on collision until enough distinct indices are collected
  const picked = new Set<number>();

  while (picked.size < wanted)
    picked.add(Math.floor(total * Math.random()));

  return Array.from(picked);
}

/** Return dataframe with missing values
 *
 * Columns are named `int #k`, `float #k` and `str #k` (k starts at 1) and are filled with random
 * values; then missing values are placed at distinct random rows of each column.
 *
 * @param rows number of rows
 * @param intCols number of integer columns
 * @param floatCols number of float columns
 * @param strCols number of string columns
 * @param misValCount number of missing values per column (at most `rows` are placed)
 * @returns the dataframe and, for each column name, the row indices of its missing values
 */
export function dataWithMissingVals(rows: number, intCols: number, floatCols: number,
  strCols: number, misValCount: number): {df: DG.DataFrame, misValsIds: Map<string, number[]>} {
  const catsCount = CATEGORIES.length;
  const cols: DG.Column[] = [];
  const misValsIds = new Map<string, number[]>();

  // Integer columns
  for (let j = 0; j < intCols; ++j) {
    const arr = new Int32Array(rows);
    const name = `int #${j + 1}`;

    for (let i = 0; i < rows; ++i)
      arr[i] = Math.floor(Math.random() * MAX_INT);

    // Distinct indices: each one yields exactly one missing value
    const indices = pickDistinctRowIndices(rows, misValCount);

    for (const idx of indices)
      arr[idx] = DG.INT_NULL;

    cols.push(DG.Column.fromInt32Array(name, arr));
    misValsIds.set(name, indices);
  }

  // Float columns
  for (let j = 0; j < floatCols; ++j) {
    const arr = new Float32Array(rows);
    const name = `float #${j + 1}`;

    for (let i = 0; i < rows; ++i)
      arr[i] = Math.random() * MAX_FLOAT;

    const indices = pickDistinctRowIndices(rows, misValCount);

    for (const idx of indices)
      arr[idx] = DG.FLOAT_NULL;

    cols.push(DG.Column.fromFloat32Array(name, arr));
    misValsIds.set(name, indices);
  }

  // String columns
  for (let j = 0; j < strCols; ++j) {
    const arr = new Array<string>(rows);
    const name = `str #${j + 1}`;

    for (let i = 0; i < rows; ++i)
      arr[i] = CATEGORIES[Math.floor(Math.random() * catsCount)];

    const col = DG.Column.fromStrings(name, arr);
    const indices = pickDistinctRowIndices(rows, misValCount);

    // Nulls are set on the created column, not in the source array
    for (const idx of indices)
      col.set(idx, null);

    cols.push(col);
    misValsIds.set(name, indices);
  }

  return {
    df: DG.DataFrame.fromColumns(cols),
    misValsIds: misValsIds,
  };
} // dataWithMissingVals
|