@datagrok/eda 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +1 -1
- package/src/missing-values-imputation/knn-imputer.ts +100 -91
- package/src/missing-values-imputation/ui-constants.ts +2 -2
- package/src/missing-values-imputation/ui.ts +66 -44
- package/src/package-test.ts +1 -0
- package/src/tests/mis-vals-imputation-tests.ts +58 -0
- package/src/tests/utils.ts +75 -0
package/package.json
CHANGED
|
@@ -18,24 +18,24 @@ export const SUPPORTED_COLUMN_TYPES = [
|
|
|
18
18
|
/** Return null value with respect to the column type */
|
|
19
19
|
export function getNullValue(col: DG.Column): number {
|
|
20
20
|
switch (col.type) {
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
}
|
|
21
|
+
case DG.COLUMN_TYPE.INT:
|
|
22
|
+
return DG.INT_NULL;
|
|
23
|
+
|
|
24
|
+
case DG.COLUMN_TYPE.FLOAT:
|
|
25
|
+
return DG.FLOAT_NULL;
|
|
26
|
+
|
|
27
|
+
case DG.COLUMN_TYPE.QNUM:
|
|
28
|
+
return DG.FLOAT_NULL;
|
|
29
|
+
|
|
30
|
+
case DG.COLUMN_TYPE.DATE_TIME:
|
|
31
|
+
return DG.FLOAT_NULL;
|
|
32
|
+
|
|
33
|
+
case DG.COLUMN_TYPE.STRING:
|
|
34
|
+
return col.max;
|
|
35
|
+
|
|
36
|
+
default:
|
|
37
|
+
throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
|
|
38
|
+
}
|
|
39
39
|
}
|
|
40
40
|
|
|
41
41
|
/** Metric types (between column elements) */
|
|
@@ -76,8 +76,8 @@ type Item = {
|
|
|
76
76
|
|
|
77
77
|
/** Impute missing values using the KNN method and returns an array of items for which an imputation fails */
|
|
78
78
|
export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetrics: Map<string, MetricInfo>,
|
|
79
|
-
missingValsIndices: Map<string, number[]>, distance: DISTANCE_TYPE, neighbors: number,
|
|
80
|
-
{
|
|
79
|
+
missingValsIndices: Map<string, number[]>, distance: DISTANCE_TYPE, neighbors: number,
|
|
80
|
+
inPlace: boolean): Map<string, number[]> {
|
|
81
81
|
// 1. Check inputs completness
|
|
82
82
|
|
|
83
83
|
if (neighbors < MIN_NEIGHBORS)
|
|
@@ -91,16 +91,17 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
91
91
|
|
|
92
92
|
if (featuresMetrics.size === 0)
|
|
93
93
|
throw new Error(ERROR_MSG.KNN_NO_FEATURE_COLUMNS);
|
|
94
|
-
|
|
95
|
-
if (featuresMetrics.size === 1)
|
|
94
|
+
|
|
95
|
+
if (featuresMetrics.size === 1) {
|
|
96
96
|
targetColNames.forEach((name) => {
|
|
97
97
|
if (featuresMetrics.has(name))
|
|
98
98
|
throw new Error(`${ERROR_MSG.KNN_NO_FEATURE_COLUMNS} can be used for the column '${name}'`);
|
|
99
|
-
|
|
99
|
+
});
|
|
100
|
+
}
|
|
100
101
|
|
|
101
102
|
targetColNames.forEach((name) => {
|
|
102
103
|
if (!missingValsIndices.has(name))
|
|
103
|
-
throw new Error(`${ERROR_MSG.KNN_FAILS}: ${ERROR_MSG.WRONG_PREDICTIONS}`);
|
|
104
|
+
throw new Error(`${ERROR_MSG.KNN_FAILS}: ${ERROR_MSG.WRONG_PREDICTIONS}`);
|
|
104
105
|
});
|
|
105
106
|
|
|
106
107
|
const columns = df.columns;
|
|
@@ -140,21 +141,22 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
140
141
|
const feature = columns.byName(name);
|
|
141
142
|
featureSource.push(feature.getRawData());
|
|
142
143
|
featureNullVal.push(getNullValue(feature));
|
|
143
|
-
|
|
144
|
+
|
|
144
145
|
switch (metricInfo.type) {
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
146
|
+
case METRIC_TYPE.DIFFERENCE:
|
|
147
|
+
metricFunc.push((a: number, b: number) => metricInfo.weight * Math.abs(a - b));
|
|
148
|
+
break;
|
|
149
|
+
|
|
150
|
+
case METRIC_TYPE.ONE_HOT:
|
|
151
|
+
metricFunc.push((a: number, b: number) => metricInfo.weight * ((a === b) ? 0 : 1));
|
|
152
|
+
break;
|
|
153
|
+
|
|
154
|
+
default:
|
|
155
|
+
break;
|
|
155
156
|
}
|
|
156
|
-
|
|
157
|
-
|
|
157
|
+
}
|
|
158
|
+
});
|
|
159
|
+
|
|
158
160
|
const featuresCount = featureSource.length;
|
|
159
161
|
const properIndices = new Uint32Array(featureSource.length);
|
|
160
162
|
const bufferVector = new Float32Array(featureSource.length);
|
|
@@ -173,19 +175,20 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
173
175
|
/** Obtain proper indices for KNN: features with missing vals are skipped */
|
|
174
176
|
const getProperIndeces = (idx: number) => {
|
|
175
177
|
properIndicesCount = 0;
|
|
176
|
-
|
|
177
|
-
for (let i = 0; i < featuresCount; ++i)
|
|
178
|
+
|
|
179
|
+
for (let i = 0; i < featuresCount; ++i) {
|
|
178
180
|
if (featureSource[i][idx] !== featureNullVal[i]) {
|
|
179
181
|
properIndices[properIndicesCount] = i;
|
|
180
182
|
++properIndicesCount;
|
|
181
183
|
}
|
|
184
|
+
}
|
|
182
185
|
};
|
|
183
186
|
|
|
184
187
|
/** Compute buffer vector */
|
|
185
188
|
const computeBufferVector = (idx: number, cur: number) => {
|
|
186
189
|
properIndices.forEach((properIndex, k) => {
|
|
187
190
|
bufferVector[k] = metricFunc[properIndex](featureSource[properIndex][idx], featureSource[properIndex][cur]);
|
|
188
|
-
})
|
|
191
|
+
});
|
|
189
192
|
};
|
|
190
193
|
|
|
191
194
|
/** Euclidean distance function */
|
|
@@ -201,10 +204,10 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
201
204
|
/** Manhattan distance function */
|
|
202
205
|
const manhattanDistFunc = () => {
|
|
203
206
|
let sum = 0;
|
|
204
|
-
|
|
207
|
+
|
|
205
208
|
for (let i = 0; i < properIndicesCount; ++i)
|
|
206
209
|
sum += Math.abs(bufferVector[i]);
|
|
207
|
-
|
|
210
|
+
|
|
208
211
|
return Math.sqrt(sum);
|
|
209
212
|
};
|
|
210
213
|
|
|
@@ -216,16 +219,17 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
216
219
|
if (source[cur] === nullValue)
|
|
217
220
|
return false;
|
|
218
221
|
|
|
219
|
-
for (let i = 0; i < properIndicesCount; ++i)
|
|
222
|
+
for (let i = 0; i < properIndicesCount; ++i) {
|
|
220
223
|
if (featureSource[properIndices[i]][cur] === featureNullVal[properIndices[i]])
|
|
221
224
|
return false;
|
|
225
|
+
}
|
|
222
226
|
|
|
223
227
|
return true;
|
|
224
228
|
};
|
|
225
229
|
|
|
226
230
|
/** Return the most frequent of the nearest items (for categorial data) */
|
|
227
231
|
const mostFrequentOfTheNearestItems = () => {
|
|
228
|
-
frequencies.forEach((v, i,arr) => arr[i] = 0);
|
|
232
|
+
frequencies.forEach((v, i, arr) => arr[i] = 0);
|
|
229
233
|
let i = 0;
|
|
230
234
|
|
|
231
235
|
for (i = 0; i < nearestItemsCount; ++i)
|
|
@@ -233,7 +237,7 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
233
237
|
|
|
234
238
|
let maxFreq = frequencies[0];
|
|
235
239
|
let maxFreqIdx = 0;
|
|
236
|
-
|
|
240
|
+
|
|
237
241
|
frequencies.forEach((v, i) => {
|
|
238
242
|
if (v > maxFreq) {
|
|
239
243
|
maxFreq = v;
|
|
@@ -245,7 +249,7 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
245
249
|
};
|
|
246
250
|
|
|
247
251
|
/** Get imputation value */
|
|
248
|
-
const getFillValue = (idx: number) => {
|
|
252
|
+
const getFillValue = (idx: number) => {
|
|
249
253
|
getProperIndeces(idx);
|
|
250
254
|
|
|
251
255
|
// check available features
|
|
@@ -255,9 +259,9 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
255
259
|
nearestItemsCount = 0;
|
|
256
260
|
|
|
257
261
|
// search for the closest items
|
|
258
|
-
for (let cur = 0; cur < len; ++cur)
|
|
262
|
+
for (let cur = 0; cur < len; ++cur) {
|
|
259
263
|
if (canItemBeUsed(cur) && (cur !== idx)) {
|
|
260
|
-
|
|
264
|
+
// 1) compute distance between cur-th and idx-th items
|
|
261
265
|
computeBufferVector(idx, cur);
|
|
262
266
|
const curDist = dist();
|
|
263
267
|
|
|
@@ -265,24 +269,25 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
265
269
|
if (nearestItemsCount < neighbors) {
|
|
266
270
|
nearestItems[nearestItemsCount] = {index: cur, dist: curDist};
|
|
267
271
|
++nearestItemsCount;
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
// 2.1) find the farest
|
|
272
|
+
} else {
|
|
273
|
+
// 2.1) find the farest
|
|
271
274
|
maxInd = 0;
|
|
272
275
|
maxDist = nearestItems[0].dist;
|
|
273
276
|
|
|
274
|
-
for(let i = 1; i < nearestItemsCount; ++i)
|
|
277
|
+
for (let i = 1; i < nearestItemsCount; ++i) {
|
|
275
278
|
if (maxDist < nearestItems[i].dist) {
|
|
276
279
|
maxDist = nearestItems[i].dist;
|
|
277
280
|
maxInd = i;
|
|
278
281
|
}
|
|
279
|
-
|
|
282
|
+
}
|
|
283
|
+
|
|
280
284
|
// 2.2) replace
|
|
281
285
|
if (curDist < maxDist)
|
|
282
286
|
nearestItems[maxInd] = {index: cur, dist: curDist};
|
|
283
287
|
} // else
|
|
284
|
-
}
|
|
285
|
-
|
|
288
|
+
}
|
|
289
|
+
} // for cur
|
|
290
|
+
|
|
286
291
|
// check found nearest items
|
|
287
292
|
if (nearestItemsCount === 0)
|
|
288
293
|
throw new Error(`${ERROR_MSG.KNN_IMPOSSIBLE_IMPUTATION}: the column "${col.name}", row ${idx + 1}`);
|
|
@@ -293,35 +298,35 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
293
298
|
// compute fill value
|
|
294
299
|
sum = 0;
|
|
295
300
|
for (let i = 0; i < nearestItemsCount; ++i)
|
|
296
|
-
sum += source[nearestItems[i].index];
|
|
301
|
+
sum += source[nearestItems[i].index];
|
|
297
302
|
|
|
298
303
|
fillValue = sum / nearestItemsCount;
|
|
299
304
|
|
|
300
305
|
if (col.type === DG.COLUMN_TYPE.INT)
|
|
301
306
|
return Math.round(fillValue);
|
|
302
307
|
|
|
303
|
-
return fillValue;
|
|
308
|
+
return fillValue;
|
|
304
309
|
}; // getFillValue
|
|
305
|
-
|
|
310
|
+
|
|
306
311
|
if (inPlace) {
|
|
307
312
|
// use indices found previousely
|
|
308
|
-
for (const i of missingValsIndices.get(name)!)
|
|
313
|
+
for (const i of missingValsIndices.get(name)!) {
|
|
309
314
|
try {
|
|
310
315
|
source[i] = getFillValue(i);
|
|
311
|
-
}
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
+
} catch (err) {
|
|
317
|
+
failedToImputeIndices.push(i);
|
|
318
|
+
|
|
319
|
+
if (!(err instanceof Error))
|
|
320
|
+
grok.shell.error(ERROR_MSG.CORE_ISSUE);
|
|
316
321
|
}
|
|
317
|
-
|
|
322
|
+
}
|
|
323
|
+
|
|
318
324
|
if (failedToImputeIndices.length > 0)
|
|
319
325
|
failedToImpute.set(name, failedToImputeIndices);
|
|
320
326
|
|
|
321
|
-
// to reset view
|
|
327
|
+
// to reset view
|
|
322
328
|
col.set(0, col.get(0));
|
|
323
|
-
}
|
|
324
|
-
else {
|
|
329
|
+
} else {
|
|
325
330
|
//@ts-ignore
|
|
326
331
|
const copy = col.clone();
|
|
327
332
|
|
|
@@ -335,19 +340,20 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
|
|
|
335
340
|
}
|
|
336
341
|
|
|
337
342
|
copy.name = copyName;
|
|
338
|
-
|
|
343
|
+
|
|
339
344
|
const copySource = copy.getRawData();
|
|
340
345
|
|
|
341
346
|
// use indices found previousely
|
|
342
|
-
for (const i of missingValsIndices.get(name)!)
|
|
347
|
+
for (const i of missingValsIndices.get(name)!) {
|
|
343
348
|
try {
|
|
344
349
|
copySource[i] = getFillValue(i);
|
|
345
|
-
}
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
+
} catch (err) {
|
|
351
|
+
failedToImputeIndices.push(i);
|
|
352
|
+
|
|
353
|
+
if (!(err instanceof Error))
|
|
354
|
+
grok.shell.error(ERROR_MSG.CORE_ISSUE);
|
|
350
355
|
}
|
|
356
|
+
}
|
|
351
357
|
|
|
352
358
|
if (failedToImputeIndices.length > 0)
|
|
353
359
|
failedToImpute.set(copyName, failedToImputeIndices);
|
|
@@ -374,7 +380,7 @@ export function getMissingValsIndices(columns: DG.Column[]): Map<string, number[
|
|
|
374
380
|
|
|
375
381
|
const indices = [] as number[];
|
|
376
382
|
const nullValue = getNullValue(col);
|
|
377
|
-
|
|
383
|
+
|
|
378
384
|
col.getRawData().forEach((val, idx) => {
|
|
379
385
|
if (val === nullValue)
|
|
380
386
|
indices.push(idx);
|
|
@@ -387,11 +393,13 @@ export function getMissingValsIndices(columns: DG.Column[]): Map<string, number[
|
|
|
387
393
|
}
|
|
388
394
|
|
|
389
395
|
/** Predict existence of missing values imputation fails */
|
|
390
|
-
export function areThereFails(targetColNames: string[], featureColNames: string[],
|
|
396
|
+
export function areThereFails(targetColNames: string[], featureColNames: string[],
|
|
397
|
+
misValsInds: Map<string, number[]>): boolean {
|
|
391
398
|
// check feature columns
|
|
392
|
-
for (const name of featureColNames)
|
|
399
|
+
for (const name of featureColNames) {
|
|
393
400
|
if (!misValsInds.has(name))
|
|
394
401
|
return false;
|
|
402
|
+
}
|
|
395
403
|
|
|
396
404
|
// check target columns
|
|
397
405
|
for (const target of targetColNames) {
|
|
@@ -399,7 +407,7 @@ export function areThereFails(targetColNames: string[], featureColNames: string[
|
|
|
399
407
|
|
|
400
408
|
if (indices === undefined)
|
|
401
409
|
throw new Error(ERROR_MSG.FAILS_TO_PREDICT_IMPUTATION_FAILS);
|
|
402
|
-
|
|
410
|
+
|
|
403
411
|
for (const idx of indices) {
|
|
404
412
|
let failToImpute = true;
|
|
405
413
|
|
|
@@ -429,27 +437,28 @@ function getFirstNonNull<T>(col: DG.Column<T>): T {
|
|
|
429
437
|
const raw = col.getRawData();
|
|
430
438
|
const len = raw.length;
|
|
431
439
|
|
|
432
|
-
for (let i = 0; i < len; ++i)
|
|
440
|
+
for (let i = 0; i < len; ++i) {
|
|
433
441
|
if (raw[i] !== nullValue)
|
|
434
442
|
return col.get(i)!;
|
|
443
|
+
}
|
|
435
444
|
|
|
436
|
-
throw new Error(ERROR_MSG.EMPTY_COLUMN);
|
|
445
|
+
throw new Error(ERROR_MSG.EMPTY_COLUMN);
|
|
437
446
|
}
|
|
438
447
|
|
|
439
448
|
/** Return default fill value with respect to the column type */
|
|
440
449
|
function getDefaultFillValue<T>(col: DG.Column<T>): T {
|
|
441
450
|
switch (col.type) {
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
451
|
+
case DG.COLUMN_TYPE.STRING:
|
|
452
|
+
case DG.COLUMN_TYPE.DATE_TIME:
|
|
453
|
+
return getFirstNonNull(col); // TODO: replace by most frequent
|
|
445
454
|
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
455
|
+
case DG.COLUMN_TYPE.INT:
|
|
456
|
+
case DG.COLUMN_TYPE.FLOAT:
|
|
457
|
+
case DG.COLUMN_TYPE.QNUM:
|
|
458
|
+
return col.stats.avg as T;
|
|
450
459
|
|
|
451
|
-
|
|
452
|
-
|
|
460
|
+
default:
|
|
461
|
+
throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
|
|
453
462
|
}
|
|
454
463
|
}
|
|
455
464
|
|
|
@@ -459,7 +468,7 @@ export function imputeFailed(df: DG.DataFrame, failedToImpute: Map<string, numbe
|
|
|
459
468
|
const col = df.col(colName);
|
|
460
469
|
if (col !== null) {
|
|
461
470
|
if (!SUPPORTED_COLUMN_TYPES.includes(col.type))
|
|
462
|
-
throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
|
|
471
|
+
throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
|
|
463
472
|
|
|
464
473
|
const fillVal = getDefaultFillValue(col);
|
|
465
474
|
indices.forEach((idx) => col.set(idx, fillVal));
|
|
@@ -44,9 +44,9 @@ export enum TITLE {
|
|
|
44
44
|
};
|
|
45
45
|
|
|
46
46
|
/** Help links */
|
|
47
|
-
export const KNN_IMPUTER = '/help/
|
|
47
|
+
export const KNN_IMPUTER = '/help/explore/missing-values-imputation';
|
|
48
48
|
|
|
49
|
-
/** Tooltips */
|
|
49
|
+
/** Tooltips */
|
|
50
50
|
export enum HINT {
|
|
51
51
|
TARGET = 'Columns with missing values that must be filled',
|
|
52
52
|
FEATURES = "Columns with features to be used for determining the 'nearest' elements in the KNN method",
|
|
@@ -14,7 +14,7 @@ type FeatureInputSettings = {
|
|
|
14
14
|
};
|
|
15
15
|
|
|
16
16
|
/** Return default setting of the feature metric inputs */
|
|
17
|
-
function getFeatureInputSettings(type: DG.COLUMN_TYPE): FeatureInputSettings {
|
|
17
|
+
export function getFeatureInputSettings(type: DG.COLUMN_TYPE): FeatureInputSettings {
|
|
18
18
|
switch (type) {
|
|
19
19
|
case DG.COLUMN_TYPE.STRING:
|
|
20
20
|
case DG.COLUMN_TYPE.DATE_TIME:
|
|
@@ -61,9 +61,13 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
61
61
|
df.columns.toList()
|
|
62
62
|
.filter((col) => SUPPORTED_COLUMN_TYPES.includes(col.type))
|
|
63
63
|
.forEach((col) => {
|
|
64
|
+
const misValsCount = col.stats.missingValueCount;
|
|
65
|
+
if (misValsCount === col.length)
|
|
66
|
+
return;
|
|
67
|
+
|
|
64
68
|
availableFeatureColsNames.push(col.name);
|
|
65
69
|
|
|
66
|
-
if (
|
|
70
|
+
if (misValsCount > 0) {
|
|
67
71
|
colsWithMissingVals.push(col);
|
|
68
72
|
availableTargetColsNames.push(col.name);
|
|
69
73
|
}
|
|
@@ -96,14 +100,17 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
96
100
|
|
|
97
101
|
// Neighbors components
|
|
98
102
|
let neighbors = DEFAULT.NEIGHBORS;
|
|
99
|
-
const neighborsInput = ui.input.int(TITLE.NEIGHBORS, {
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
103
|
+
const neighborsInput = ui.input.int(TITLE.NEIGHBORS, {
|
|
104
|
+
value: neighbors,
|
|
105
|
+
showPlusMinus: true,
|
|
106
|
+
min: MIN_NEIGHBORS,
|
|
107
|
+
nullable: false,
|
|
108
|
+
onValueChanged: (value) => {
|
|
109
|
+
if ((value !== null) && (value >= MIN_NEIGHBORS))
|
|
110
|
+
neighbors = value;
|
|
111
|
+
checkApplicability();
|
|
112
|
+
},
|
|
113
|
+
});
|
|
107
114
|
neighborsInput.setTooltip(HINT.NEIGHBORS);
|
|
108
115
|
|
|
109
116
|
// Distance components
|
|
@@ -116,23 +123,32 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
116
123
|
|
|
117
124
|
// Target columns components (cols with missing values to be imputed)
|
|
118
125
|
let targetColNames = colsWithMissingVals.map((col) => col.name);
|
|
119
|
-
const targetColInput = ui.input.columns(TITLE.COLUMNS, {
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
126
|
+
const targetColInput = ui.input.columns(TITLE.COLUMNS, {
|
|
127
|
+
table: df,
|
|
128
|
+
value: df.columns.byNames(availableTargetColsNames),
|
|
129
|
+
onValueChanged: (value) => {
|
|
130
|
+
targetColNames = value.map((col) => col.name);
|
|
131
|
+
checkApplicability();
|
|
132
|
+
},
|
|
133
|
+
available: availableTargetColsNames,
|
|
134
|
+
});
|
|
123
135
|
targetColInput.setTooltip(HINT.TARGET);
|
|
124
136
|
|
|
125
137
|
// Feature columns components
|
|
126
138
|
let selectedFeatureColNames = availableFeatureColsNames as string[];
|
|
127
|
-
const featuresInput = ui.input.columns(TITLE.FEATURES, {
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
139
|
+
const featuresInput = ui.input.columns(TITLE.FEATURES, {
|
|
140
|
+
value: df.columns.byNames(availableFeatureColsNames),
|
|
141
|
+
table: df, onValueChanged: (value) => {
|
|
142
|
+
selectedFeatureColNames = value.map((col) => col.name);
|
|
143
|
+
|
|
144
|
+
if (selectedFeatureColNames.length > 0) {
|
|
145
|
+
checkApplicability();
|
|
146
|
+
metricInfoInputs.forEach((div, name) => div.hidden = !selectedFeatureColNames.includes(name));
|
|
147
|
+
} else
|
|
148
|
+
hideWidgets();
|
|
149
|
+
},
|
|
150
|
+
available: availableFeatureColsNames,
|
|
151
|
+
});
|
|
136
152
|
featuresInput.setTooltip(HINT.FEATURES);
|
|
137
153
|
|
|
138
154
|
/** Hide widgets (use if run is not applicable) */
|
|
@@ -147,7 +163,7 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
147
163
|
|
|
148
164
|
/** Show widgets (use if run is applicable) */
|
|
149
165
|
const showWidgets = () => {
|
|
150
|
-
dlg.getButton(TITLE.RUN).disabled =
|
|
166
|
+
dlg.getButton(TITLE.RUN).disabled = (neighborsInput.value === null) || (neighborsInput.value < MIN_NEIGHBORS);
|
|
151
167
|
distDiv.hidden = false;
|
|
152
168
|
inPlaceInput.root.hidden = false;
|
|
153
169
|
neighborsInput.root.hidden = false;
|
|
@@ -167,6 +183,9 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
167
183
|
}
|
|
168
184
|
});
|
|
169
185
|
}
|
|
186
|
+
|
|
187
|
+
if (targetColNames.length < 1)
|
|
188
|
+
hideWidgets();
|
|
170
189
|
};
|
|
171
190
|
|
|
172
191
|
// Metrics components
|
|
@@ -236,6 +255,28 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
236
255
|
resolve = res;
|
|
237
256
|
reject = rej;
|
|
238
257
|
});
|
|
258
|
+
|
|
259
|
+
dlg.addButton(TITLE.RUN, () => {
|
|
260
|
+
okClicked = true;
|
|
261
|
+
dlg.close();
|
|
262
|
+
availableFeatureColsNames.filter((name) => !selectedFeatureColNames.includes(name))
|
|
263
|
+
.forEach((name) => featuresMetrics.delete(name));
|
|
264
|
+
|
|
265
|
+
try {
|
|
266
|
+
const failedToImpute = impute(df!, targetColNames, featuresMetrics, misValsInds, distType, neighbors, inPlace);
|
|
267
|
+
|
|
268
|
+
if (!keepEmpty)
|
|
269
|
+
imputeFailed(df!, failedToImpute);
|
|
270
|
+
resolve();
|
|
271
|
+
} catch (err) {
|
|
272
|
+
if (err instanceof Error)
|
|
273
|
+
grok.shell.error(`${ERROR_MSG.KNN_FAILS}: ${err.message}`);
|
|
274
|
+
else
|
|
275
|
+
grok.shell.error(`${ERROR_MSG.KNN_FAILS}: ${ERROR_MSG.CORE_ISSUE}`);
|
|
276
|
+
reject(err);
|
|
277
|
+
}
|
|
278
|
+
});
|
|
279
|
+
|
|
239
280
|
dlg.add(targetColInput)
|
|
240
281
|
.add(featuresInput)
|
|
241
282
|
.add(distDiv)
|
|
@@ -244,26 +285,7 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
244
285
|
.add(inPlaceInput)
|
|
245
286
|
.add(keepEmptyInput)
|
|
246
287
|
.show()
|
|
247
|
-
.
|
|
248
|
-
okClicked = true;
|
|
249
|
-
dlg.close();
|
|
250
|
-
availableFeatureColsNames.filter((name) => !selectedFeatureColNames.includes(name))
|
|
251
|
-
.forEach((name) => featuresMetrics.delete(name));
|
|
252
|
-
|
|
253
|
-
try {
|
|
254
|
-
const failedToImpute = impute(df!, targetColNames, featuresMetrics, misValsInds, distType, neighbors, inPlace);
|
|
255
|
-
|
|
256
|
-
if (!keepEmpty)
|
|
257
|
-
imputeFailed(df!, failedToImpute);
|
|
258
|
-
resolve();
|
|
259
|
-
} catch (err) {
|
|
260
|
-
if (err instanceof Error)
|
|
261
|
-
grok.shell.error(`${ERROR_MSG.KNN_FAILS}: ${err.message}`);
|
|
262
|
-
else
|
|
263
|
-
grok.shell.error(`${ERROR_MSG.KNN_FAILS}: ${ERROR_MSG.CORE_ISSUE}`);
|
|
264
|
-
reject(err);
|
|
265
|
-
}
|
|
266
|
-
}).onClose.subscribe(() => !okClicked && resolve());
|
|
288
|
+
.onClose.subscribe(() => !okClicked && resolve());
|
|
267
289
|
|
|
268
290
|
return promise;
|
|
269
291
|
} // runKNNImputer
|
package/src/package-test.ts
CHANGED
|
@@ -3,6 +3,7 @@ import {runTests, tests, TestContext} from '@datagrok-libraries/utils/src/test';
|
|
|
3
3
|
import './tests/dim-reduction-tests';
|
|
4
4
|
import './tests/linear-methods-tests';
|
|
5
5
|
import './tests/classifiers-tests';
|
|
6
|
+
import './tests/mis-vals-imputation-tests';
|
|
6
7
|
export const _package = new DG.Package();
|
|
7
8
|
export {tests};
|
|
8
9
|
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
// Tests for missing values imputation
|
|
2
|
+
|
|
3
|
+
import * as grok from 'datagrok-api/grok';
|
|
4
|
+
import * as ui from 'datagrok-api/ui';
|
|
5
|
+
import * as DG from 'datagrok-api/dg';
|
|
6
|
+
import {_package} from '../package-test';
|
|
7
|
+
|
|
8
|
+
import {category, expect, test} from '@datagrok-libraries/utils/src/test';
|
|
9
|
+
|
|
10
|
+
import {MetricInfo, DISTANCE_TYPE, impute} from '../missing-values-imputation/knn-imputer';
|
|
11
|
+
import {getFeatureInputSettings} from '../missing-values-imputation/ui';
|
|
12
|
+
import {dataWithMissingVals} from './utils';
|
|
13
|
+
|
|
14
|
+
const ROWS_K = 100;
|
|
15
|
+
const K = 1000;
|
|
16
|
+
const INT_COLS = 5;
|
|
17
|
+
const FLOAT_COLS = 5;
|
|
18
|
+
const STRING_COLS = 5;
|
|
19
|
+
const MIS_VALS_COUNT = 5;
|
|
20
|
+
const NEIGHBORS = 5;
|
|
21
|
+
const TIMEOUT = 10000;
|
|
22
|
+
const TOTAL_COLS = INT_COLS + FLOAT_COLS + STRING_COLS;
|
|
23
|
+
|
|
24
|
+
const testKNN = (dist: DISTANCE_TYPE) => {
|
|
25
|
+
test(`${dist} dist, ${ROWS_K}K rows, ${TOTAL_COLS} cols, ${MIS_VALS_COUNT * TOTAL_COLS} missing vals`, async () => {
|
|
26
|
+
// Data
|
|
27
|
+
const data = dataWithMissingVals(ROWS_K * K, INT_COLS, FLOAT_COLS, STRING_COLS, MIS_VALS_COUNT);
|
|
28
|
+
const df = data.df;
|
|
29
|
+
const cols = df.columns;
|
|
30
|
+
|
|
31
|
+
// Inputs for kNN imputer
|
|
32
|
+
const targetColNames = cols.names();
|
|
33
|
+
const featuresMetrics = new Map<string, MetricInfo>();
|
|
34
|
+
const missingValsIndices = data.misValsIds;
|
|
35
|
+
|
|
36
|
+
// Imputation settings
|
|
37
|
+
for (const col of df.columns) {
|
|
38
|
+
const settings = getFeatureInputSettings(col.type as DG.COLUMN_TYPE);
|
|
39
|
+
featuresMetrics.set(col.name, {
|
|
40
|
+
weight: settings.defaultWeight,
|
|
41
|
+
type: settings.defaultMetric,
|
|
42
|
+
});
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// Impute missing values & get fails
|
|
46
|
+
const failedToImput = impute(df, targetColNames, featuresMetrics, missingValsIndices, dist, NEIGHBORS, true);
|
|
47
|
+
|
|
48
|
+
// Check fails
|
|
49
|
+
let fails = 0;
|
|
50
|
+
failedToImput.forEach((inds, _) => fails += inds.length);
|
|
51
|
+
expect(fails, 0, `Failed to impute ${fails} missing values`);
|
|
52
|
+
}, {timeout: TIMEOUT, benchmark: true});
|
|
53
|
+
};
|
|
54
|
+
|
|
55
|
+
category(`Missing values imputation`, () => {
|
|
56
|
+
testKNN(DISTANCE_TYPE.EUCLIDEAN);
|
|
57
|
+
testKNN(DISTANCE_TYPE.MANHATTAN);
|
|
58
|
+
});
|