@datagrok/eda 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@datagrok/eda",
3
3
  "friendlyName": "EDA",
4
- "version": "1.2.0",
4
+ "version": "1.2.1",
5
5
  "description": "Exploratory Data Analysis Tools",
6
6
  "dependencies": {
7
7
  "@datagrok-libraries/math": "^1.2.0",
@@ -18,24 +18,24 @@ export const SUPPORTED_COLUMN_TYPES = [
18
18
  /** Return null value with respect to the column type */
19
19
  export function getNullValue(col: DG.Column): number {
20
20
  switch (col.type) {
21
- case DG.COLUMN_TYPE.INT:
22
- return DG.INT_NULL;
23
-
24
- case DG.COLUMN_TYPE.FLOAT:
25
- return DG.FLOAT_NULL;
26
-
27
- case DG.COLUMN_TYPE.QNUM:
28
- return DG.FLOAT_NULL;
29
-
30
- case DG.COLUMN_TYPE.DATE_TIME:
31
- return DG.FLOAT_NULL;
32
-
33
- case DG.COLUMN_TYPE.STRING:
34
- return col.max;
35
-
36
- default:
37
- throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
38
- }
21
+ case DG.COLUMN_TYPE.INT:
22
+ return DG.INT_NULL;
23
+
24
+ case DG.COLUMN_TYPE.FLOAT:
25
+ return DG.FLOAT_NULL;
26
+
27
+ case DG.COLUMN_TYPE.QNUM:
28
+ return DG.FLOAT_NULL;
29
+
30
+ case DG.COLUMN_TYPE.DATE_TIME:
31
+ return DG.FLOAT_NULL;
32
+
33
+ case DG.COLUMN_TYPE.STRING:
34
+ return col.max;
35
+
36
+ default:
37
+ throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
38
+ }
39
39
  }
40
40
 
41
41
  /** Metric types (between column elements) */
@@ -76,8 +76,8 @@ type Item = {
76
76
 
77
77
  /** Impute missing values using the KNN method and returns an array of items for which an imputation fails */
78
78
  export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetrics: Map<string, MetricInfo>,
79
- missingValsIndices: Map<string, number[]>, distance: DISTANCE_TYPE, neighbors: number, inPlace: boolean): Map<string, number[]>
80
- {
79
+ missingValsIndices: Map<string, number[]>, distance: DISTANCE_TYPE, neighbors: number,
80
+ inPlace: boolean): Map<string, number[]> {
81
81
  // 1. Check inputs completness
82
82
 
83
83
  if (neighbors < MIN_NEIGHBORS)
@@ -91,16 +91,17 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
91
91
 
92
92
  if (featuresMetrics.size === 0)
93
93
  throw new Error(ERROR_MSG.KNN_NO_FEATURE_COLUMNS);
94
-
95
- if (featuresMetrics.size === 1)
94
+
95
+ if (featuresMetrics.size === 1) {
96
96
  targetColNames.forEach((name) => {
97
97
  if (featuresMetrics.has(name))
98
98
  throw new Error(`${ERROR_MSG.KNN_NO_FEATURE_COLUMNS} can be used for the column '${name}'`);
99
- });
99
+ });
100
+ }
100
101
 
101
102
  targetColNames.forEach((name) => {
102
103
  if (!missingValsIndices.has(name))
103
- throw new Error(`${ERROR_MSG.KNN_FAILS}: ${ERROR_MSG.WRONG_PREDICTIONS}`);
104
+ throw new Error(`${ERROR_MSG.KNN_FAILS}: ${ERROR_MSG.WRONG_PREDICTIONS}`);
104
105
  });
105
106
 
106
107
  const columns = df.columns;
@@ -140,21 +141,22 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
140
141
  const feature = columns.byName(name);
141
142
  featureSource.push(feature.getRawData());
142
143
  featureNullVal.push(getNullValue(feature));
143
-
144
+
144
145
  switch (metricInfo.type) {
145
- case METRIC_TYPE.DIFFERENCE:
146
- metricFunc.push((a: number, b: number) => metricInfo.weight * Math.abs(a - b));
147
- break;
148
-
149
- case METRIC_TYPE.ONE_HOT:
150
- metricFunc.push((a: number, b: number) => metricInfo.weight * ((a === b) ? 0 : 1));
151
- break;
152
-
153
- default:
154
- break;
146
+ case METRIC_TYPE.DIFFERENCE:
147
+ metricFunc.push((a: number, b: number) => metricInfo.weight * Math.abs(a - b));
148
+ break;
149
+
150
+ case METRIC_TYPE.ONE_HOT:
151
+ metricFunc.push((a: number, b: number) => metricInfo.weight * ((a === b) ? 0 : 1));
152
+ break;
153
+
154
+ default:
155
+ break;
155
156
  }
156
- }});
157
-
157
+ }
158
+ });
159
+
158
160
  const featuresCount = featureSource.length;
159
161
  const properIndices = new Uint32Array(featureSource.length);
160
162
  const bufferVector = new Float32Array(featureSource.length);
@@ -173,19 +175,20 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
173
175
  /** Obtain proper indices for KNN: features with missing vals are skipped */
174
176
  const getProperIndeces = (idx: number) => {
175
177
  properIndicesCount = 0;
176
-
177
- for (let i = 0; i < featuresCount; ++i)
178
+
179
+ for (let i = 0; i < featuresCount; ++i) {
178
180
  if (featureSource[i][idx] !== featureNullVal[i]) {
179
181
  properIndices[properIndicesCount] = i;
180
182
  ++properIndicesCount;
181
183
  }
184
+ }
182
185
  };
183
186
 
184
187
  /** Compute buffer vector */
185
188
  const computeBufferVector = (idx: number, cur: number) => {
186
189
  properIndices.forEach((properIndex, k) => {
187
190
  bufferVector[k] = metricFunc[properIndex](featureSource[properIndex][idx], featureSource[properIndex][cur]);
188
- })
191
+ });
189
192
  };
190
193
 
191
194
  /** Euclidean distance function */
@@ -201,10 +204,10 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
201
204
  /** Manhattan distance function */
202
205
  const manhattanDistFunc = () => {
203
206
  let sum = 0;
204
-
207
+
205
208
  for (let i = 0; i < properIndicesCount; ++i)
206
209
  sum += Math.abs(bufferVector[i]);
207
-
210
+
208
211
  return Math.sqrt(sum);
209
212
  };
210
213
 
@@ -216,16 +219,17 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
216
219
  if (source[cur] === nullValue)
217
220
  return false;
218
221
 
219
- for (let i = 0; i < properIndicesCount; ++i)
222
+ for (let i = 0; i < properIndicesCount; ++i) {
220
223
  if (featureSource[properIndices[i]][cur] === featureNullVal[properIndices[i]])
221
224
  return false;
225
+ }
222
226
 
223
227
  return true;
224
228
  };
225
229
 
226
230
  /** Return the most frequent of the nearest items (for categorial data) */
227
231
  const mostFrequentOfTheNearestItems = () => {
228
- frequencies.forEach((v, i,arr) => arr[i] = 0);
232
+ frequencies.forEach((v, i, arr) => arr[i] = 0);
229
233
  let i = 0;
230
234
 
231
235
  for (i = 0; i < nearestItemsCount; ++i)
@@ -233,7 +237,7 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
233
237
 
234
238
  let maxFreq = frequencies[0];
235
239
  let maxFreqIdx = 0;
236
-
240
+
237
241
  frequencies.forEach((v, i) => {
238
242
  if (v > maxFreq) {
239
243
  maxFreq = v;
@@ -245,7 +249,7 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
245
249
  };
246
250
 
247
251
  /** Get imputation value */
248
- const getFillValue = (idx: number) => {
252
+ const getFillValue = (idx: number) => {
249
253
  getProperIndeces(idx);
250
254
 
251
255
  // check available features
@@ -255,9 +259,9 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
255
259
  nearestItemsCount = 0;
256
260
 
257
261
  // search for the closest items
258
- for (let cur = 0; cur < len; ++cur)
262
+ for (let cur = 0; cur < len; ++cur) {
259
263
  if (canItemBeUsed(cur) && (cur !== idx)) {
260
- // 1) compute distance between cur-th and idx-th items
264
+ // 1) compute distance between cur-th and idx-th items
261
265
  computeBufferVector(idx, cur);
262
266
  const curDist = dist();
263
267
 
@@ -265,24 +269,25 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
265
269
  if (nearestItemsCount < neighbors) {
266
270
  nearestItems[nearestItemsCount] = {index: cur, dist: curDist};
267
271
  ++nearestItemsCount;
268
- }
269
- else {
270
- // 2.1) find the farest
272
+ } else {
273
+ // 2.1) find the farest
271
274
  maxInd = 0;
272
275
  maxDist = nearestItems[0].dist;
273
276
 
274
- for(let i = 1; i < nearestItemsCount; ++i)
277
+ for (let i = 1; i < nearestItemsCount; ++i) {
275
278
  if (maxDist < nearestItems[i].dist) {
276
279
  maxDist = nearestItems[i].dist;
277
280
  maxInd = i;
278
281
  }
279
-
282
+ }
283
+
280
284
  // 2.2) replace
281
285
  if (curDist < maxDist)
282
286
  nearestItems[maxInd] = {index: cur, dist: curDist};
283
287
  } // else
284
- } // for cur
285
-
288
+ }
289
+ } // for cur
290
+
286
291
  // check found nearest items
287
292
  if (nearestItemsCount === 0)
288
293
  throw new Error(`${ERROR_MSG.KNN_IMPOSSIBLE_IMPUTATION}: the column "${col.name}", row ${idx + 1}`);
@@ -293,35 +298,35 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
293
298
  // compute fill value
294
299
  sum = 0;
295
300
  for (let i = 0; i < nearestItemsCount; ++i)
296
- sum += source[nearestItems[i].index];
301
+ sum += source[nearestItems[i].index];
297
302
 
298
303
  fillValue = sum / nearestItemsCount;
299
304
 
300
305
  if (col.type === DG.COLUMN_TYPE.INT)
301
306
  return Math.round(fillValue);
302
307
 
303
- return fillValue;
308
+ return fillValue;
304
309
  }; // getFillValue
305
-
310
+
306
311
  if (inPlace) {
307
312
  // use indices found previousely
308
- for (const i of missingValsIndices.get(name)!)
313
+ for (const i of missingValsIndices.get(name)!) {
309
314
  try {
310
315
  source[i] = getFillValue(i);
311
- } catch (err) {
312
- failedToImputeIndices.push(i);
313
-
314
- if (!(err instanceof Error))
315
- grok.shell.error(ERROR_MSG.CORE_ISSUE);
316
+ } catch (err) {
317
+ failedToImputeIndices.push(i);
318
+
319
+ if (!(err instanceof Error))
320
+ grok.shell.error(ERROR_MSG.CORE_ISSUE);
316
321
  }
317
-
322
+ }
323
+
318
324
  if (failedToImputeIndices.length > 0)
319
325
  failedToImpute.set(name, failedToImputeIndices);
320
326
 
321
- // to reset view
327
+ // to reset view
322
328
  col.set(0, col.get(0));
323
- } // if
324
- else {
329
+ } else {
325
330
  //@ts-ignore
326
331
  const copy = col.clone();
327
332
 
@@ -335,19 +340,20 @@ export function impute(df: DG.DataFrame, targetColNames: string[], featuresMetri
335
340
  }
336
341
 
337
342
  copy.name = copyName;
338
-
343
+
339
344
  const copySource = copy.getRawData();
340
345
 
341
346
  // use indices found previousely
342
- for (const i of missingValsIndices.get(name)!)
347
+ for (const i of missingValsIndices.get(name)!) {
343
348
  try {
344
349
  copySource[i] = getFillValue(i);
345
- } catch (err) {
346
- failedToImputeIndices.push(i);
347
-
348
- if (!(err instanceof Error))
349
- grok.shell.error(ERROR_MSG.CORE_ISSUE);
350
+ } catch (err) {
351
+ failedToImputeIndices.push(i);
352
+
353
+ if (!(err instanceof Error))
354
+ grok.shell.error(ERROR_MSG.CORE_ISSUE);
350
355
  }
356
+ }
351
357
 
352
358
  if (failedToImputeIndices.length > 0)
353
359
  failedToImpute.set(copyName, failedToImputeIndices);
@@ -374,7 +380,7 @@ export function getMissingValsIndices(columns: DG.Column[]): Map<string, number[
374
380
 
375
381
  const indices = [] as number[];
376
382
  const nullValue = getNullValue(col);
377
-
383
+
378
384
  col.getRawData().forEach((val, idx) => {
379
385
  if (val === nullValue)
380
386
  indices.push(idx);
@@ -387,11 +393,13 @@ export function getMissingValsIndices(columns: DG.Column[]): Map<string, number[
387
393
  }
388
394
 
389
395
  /** Predict existence of missing values imputation fails */
390
- export function areThereFails(targetColNames: string[], featureColNames: string[], misValsInds: Map<string, number[]>): boolean {
396
+ export function areThereFails(targetColNames: string[], featureColNames: string[],
397
+ misValsInds: Map<string, number[]>): boolean {
391
398
  // check feature columns
392
- for (const name of featureColNames)
399
+ for (const name of featureColNames) {
393
400
  if (!misValsInds.has(name))
394
401
  return false;
402
+ }
395
403
 
396
404
  // check target columns
397
405
  for (const target of targetColNames) {
@@ -399,7 +407,7 @@ export function areThereFails(targetColNames: string[], featureColNames: string[
399
407
 
400
408
  if (indices === undefined)
401
409
  throw new Error(ERROR_MSG.FAILS_TO_PREDICT_IMPUTATION_FAILS);
402
-
410
+
403
411
  for (const idx of indices) {
404
412
  let failToImpute = true;
405
413
 
@@ -429,27 +437,28 @@ function getFirstNonNull<T>(col: DG.Column<T>): T {
429
437
  const raw = col.getRawData();
430
438
  const len = raw.length;
431
439
 
432
- for (let i = 0; i < len; ++i)
440
+ for (let i = 0; i < len; ++i) {
433
441
  if (raw[i] !== nullValue)
434
442
  return col.get(i)!;
443
+ }
435
444
 
436
- throw new Error(ERROR_MSG.EMPTY_COLUMN);
445
+ throw new Error(ERROR_MSG.EMPTY_COLUMN);
437
446
  }
438
447
 
439
448
  /** Return default fill value with respect to the column type */
440
449
  function getDefaultFillValue<T>(col: DG.Column<T>): T {
441
450
  switch (col.type) {
442
- case DG.COLUMN_TYPE.STRING:
443
- case DG.COLUMN_TYPE.DATE_TIME:
444
- return getFirstNonNull(col); // TODO: replace by most frequent
451
+ case DG.COLUMN_TYPE.STRING:
452
+ case DG.COLUMN_TYPE.DATE_TIME:
453
+ return getFirstNonNull(col); // TODO: replace by most frequent
445
454
 
446
- case DG.COLUMN_TYPE.INT:
447
- case DG.COLUMN_TYPE.FLOAT:
448
- case DG.COLUMN_TYPE.QNUM:
449
- return col.stats.avg as T;
455
+ case DG.COLUMN_TYPE.INT:
456
+ case DG.COLUMN_TYPE.FLOAT:
457
+ case DG.COLUMN_TYPE.QNUM:
458
+ return col.stats.avg as T;
450
459
 
451
- default:
452
- throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
460
+ default:
461
+ throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
453
462
  }
454
463
  }
455
464
 
@@ -459,7 +468,7 @@ export function imputeFailed(df: DG.DataFrame, failedToImpute: Map<string, numbe
459
468
  const col = df.col(colName);
460
469
  if (col !== null) {
461
470
  if (!SUPPORTED_COLUMN_TYPES.includes(col.type))
462
- throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
471
+ throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
463
472
 
464
473
  const fillVal = getDefaultFillValue(col);
465
474
  indices.forEach((idx) => col.set(idx, fillVal));
@@ -44,9 +44,9 @@ export enum TITLE {
44
44
  };
45
45
 
46
46
  /** Help links */
47
- export const KNN_IMPUTER = '/help/transform/missing-values-imputation';
47
+ export const KNN_IMPUTER = '/help/explore/missing-values-imputation';
48
48
 
49
- /** Tooltips */
49
+ /** Tooltips */
50
50
  export enum HINT {
51
51
  TARGET = 'Columns with missing values that must be filled',
52
52
  FEATURES = "Columns with features to be used for determining the 'nearest' elements in the KNN method",
@@ -14,7 +14,7 @@ type FeatureInputSettings = {
14
14
  };
15
15
 
16
16
  /** Return default setting of the feature metric inputs */
17
- function getFeatureInputSettings(type: DG.COLUMN_TYPE): FeatureInputSettings {
17
+ export function getFeatureInputSettings(type: DG.COLUMN_TYPE): FeatureInputSettings {
18
18
  switch (type) {
19
19
  case DG.COLUMN_TYPE.STRING:
20
20
  case DG.COLUMN_TYPE.DATE_TIME:
@@ -61,9 +61,13 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
61
61
  df.columns.toList()
62
62
  .filter((col) => SUPPORTED_COLUMN_TYPES.includes(col.type))
63
63
  .forEach((col) => {
64
+ const misValsCount = col.stats.missingValueCount;
65
+ if (misValsCount === col.length)
66
+ return;
67
+
64
68
  availableFeatureColsNames.push(col.name);
65
69
 
66
- if (col.stats.missingValueCount > 0) {
70
+ if (misValsCount > 0) {
67
71
  colsWithMissingVals.push(col);
68
72
  availableTargetColsNames.push(col.name);
69
73
  }
@@ -96,14 +100,17 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
96
100
 
97
101
  // Neighbors components
98
102
  let neighbors = DEFAULT.NEIGHBORS;
99
- const neighborsInput = ui.input.int(TITLE.NEIGHBORS, {value: neighbors, onValueChanged: (value) => {
100
- if (value === null)
101
- neighborsInput.value = neighbors;
102
- else if (value >= MIN_NEIGHBORS)
103
- neighbors = value;
104
- else
105
- neighborsInput.value = neighbors;
106
- }});
103
+ const neighborsInput = ui.input.int(TITLE.NEIGHBORS, {
104
+ value: neighbors,
105
+ showPlusMinus: true,
106
+ min: MIN_NEIGHBORS,
107
+ nullable: false,
108
+ onValueChanged: (value) => {
109
+ if ((value !== null) && (value >= MIN_NEIGHBORS))
110
+ neighbors = value;
111
+ checkApplicability();
112
+ },
113
+ });
107
114
  neighborsInput.setTooltip(HINT.NEIGHBORS);
108
115
 
109
116
  // Distance components
@@ -116,23 +123,32 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
116
123
 
117
124
  // Target columns components (cols with missing values to be imputed)
118
125
  let targetColNames = colsWithMissingVals.map((col) => col.name);
119
- const targetColInput = ui.input.columns(TITLE.COLUMNS, {table: df, value: df.columns.byNames(availableTargetColsNames), onValueChanged: (value) => {
120
- targetColNames = value.map((col) => col.name);
121
- checkApplicability();
122
- }, available: availableTargetColsNames});
126
+ const targetColInput = ui.input.columns(TITLE.COLUMNS, {
127
+ table: df,
128
+ value: df.columns.byNames(availableTargetColsNames),
129
+ onValueChanged: (value) => {
130
+ targetColNames = value.map((col) => col.name);
131
+ checkApplicability();
132
+ },
133
+ available: availableTargetColsNames,
134
+ });
123
135
  targetColInput.setTooltip(HINT.TARGET);
124
136
 
125
137
  // Feature columns components
126
138
  let selectedFeatureColNames = availableFeatureColsNames as string[];
127
- const featuresInput = ui.input.columns(TITLE.FEATURES, {value: df.columns.byNames(availableFeatureColsNames), table: df, onValueChanged: (value) => {
128
- selectedFeatureColNames = value.map((col) => col.name);
129
-
130
- if (selectedFeatureColNames.length > 0) {
131
- checkApplicability();
132
- metricInfoInputs.forEach((div, name) => div.hidden = !selectedFeatureColNames.includes(name));
133
- } else
134
- hideWidgets();
135
- }, available: availableFeatureColsNames});
139
+ const featuresInput = ui.input.columns(TITLE.FEATURES, {
140
+ value: df.columns.byNames(availableFeatureColsNames),
141
+ table: df, onValueChanged: (value) => {
142
+ selectedFeatureColNames = value.map((col) => col.name);
143
+
144
+ if (selectedFeatureColNames.length > 0) {
145
+ checkApplicability();
146
+ metricInfoInputs.forEach((div, name) => div.hidden = !selectedFeatureColNames.includes(name));
147
+ } else
148
+ hideWidgets();
149
+ },
150
+ available: availableFeatureColsNames,
151
+ });
136
152
  featuresInput.setTooltip(HINT.FEATURES);
137
153
 
138
154
  /** Hide widgets (use if run is not applicable) */
@@ -147,7 +163,7 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
147
163
 
148
164
  /** Show widgets (use if run is applicable) */
149
165
  const showWidgets = () => {
150
- dlg.getButton(TITLE.RUN).disabled = false;
166
+ dlg.getButton(TITLE.RUN).disabled = (neighborsInput.value === null) || (neighborsInput.value < MIN_NEIGHBORS);
151
167
  distDiv.hidden = false;
152
168
  inPlaceInput.root.hidden = false;
153
169
  neighborsInput.root.hidden = false;
@@ -167,6 +183,9 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
167
183
  }
168
184
  });
169
185
  }
186
+
187
+ if (targetColNames.length < 1)
188
+ hideWidgets();
170
189
  };
171
190
 
172
191
  // Metrics components
@@ -236,6 +255,28 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
236
255
  resolve = res;
237
256
  reject = rej;
238
257
  });
258
+
259
+ dlg.addButton(TITLE.RUN, () => {
260
+ okClicked = true;
261
+ dlg.close();
262
+ availableFeatureColsNames.filter((name) => !selectedFeatureColNames.includes(name))
263
+ .forEach((name) => featuresMetrics.delete(name));
264
+
265
+ try {
266
+ const failedToImpute = impute(df!, targetColNames, featuresMetrics, misValsInds, distType, neighbors, inPlace);
267
+
268
+ if (!keepEmpty)
269
+ imputeFailed(df!, failedToImpute);
270
+ resolve();
271
+ } catch (err) {
272
+ if (err instanceof Error)
273
+ grok.shell.error(`${ERROR_MSG.KNN_FAILS}: ${err.message}`);
274
+ else
275
+ grok.shell.error(`${ERROR_MSG.KNN_FAILS}: ${ERROR_MSG.CORE_ISSUE}`);
276
+ reject(err);
277
+ }
278
+ });
279
+
239
280
  dlg.add(targetColInput)
240
281
  .add(featuresInput)
241
282
  .add(distDiv)
@@ -244,26 +285,7 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
244
285
  .add(inPlaceInput)
245
286
  .add(keepEmptyInput)
246
287
  .show()
247
- .onOK(() => {
248
- okClicked = true;
249
- dlg.close();
250
- availableFeatureColsNames.filter((name) => !selectedFeatureColNames.includes(name))
251
- .forEach((name) => featuresMetrics.delete(name));
252
-
253
- try {
254
- const failedToImpute = impute(df!, targetColNames, featuresMetrics, misValsInds, distType, neighbors, inPlace);
255
-
256
- if (!keepEmpty)
257
- imputeFailed(df!, failedToImpute);
258
- resolve();
259
- } catch (err) {
260
- if (err instanceof Error)
261
- grok.shell.error(`${ERROR_MSG.KNN_FAILS}: ${err.message}`);
262
- else
263
- grok.shell.error(`${ERROR_MSG.KNN_FAILS}: ${ERROR_MSG.CORE_ISSUE}`);
264
- reject(err);
265
- }
266
- }).onClose.subscribe(() => !okClicked && resolve());
288
+ .onClose.subscribe(() => !okClicked && resolve());
267
289
 
268
290
  return promise;
269
291
  } // runKNNImputer
@@ -3,6 +3,7 @@ import {runTests, tests, TestContext} from '@datagrok-libraries/utils/src/test';
3
3
  import './tests/dim-reduction-tests';
4
4
  import './tests/linear-methods-tests';
5
5
  import './tests/classifiers-tests';
6
+ import './tests/mis-vals-imputation-tests';
6
7
  export const _package = new DG.Package();
7
8
  export {tests};
8
9
 
@@ -0,0 +1,58 @@
1
+ // Tests for missing values imputation
2
+
3
+ import * as grok from 'datagrok-api/grok';
4
+ import * as ui from 'datagrok-api/ui';
5
+ import * as DG from 'datagrok-api/dg';
6
+ import {_package} from '../package-test';
7
+
8
+ import {category, expect, test} from '@datagrok-libraries/utils/src/test';
9
+
10
+ import {MetricInfo, DISTANCE_TYPE, impute} from '../missing-values-imputation/knn-imputer';
11
+ import {getFeatureInputSettings} from '../missing-values-imputation/ui';
12
+ import {dataWithMissingVals} from './utils';
13
+
14
+ const ROWS_K = 100;
15
+ const K = 1000;
16
+ const INT_COLS = 5;
17
+ const FLOAT_COLS = 5;
18
+ const STRING_COLS = 5;
19
+ const MIS_VALS_COUNT = 5;
20
+ const NEIGHBORS = 5;
21
+ const TIMEOUT = 10000;
22
+ const TOTAL_COLS = INT_COLS + FLOAT_COLS + STRING_COLS;
23
+
24
+ const testKNN = (dist: DISTANCE_TYPE) => {
25
+ test(`${dist} dist, ${ROWS_K}K rows, ${TOTAL_COLS} cols, ${MIS_VALS_COUNT * TOTAL_COLS} missing vals`, async () => {
26
+ // Data
27
+ const data = dataWithMissingVals(ROWS_K * K, INT_COLS, FLOAT_COLS, STRING_COLS, MIS_VALS_COUNT);
28
+ const df = data.df;
29
+ const cols = df.columns;
30
+
31
+ // Inputs for kNN imputer
32
+ const targetColNames = cols.names();
33
+ const featuresMetrics = new Map<string, MetricInfo>();
34
+ const missingValsIndices = data.misValsIds;
35
+
36
+ // Imputation settings
37
+ for (const col of df.columns) {
38
+ const settings = getFeatureInputSettings(col.type as DG.COLUMN_TYPE);
39
+ featuresMetrics.set(col.name, {
40
+ weight: settings.defaultWeight,
41
+ type: settings.defaultMetric,
42
+ });
43
+ }
44
+
45
+ // Impute missing values & get fails
46
+ const failedToImput = impute(df, targetColNames, featuresMetrics, missingValsIndices, dist, NEIGHBORS, true);
47
+
48
+ // Check fails
49
+ let fails = 0;
50
+ failedToImput.forEach((inds, _) => fails += inds.length);
51
+ expect(fails, 0, `Failed to impute ${fails} missing values`);
52
+ }, {timeout: TIMEOUT, benchmark: true});
53
+ };
54
+
55
+ category(`Missing values imputation`, () => {
56
+ testKNN(DISTANCE_TYPE.EUCLIDEAN);
57
+ testKNN(DISTANCE_TYPE.MANHATTAN);
58
+ });