@datagrok/eda 1.1.28 → 1.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +5 -0
- package/README.md +1 -0
- package/dist/{05e5e0770f54f07e9474.wasm → 12a82b8001995d426ed2.wasm} +0 -0
- package/dist/23.js +1 -1
- package/dist/23.js.map +1 -1
- package/dist/501.js +2 -0
- package/dist/501.js.map +1 -0
- package/dist/727.js +2 -0
- package/dist/727.js.map +1 -0
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +5 -5
- package/scripts/command.txt +1 -1
- package/scripts/func.json +664 -1
- package/scripts/module.json +1 -1
- package/src/data-generators.ts +1 -44
- package/src/missing-values-imputation/ui.ts +16 -6
- package/src/package.ts +60 -78
- package/src/regression.ts +1 -1
- package/src/softmax-classifier.ts +412 -0
- package/src/svm.ts +11 -33
- package/src/workers/softmax-worker.ts +146 -0
- package/wasm/EDA.js +55 -1
- package/wasm/EDA.wasm +0 -0
- package/wasm/EDAAPI.js +15 -0
- package/wasm/EDAForWebWorker.js +1 -1
- package/wasm/regression.h +2 -5
- package/wasm/softmax-api.cpp +49 -0
- package/wasm/softmax.h +156 -0
- package/wasm/workers/fitSoftmaxWorker.js +13 -0
- package/webpack.config.js +3 -2
package/scripts/module.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "EDA",
|
|
3
3
|
"folder": "../wasm",
|
|
4
|
-
"source": ["pcaExport.cpp", "PCA/PCA.cpp", "plsExport.cpp", "PLS/PLS.cpp", "svmApi.cpp", "regression-api.cpp"],
|
|
4
|
+
"source": ["pcaExport.cpp", "PCA/PCA.cpp", "plsExport.cpp", "PLS/PLS.cpp", "svmApi.cpp", "regression-api.cpp", "softmax-api.cpp"],
|
|
5
5
|
"optimizationMode": "-O3",
|
|
6
6
|
"packageFile": "../src/package.ts",
|
|
7
7
|
"packageJsonFile": "../package.json",
|
package/src/data-generators.ts
CHANGED
|
@@ -6,15 +6,7 @@ import * as grok from 'datagrok-api/grok';
|
|
|
6
6
|
import * as ui from 'datagrok-api/ui';
|
|
7
7
|
import * as DG from 'datagrok-api/dg';
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
import {_generateDatasetInWebWorker} from '../wasm/EDAAPI';
|
|
11
|
-
|
|
12
|
-
const SVM_GEN_FEATURES_INDEX = 0;
|
|
13
|
-
const SVM_GEN_LABELS_INDEX = 1;
|
|
14
|
-
const SVM_FEATURE_NAME = 'Feature #';
|
|
15
|
-
const SVM_LABEL_NAME = 'Label';
|
|
16
|
-
|
|
17
|
-
// Returns the dataframe "cars"
|
|
9
|
+
/** Returns the dataframe "cars" */
|
|
18
10
|
export function carsDataframe(): DG.DataFrame {
|
|
19
11
|
return DG.DataFrame.fromColumns(
|
|
20
12
|
[
|
|
@@ -37,38 +29,3 @@ export function carsDataframe(): DG.DataFrame {
|
|
|
37
29
|
DG.Column.fromInt32Array('price', new Int32Array([16500, 17710, 16430, 6575, 7957, 6229, 7129, 8845, 6785, 35550, 7395, 31600, 16503, 5389, 7349, 7299, 6229, 13860, 37028, 12170, 7775, 5348, 7898, 9989, 10698, 7775, 13295, 12940, 19045, 22470])),
|
|
38
30
|
]);
|
|
39
31
|
} // carsDataframe
|
|
40
|
-
|
|
41
|
-
// Generate dataset for testing binary classifiers
|
|
42
|
-
export async function testDataForBinaryClassification(kernel: number, kernelParams: Array<number>,
|
|
43
|
-
name: string, samplesCount: number, featuresCount: number, min: number,
|
|
44
|
-
max: number, violatorsPercentage: number): Promise<DG.DataFrame> {
|
|
45
|
-
// check inputs
|
|
46
|
-
checkGeneratorSVMinputs(samplesCount, featuresCount, min, max, violatorsPercentage);
|
|
47
|
-
|
|
48
|
-
// kernel params column
|
|
49
|
-
const kernelParamsCol = DG.Column.fromList('double', 'kernelParams', kernelParams);
|
|
50
|
-
|
|
51
|
-
// CALL WASM-COMPUTATIONS
|
|
52
|
-
let _output: any;
|
|
53
|
-
const _promise = _generateDatasetInWebWorker(kernel, kernelParamsCol,
|
|
54
|
-
samplesCount, featuresCount, min, max, violatorsPercentage);
|
|
55
|
-
|
|
56
|
-
await _promise.then(
|
|
57
|
-
(_result) => {_output = _result;},
|
|
58
|
-
(_error) => {throw new Error(`Error: ${_error}`);},
|
|
59
|
-
);
|
|
60
|
-
|
|
61
|
-
// Rename labels column
|
|
62
|
-
_output[SVM_GEN_LABELS_INDEX].name = SVM_LABEL_NAME;
|
|
63
|
-
|
|
64
|
-
// Rename feature columns
|
|
65
|
-
for (const col of _output[SVM_GEN_FEATURES_INDEX])
|
|
66
|
-
col.name = SVM_FEATURE_NAME + col.name;
|
|
67
|
-
|
|
68
|
-
// Create dataframe
|
|
69
|
-
const df = DG.DataFrame.fromColumns(_output[SVM_GEN_FEATURES_INDEX]);
|
|
70
|
-
df.name = name;
|
|
71
|
-
df.columns.add(_output[SVM_GEN_LABELS_INDEX]);
|
|
72
|
-
|
|
73
|
-
return df;
|
|
74
|
-
} // testDataForMachineLearning
|
|
@@ -109,8 +109,10 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
109
109
|
|
|
110
110
|
// Distance components
|
|
111
111
|
let distType = DISTANCE_TYPE.EUCLIDEAN;
|
|
112
|
-
const distTypeInput: DG.ChoiceInput<DISTANCE_TYPE> = ui.input.choice(TITLE.DISTANCE, {
|
|
113
|
-
|
|
112
|
+
const distTypeInput: DG.ChoiceInput<DISTANCE_TYPE> = ui.input.choice(TITLE.DISTANCE, {
|
|
113
|
+
value: distType,
|
|
114
|
+
items: [DISTANCE_TYPE.EUCLIDEAN, DISTANCE_TYPE.MANHATTAN],
|
|
115
|
+
onValueChanged: () => distType = distTypeInput.value ?? DISTANCE_TYPE.EUCLIDEAN}) as DG.ChoiceInput<DISTANCE_TYPE>;
|
|
114
116
|
distTypeInput.setTooltip(HINT.DISTANCE);
|
|
115
117
|
|
|
116
118
|
// Target columns components (cols with missing values to be imputed)
|
|
@@ -193,8 +195,15 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
193
195
|
distTypeInput.root.hidden = true; // this input will be used further
|
|
194
196
|
|
|
195
197
|
// The following should provide a slider (see the bug https://reddata.atlassian.net/browse/GROK-14431)
|
|
196
|
-
|
|
197
|
-
|
|
198
|
+
const prop = DG.Property.fromOptions({
|
|
199
|
+
'name': name,
|
|
200
|
+
'inputType': 'Float',
|
|
201
|
+
'min': 0,
|
|
202
|
+
'max': 10,
|
|
203
|
+
// @ts-ignore
|
|
204
|
+
'showSlider': true,
|
|
205
|
+
'step': 1,
|
|
206
|
+
});
|
|
198
207
|
const weightInput = ui.input.forProperty(prop);
|
|
199
208
|
weightInput.value = settings.defaultWeight;
|
|
200
209
|
weightInput.onChanged(() => {
|
|
@@ -239,7 +248,8 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
239
248
|
.onOK(() => {
|
|
240
249
|
okClicked = true;
|
|
241
250
|
dlg.close();
|
|
242
|
-
availableFeatureColsNames.filter((name) => !selectedFeatureColNames.includes(name))
|
|
251
|
+
availableFeatureColsNames.filter((name) => !selectedFeatureColNames.includes(name))
|
|
252
|
+
.forEach((name) => featuresMetrics.delete(name));
|
|
243
253
|
|
|
244
254
|
try {
|
|
245
255
|
const failedToImpute = impute(df!, targetColNames, featuresMetrics, misValsInds, distType, neighbors, inPlace);
|
|
@@ -256,5 +266,5 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
|
|
|
256
266
|
}
|
|
257
267
|
}).onClose.subscribe(() => !okClicked && reject());
|
|
258
268
|
|
|
259
|
-
|
|
269
|
+
return promise;
|
|
260
270
|
} // runKNNImputer
|
package/src/package.ts
CHANGED
|
@@ -8,7 +8,6 @@ import * as DG from 'datagrok-api/dg';
|
|
|
8
8
|
import {_initEDAAPI} from '../wasm/EDAAPI';
|
|
9
9
|
import {computePCA} from './eda-tools';
|
|
10
10
|
import {addPrefixToEachColumnName, addOneWayAnovaVizualization} from './eda-ui';
|
|
11
|
-
import {testDataForBinaryClassification} from './data-generators';
|
|
12
11
|
import {LINEAR, RBF, POLYNOMIAL, SIGMOID,
|
|
13
12
|
getTrainedModel, getPrediction, isApplicableSVM, isInteractiveSVM, showTrainReport, getPackedModel} from './svm';
|
|
14
13
|
|
|
@@ -31,7 +30,8 @@ import {MCLEditor} from '@datagrok-libraries/ml/src/MCL/mcl-editor';
|
|
|
31
30
|
import {markovCluster} from '@datagrok-libraries/ml/src/MCL/clustering-view';
|
|
32
31
|
import {MCL_OPTIONS_TAG, MCLSerializableOptions} from '@datagrok-libraries/ml/src/MCL';
|
|
33
32
|
|
|
34
|
-
import {getLinearRegressionParams, getPredictionByLinearRegression
|
|
33
|
+
import {getLinearRegressionParams, getPredictionByLinearRegression} from './regression';
|
|
34
|
+
import {SoftmaxClassifier} from './softmax-classifier';
|
|
35
35
|
|
|
36
36
|
export const _package = new DG.Package();
|
|
37
37
|
|
|
@@ -299,37 +299,6 @@ export async function demoMultivariateAnalysis(): Promise<any> {
|
|
|
299
299
|
runDemoMVA();
|
|
300
300
|
}
|
|
301
301
|
|
|
302
|
-
//name: Generate linear separable dataset
|
|
303
|
-
//description: Generates linear separble dataset for testing binary classificators
|
|
304
|
-
//input: string name = 'Data' {caption: name; category: Dataset}
|
|
305
|
-
//input: int samplesCount = 1000 {caption: samples; category: Size}
|
|
306
|
-
//input: int featuresCount = 2 {caption: features; category: Size}
|
|
307
|
-
//input: double min = -39 {caption: min; category: Range}
|
|
308
|
-
//input: double max = 173 {caption: max; category: Range}
|
|
309
|
-
//input: double violatorsPercentage = 5 {caption: violators; units: %; category: Dataset}
|
|
310
|
-
//output: dataframe df
|
|
311
|
-
export async function testDataLinearSeparable(name: string, samplesCount: number, featuresCount: number,
|
|
312
|
-
min: number, max: number, violatorsPercentage: number): Promise<DG.DataFrame> {
|
|
313
|
-
return await testDataForBinaryClassification(LINEAR, [0, 0], name, samplesCount, featuresCount,
|
|
314
|
-
min, max, violatorsPercentage);
|
|
315
|
-
}
|
|
316
|
-
|
|
317
|
-
//name: Generate linear non-separable dataset
|
|
318
|
-
//description: Generates linear non-separble dataset for testing binary classificators
|
|
319
|
-
//input: string name = 'Data' {caption: name; category: Dataset}
|
|
320
|
-
//input: double sigma = 90 {caption: sigma; category: Hyperparameters} [RBF-kernel paramater]
|
|
321
|
-
//input: int samplesCount = 1000 {caption: samples; category: Size}
|
|
322
|
-
//input: int featuresCount = 2 {caption: features; category: Size}
|
|
323
|
-
//input: double min = -39 {caption: min; category: Range}
|
|
324
|
-
//input: double max = 173 {caption: max; category: Range}
|
|
325
|
-
//input: double violatorsPercentage = 5 {caption: violators; units: %; category: Dataset}
|
|
326
|
-
//output: dataframe df
|
|
327
|
-
export async function testDataLinearNonSeparable(name: string, sigma: number, samplesCount: number,
|
|
328
|
-
featuresCount: number, min: number, max: number, violatorsPercentage: number): Promise<DG.DataFrame> {
|
|
329
|
-
return await testDataForBinaryClassification(RBF, [sigma, 0], name, samplesCount, featuresCount,
|
|
330
|
-
min, max, violatorsPercentage);
|
|
331
|
-
}
|
|
332
|
-
|
|
333
302
|
//name: trainLinearKernelSVM
|
|
334
303
|
//meta.mlname: linear kernel LS-SVM
|
|
335
304
|
//meta.mlrole: train
|
|
@@ -593,48 +562,6 @@ export async function kNNImputationForTable(table: DG.DataFrame) {
|
|
|
593
562
|
await runKNNImputer(table);
|
|
594
563
|
}
|
|
595
564
|
|
|
596
|
-
//name: linearRegression
|
|
597
|
-
//description: Linear Regression demo
|
|
598
|
-
//input: dataframe table
|
|
599
|
-
//input: column_list features {type: numerical}
|
|
600
|
-
//input: column target {type: numerical}
|
|
601
|
-
//input: bool plot = true {caption: plot}
|
|
602
|
-
export async function linearRegression(table: DG.DataFrame, features: DG.ColumnList, target: DG.Column, plot: boolean): Promise<void> {
|
|
603
|
-
const t1 = performance.now();
|
|
604
|
-
const params = await getLinearRegressionParams(features, target);
|
|
605
|
-
const t2 = performance.now();
|
|
606
|
-
console.log(`Fit: ${t2 - t1} ms.`);
|
|
607
|
-
const prediction = getPredictionByLinearRegression(features, params);
|
|
608
|
-
console.log(`Predict: ${performance.now() - t2} ms.`);
|
|
609
|
-
|
|
610
|
-
prediction.name = table.columns.getUnusedName(prediction.name);
|
|
611
|
-
|
|
612
|
-
table.columns.add(prediction);
|
|
613
|
-
|
|
614
|
-
if (plot) {
|
|
615
|
-
const view = grok.shell.tableView(table.name);
|
|
616
|
-
view.addViewer(DG.VIEWER.SCATTER_PLOT, {
|
|
617
|
-
xColumnName: target.name,
|
|
618
|
-
yColumnName: prediction.name,
|
|
619
|
-
showRegressionLine: true,
|
|
620
|
-
});
|
|
621
|
-
}
|
|
622
|
-
}
|
|
623
|
-
|
|
624
|
-
//name: generateDatasetForLinearRegressionTest
|
|
625
|
-
//description: Create demo dataset for linear regression
|
|
626
|
-
//input: int rowCount = 10000 {min: 1000; max: 10000000; step: 10000}
|
|
627
|
-
//input: int colCount = 10 {min: 1; max: 1000; step: 10}
|
|
628
|
-
//input: double featuresScale = 10 {min: -1000; max: 1000; step: 10}
|
|
629
|
-
//input: double featuresBias = 10 {min: -1000; max: 1000; step: 10}
|
|
630
|
-
//input: double paramsScale = 10 {min: -1000; max: 1000; step: 10}
|
|
631
|
-
//input: double paramsBias = 10 {min: -1000; max: 1000; step: 10}
|
|
632
|
-
//output: dataframe table
|
|
633
|
-
export function generateDatasetForLinearRegressionTest(rowCount: number, colCount: number,
|
|
634
|
-
featuresScale: number, featuresBias: number, paramsScale: number, paramsBias: number): DG.DataFrame {
|
|
635
|
-
return getTestDatasetForLinearRegression(rowCount, colCount, featuresScale, featuresBias, paramsScale, paramsBias);
|
|
636
|
-
}
|
|
637
|
-
|
|
638
565
|
//name: trainLinearRegression
|
|
639
566
|
//meta.mlname: Linear Regression
|
|
640
567
|
//meta.mlrole: train
|
|
@@ -671,10 +598,8 @@ export function isApplicableLinearRegression(df: DG.DataFrame, predictColumn: DG
|
|
|
671
598
|
if (!col.matches('numerical'))
|
|
672
599
|
return false;
|
|
673
600
|
}
|
|
674
|
-
if (!predictColumn.matches('numerical'))
|
|
675
|
-
return false;
|
|
676
601
|
|
|
677
|
-
return
|
|
602
|
+
return predictColumn.matches('numerical');
|
|
678
603
|
}
|
|
679
604
|
|
|
680
605
|
//name: isInteractiveLinearRegression
|
|
@@ -686,3 +611,60 @@ export function isApplicableLinearRegression(df: DG.DataFrame, predictColumn: DG
|
|
|
686
611
|
export function isInteractiveLinearRegression(df: DG.DataFrame, predictColumn: DG.Column): boolean {
|
|
687
612
|
return df.rowCount <= 100000;
|
|
688
613
|
}
|
|
614
|
+
|
|
615
|
+
//name: trainSoftmax
|
|
616
|
+
//meta.mlname: Softmax
|
|
617
|
+
//meta.mlrole: train
|
|
618
|
+
//input: dataframe df
|
|
619
|
+
//input: column predictColumn
|
|
620
|
+
//input: double rate = 1.0 {category: Hyperparameters; min: 0.001; max: 20} [Learning rate]
|
|
621
|
+
//input: int iterations = 100 {category: Hyperparameters; min: 1; max: 10000; step: 10} [Fitting iterations count]
|
|
622
|
+
//input: double penalty = 0.1 {category: Hyperparameters; min: 0.0001; max: 1} [Regularization rate]
|
|
623
|
+
//input: double tolerance = 0.001 {category: Hyperparameters; min: 0.00001; max: 0.1} [Fitting tolerance]
|
|
624
|
+
//output: dynamic model
|
|
625
|
+
export async function trainSoftmax(df: DG.DataFrame, predictColumn: DG.Column, rate: number,
|
|
626
|
+
iterations: number, penalty: number, tolerance: number): Promise<Uint8Array> {
|
|
627
|
+
const features = df.columns;
|
|
628
|
+
|
|
629
|
+
const model = new SoftmaxClassifier({
|
|
630
|
+
classesCount: predictColumn.categories.length,
|
|
631
|
+
featuresCount: features.length,
|
|
632
|
+
});
|
|
633
|
+
|
|
634
|
+
await model.fit(features, predictColumn, rate, iterations, penalty, tolerance);
|
|
635
|
+
|
|
636
|
+
return model.toBytes();
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
//name: applySoftmax
|
|
640
|
+
//meta.mlname: Softmax
|
|
641
|
+
//meta.mlrole: apply
|
|
642
|
+
//input: dataframe df
|
|
643
|
+
//input: dynamic model
|
|
644
|
+
//output: dataframe table
|
|
645
|
+
export function applySoftmax(df: DG.DataFrame, model: any): DG.DataFrame {
|
|
646
|
+
const features = df.columns;
|
|
647
|
+
const unpackedModel = new SoftmaxClassifier(undefined, model);
|
|
648
|
+
|
|
649
|
+
return DG.DataFrame.fromColumns([unpackedModel.predict(features)]);
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
//name: isApplicableSoftmax
|
|
653
|
+
//meta.mlname: Softmax
|
|
654
|
+
//meta.mlrole: isApplicable
|
|
655
|
+
//input: dataframe df
|
|
656
|
+
//input: column predictColumn
|
|
657
|
+
//output: bool result
|
|
658
|
+
export function isApplicableSoftmax(df: DG.DataFrame, predictColumn: DG.Column): boolean {
|
|
659
|
+
return SoftmaxClassifier.isApplicable(df.columns, predictColumn);
|
|
660
|
+
}
|
|
661
|
+
|
|
662
|
+
//name: isInteractiveSoftmax
|
|
663
|
+
//meta.mlname: Softmax
|
|
664
|
+
//meta.mlrole: isInteractive
|
|
665
|
+
//input: dataframe df
|
|
666
|
+
//input: column predictColumn
|
|
667
|
+
//output: bool result
|
|
668
|
+
export function isInteractiveSoftmax(df: DG.DataFrame, predictColumn: DG.Column): boolean {
|
|
669
|
+
return SoftmaxClassifier.isInteractive(df.columns, predictColumn);
|
|
670
|
+
}
|
package/src/regression.ts
CHANGED