@datagrok/eda 1.1.28 → 1.1.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "EDA",
3
3
  "folder": "../wasm",
4
- "source": ["pcaExport.cpp", "PCA/PCA.cpp", "plsExport.cpp", "PLS/PLS.cpp", "svmApi.cpp", "regression-api.cpp"],
4
+ "source": ["pcaExport.cpp", "PCA/PCA.cpp", "plsExport.cpp", "PLS/PLS.cpp", "svmApi.cpp", "regression-api.cpp", "softmax-api.cpp"],
5
5
  "optimizationMode": "-O3",
6
6
  "packageFile": "../src/package.ts",
7
7
  "packageJsonFile": "../package.json",
@@ -6,15 +6,7 @@ import * as grok from 'datagrok-api/grok';
6
6
  import * as ui from 'datagrok-api/ui';
7
7
  import * as DG from 'datagrok-api/dg';
8
8
 
9
- import {checkGeneratorSVMinputs} from './utils';
10
- import {_generateDatasetInWebWorker} from '../wasm/EDAAPI';
11
-
12
- const SVM_GEN_FEATURES_INDEX = 0;
13
- const SVM_GEN_LABELS_INDEX = 1;
14
- const SVM_FEATURE_NAME = 'Feature #';
15
- const SVM_LABEL_NAME = 'Label';
16
-
17
- // Returns the dataframe "cars"
9
+ /** Returns the dataframe "cars" */
18
10
  export function carsDataframe(): DG.DataFrame {
19
11
  return DG.DataFrame.fromColumns(
20
12
  [
@@ -37,38 +29,3 @@ export function carsDataframe(): DG.DataFrame {
37
29
  DG.Column.fromInt32Array('price', new Int32Array([16500, 17710, 16430, 6575, 7957, 6229, 7129, 8845, 6785, 35550, 7395, 31600, 16503, 5389, 7349, 7299, 6229, 13860, 37028, 12170, 7775, 5348, 7898, 9989, 10698, 7775, 13295, 12940, 19045, 22470])),
38
30
  ]);
39
31
  } // carsDataframe
40
-
41
- // Generate dataset for testing binary classifiers
42
- export async function testDataForBinaryClassification(kernel: number, kernelParams: Array<number>,
43
- name: string, samplesCount: number, featuresCount: number, min: number,
44
- max: number, violatorsPercentage: number): Promise<DG.DataFrame> {
45
- // check inputs
46
- checkGeneratorSVMinputs(samplesCount, featuresCount, min, max, violatorsPercentage);
47
-
48
- // kernel params column
49
- const kernelParamsCol = DG.Column.fromList('double', 'kernelParams', kernelParams);
50
-
51
- // CALL WASM-COMPUTATIONS
52
- let _output: any;
53
- const _promise = _generateDatasetInWebWorker(kernel, kernelParamsCol,
54
- samplesCount, featuresCount, min, max, violatorsPercentage);
55
-
56
- await _promise.then(
57
- (_result) => {_output = _result;},
58
- (_error) => {throw new Error(`Error: ${_error}`);},
59
- );
60
-
61
- // Rename labels column
62
- _output[SVM_GEN_LABELS_INDEX].name = SVM_LABEL_NAME;
63
-
64
- // Rename feature columns
65
- for (const col of _output[SVM_GEN_FEATURES_INDEX])
66
- col.name = SVM_FEATURE_NAME + col.name;
67
-
68
- // Create dataframe
69
- const df = DG.DataFrame.fromColumns(_output[SVM_GEN_FEATURES_INDEX]);
70
- df.name = name;
71
- df.columns.add(_output[SVM_GEN_LABELS_INDEX]);
72
-
73
- return df;
74
- } // testDataForMachineLearning
@@ -109,8 +109,10 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
109
109
 
110
110
  // Distance components
111
111
  let distType = DISTANCE_TYPE.EUCLIDEAN;
112
- const distTypeInput: DG.ChoiceInput<DISTANCE_TYPE> = ui.input.choice(TITLE.DISTANCE, {value: distType,
113
- items: [DISTANCE_TYPE.EUCLIDEAN, DISTANCE_TYPE.MANHATTAN], onValueChanged: () => distType = distTypeInput.value ?? DISTANCE_TYPE.EUCLIDEAN}) as DG.ChoiceInput<DISTANCE_TYPE>;
112
+ const distTypeInput: DG.ChoiceInput<DISTANCE_TYPE> = ui.input.choice(TITLE.DISTANCE, {
113
+ value: distType,
114
+ items: [DISTANCE_TYPE.EUCLIDEAN, DISTANCE_TYPE.MANHATTAN],
115
+ onValueChanged: () => distType = distTypeInput.value ?? DISTANCE_TYPE.EUCLIDEAN}) as DG.ChoiceInput<DISTANCE_TYPE>;
114
116
  distTypeInput.setTooltip(HINT.DISTANCE);
115
117
 
116
118
  // Target columns components (cols with missing values to be imputed)
@@ -193,8 +195,15 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
193
195
  distTypeInput.root.hidden = true; // this input will be used further
194
196
 
195
197
  // The following should provide a slider (see th bug https://reddata.atlassian.net/browse/GROK-14431)
196
- // @ts-ignore
197
- const prop = DG.Property.fromOptions({'name': name, 'inputType': 'Float', 'min': 0, 'max': 10, 'showSlider': true, 'step': 1});
198
+ const prop = DG.Property.fromOptions({
199
+ 'name': name,
200
+ 'inputType': 'Float',
201
+ 'min': 0,
202
+ 'max': 10,
203
+ // @ts-ignore
204
+ 'showSlider': true,
205
+ 'step': 1,
206
+ });
198
207
  const weightInput = ui.input.forProperty(prop);
199
208
  weightInput.value = settings.defaultWeight;
200
209
  weightInput.onChanged(() => {
@@ -239,7 +248,8 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
239
248
  .onOK(() => {
240
249
  okClicked = true;
241
250
  dlg.close();
242
- availableFeatureColsNames.filter((name) => !selectedFeatureColNames.includes(name)).forEach((name) => featuresMetrics.delete(name));
251
+ availableFeatureColsNames.filter((name) => !selectedFeatureColNames.includes(name))
252
+ .forEach((name) => featuresMetrics.delete(name));
243
253
 
244
254
  try {
245
255
  const failedToImpute = impute(df!, targetColNames, featuresMetrics, misValsInds, distType, neighbors, inPlace);
@@ -256,5 +266,5 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
256
266
  }
257
267
  }).onClose.subscribe(() => !okClicked && reject());
258
268
 
259
- return promise;
269
+ return promise;
260
270
  } // runKNNImputer
package/src/package.ts CHANGED
@@ -8,7 +8,6 @@ import * as DG from 'datagrok-api/dg';
8
8
  import {_initEDAAPI} from '../wasm/EDAAPI';
9
9
  import {computePCA} from './eda-tools';
10
10
  import {addPrefixToEachColumnName, addOneWayAnovaVizualization} from './eda-ui';
11
- import {testDataForBinaryClassification} from './data-generators';
12
11
  import {LINEAR, RBF, POLYNOMIAL, SIGMOID,
13
12
  getTrainedModel, getPrediction, isApplicableSVM, isInteractiveSVM, showTrainReport, getPackedModel} from './svm';
14
13
 
@@ -31,7 +30,8 @@ import {MCLEditor} from '@datagrok-libraries/ml/src/MCL/mcl-editor';
31
30
  import {markovCluster} from '@datagrok-libraries/ml/src/MCL/clustering-view';
32
31
  import {MCL_OPTIONS_TAG, MCLSerializableOptions} from '@datagrok-libraries/ml/src/MCL';
33
32
 
34
- import {getLinearRegressionParams, getPredictionByLinearRegression, getTestDatasetForLinearRegression} from './regression';
33
+ import {getLinearRegressionParams, getPredictionByLinearRegression} from './regression';
34
+ import {SoftmaxClassifier} from './softmax-classifier';
35
35
 
36
36
  export const _package = new DG.Package();
37
37
 
@@ -299,37 +299,6 @@ export async function demoMultivariateAnalysis(): Promise<any> {
299
299
  runDemoMVA();
300
300
  }
301
301
 
302
- //name: Generate linear separable dataset
303
- //description: Generates linear separble dataset for testing binary classificators
304
- //input: string name = 'Data' {caption: name; category: Dataset}
305
- //input: int samplesCount = 1000 {caption: samples; category: Size}
306
- //input: int featuresCount = 2 {caption: features; category: Size}
307
- //input: double min = -39 {caption: min; category: Range}
308
- //input: double max = 173 {caption: max; category: Range}
309
- //input: double violatorsPercentage = 5 {caption: violators; units: %; category: Dataset}
310
- //output: dataframe df
311
- export async function testDataLinearSeparable(name: string, samplesCount: number, featuresCount: number,
312
- min: number, max: number, violatorsPercentage: number): Promise<DG.DataFrame> {
313
- return await testDataForBinaryClassification(LINEAR, [0, 0], name, samplesCount, featuresCount,
314
- min, max, violatorsPercentage);
315
- }
316
-
317
- //name: Generate linear non-separable dataset
318
- //description: Generates linear non-separble dataset for testing binary classificators
319
- //input: string name = 'Data' {caption: name; category: Dataset}
320
- //input: double sigma = 90 {caption: sigma; category: Hyperparameters} [RBF-kernel paramater]
321
- //input: int samplesCount = 1000 {caption: samples; category: Size}
322
- //input: int featuresCount = 2 {caption: features; category: Size}
323
- //input: double min = -39 {caption: min; category: Range}
324
- //input: double max = 173 {caption: max; category: Range}
325
- //input: double violatorsPercentage = 5 {caption: violators; units: %; category: Dataset}
326
- //output: dataframe df
327
- export async function testDataLinearNonSeparable(name: string, sigma: number, samplesCount: number,
328
- featuresCount: number, min: number, max: number, violatorsPercentage: number): Promise<DG.DataFrame> {
329
- return await testDataForBinaryClassification(RBF, [sigma, 0], name, samplesCount, featuresCount,
330
- min, max, violatorsPercentage);
331
- }
332
-
333
302
  //name: trainLinearKernelSVM
334
303
  //meta.mlname: linear kernel LS-SVM
335
304
  //meta.mlrole: train
@@ -593,48 +562,6 @@ export async function kNNImputationForTable(table: DG.DataFrame) {
593
562
  await runKNNImputer(table);
594
563
  }
595
564
 
596
- //name: linearRegression
597
- //description: Linear Regression demo
598
- //input: dataframe table
599
- //input: column_list features {type: numerical}
600
- //input: column target {type: numerical}
601
- //input: bool plot = true {caption: plot}
602
- export async function linearRegression(table: DG.DataFrame, features: DG.ColumnList, target: DG.Column, plot: boolean): Promise<void> {
603
- const t1 = performance.now();
604
- const params = await getLinearRegressionParams(features, target);
605
- const t2 = performance.now();
606
- console.log(`Fit: ${t2 - t1} ms.`);
607
- const prediction = getPredictionByLinearRegression(features, params);
608
- console.log(`Predict: ${performance.now() - t2} ms.`);
609
-
610
- prediction.name = table.columns.getUnusedName(prediction.name);
611
-
612
- table.columns.add(prediction);
613
-
614
- if (plot) {
615
- const view = grok.shell.tableView(table.name);
616
- view.addViewer(DG.VIEWER.SCATTER_PLOT, {
617
- xColumnName: target.name,
618
- yColumnName: prediction.name,
619
- showRegressionLine: true,
620
- });
621
- }
622
- }
623
-
624
- //name: generateDatasetForLinearRegressionTest
625
- //description: Create demo dataset for linear regression
626
- //input: int rowCount = 10000 {min: 1000; max: 10000000; step: 10000}
627
- //input: int colCount = 10 {min: 1; max: 1000; step: 10}
628
- //input: double featuresScale = 10 {min: -1000; max: 1000; step: 10}
629
- //input: double featuresBias = 10 {min: -1000; max: 1000; step: 10}
630
- //input: double paramsScale = 10 {min: -1000; max: 1000; step: 10}
631
- //input: double paramsBias = 10 {min: -1000; max: 1000; step: 10}
632
- //output: dataframe table
633
- export function generateDatasetForLinearRegressionTest(rowCount: number, colCount: number,
634
- featuresScale: number, featuresBias: number, paramsScale: number, paramsBias: number): DG.DataFrame {
635
- return getTestDatasetForLinearRegression(rowCount, colCount, featuresScale, featuresBias, paramsScale, paramsBias);
636
- }
637
-
638
565
  //name: trainLinearRegression
639
566
  //meta.mlname: Linear Regression
640
567
  //meta.mlrole: train
@@ -671,10 +598,8 @@ export function isApplicableLinearRegression(df: DG.DataFrame, predictColumn: DG
671
598
  if (!col.matches('numerical'))
672
599
  return false;
673
600
  }
674
- if (!predictColumn.matches('numerical'))
675
- return false;
676
601
 
677
- return true;
602
+ return predictColumn.matches('numerical');
678
603
  }
679
604
 
680
605
  //name: isInteractiveLinearRegression
@@ -686,3 +611,60 @@ export function isApplicableLinearRegression(df: DG.DataFrame, predictColumn: DG
686
611
  export function isInteractiveLinearRegression(df: DG.DataFrame, predictColumn: DG.Column): boolean {
687
612
  return df.rowCount <= 100000;
688
613
  }
614
+
615
+ //name: trainSoftmax
616
+ //meta.mlname: Softmax
617
+ //meta.mlrole: train
618
+ //input: dataframe df
619
+ //input: column predictColumn
620
+ //input: double rate = 1.0 {category: Hyperparameters; min: 0.001; max: 20} [Learning rate]
621
+ //input: int iterations = 100 {category: Hyperparameters; min: 1; max: 10000; step: 10} [Fitting iterations count]
622
+ //input: double penalty = 0.1 {category: Hyperparameters; min: 0.0001; max: 1} [Regularization rate]
623
+ //input: double tolerance = 0.001 {category: Hyperparameters; min: 0.00001; max: 0.1} [Fitting tolerance]
624
+ //output: dynamic model
625
+ export async function trainSoftmax(df: DG.DataFrame, predictColumn: DG.Column, rate: number,
626
+ iterations: number, penalty: number, tolerance: number): Promise<Uint8Array> {
627
+ const features = df.columns;
628
+
629
+ const model = new SoftmaxClassifier({
630
+ classesCount: predictColumn.categories.length,
631
+ featuresCount: features.length,
632
+ });
633
+
634
+ await model.fit(features, predictColumn, rate, iterations, penalty, tolerance);
635
+
636
+ return model.toBytes();
637
+ }
638
+
639
+ //name: applySoftmax
640
+ //meta.mlname: Softmax
641
+ //meta.mlrole: apply
642
+ //input: dataframe df
643
+ //input: dynamic model
644
+ //output: dataframe table
645
+ export function applySoftmax(df: DG.DataFrame, model: any): DG.DataFrame {
646
+ const features = df.columns;
647
+ const unpackedModel = new SoftmaxClassifier(undefined, model);
648
+
649
+ return DG.DataFrame.fromColumns([unpackedModel.predict(features)]);
650
+ }
651
+
652
+ //name: isApplicableSoftmax
653
+ //meta.mlname: Softmax
654
+ //meta.mlrole: isApplicable
655
+ //input: dataframe df
656
+ //input: column predictColumn
657
+ //output: bool result
658
+ export function isApplicableSoftmax(df: DG.DataFrame, predictColumn: DG.Column): boolean {
659
+ return SoftmaxClassifier.isApplicable(df.columns, predictColumn);
660
+ }
661
+
662
+ //name: isInteractiveSoftmax
663
+ //meta.mlname: Softmax
664
+ //meta.mlrole: isInteractive
665
+ //input: dataframe df
666
+ //input: column predictColumn
667
+ //output: bool result
668
+ export function isInteractiveSoftmax(df: DG.DataFrame, predictColumn: DG.Column): boolean {
669
+ return SoftmaxClassifier.isInteractive(df.columns, predictColumn);
670
+ }
package/src/regression.ts CHANGED
@@ -1,4 +1,4 @@
1
- // Linear regression tools
1
+ // Regression tools
2
2
 
3
3
  import * as grok from 'datagrok-api/grok';
4
4
  import * as ui from 'datagrok-api/ui';