npm - @datagrok/eda - Versions diffs - 1.1.30 → 1.1.32 - Mend

@datagrok/eda 1.1.30 → 1.1.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

package/CHANGELOG.md +8 -0
package/README.md +1 -0
package/dist/23.js.map +1 -1
package/dist/242.js +1 -1
package/dist/242.js.map +1 -1
package/dist/449.js +2 -0
package/dist/449.js.map +1 -0
package/dist/738.js +1 -1
package/dist/738.js.map +1 -1
package/dist/77573759e3857711e15b.wasm +0 -0
package/dist/990.js +2 -0
package/dist/990.js.map +1 -0
package/dist/package-test.js +1 -1
package/dist/package-test.js.map +1 -1
package/dist/package.js +1 -1
package/dist/package.js.map +1 -1
package/package.json +92 -91
package/src/missing-values-imputation/ui.ts +4 -4
package/src/package-test.ts +2 -0
package/src/package.ts +65 -3
package/src/pls/pls-constants.ts +21 -5
package/src/pls/pls-tools.ts +8 -2
package/src/tests/classifiers-tests.ts +114 -0
package/src/tests/linear-methods-tests.ts +150 -0
package/src/tests/utils.ts +121 -0
package/src/xgbooster.ts +260 -0
package/wasm/XGBoostAPI.js +32 -0
package/wasm/XGBoostAPI.wasm +0 -0
package/wasm/XGBoostAPIinWebWorker.js +32 -0
package/wasm/callWasmForWebWorker.js +11 -8
package/wasm/workers/xgboostWorker.js +67 -0
package/wasm/xgboost/CMakeLists.txt +23 -0
package/wasm/xgboost/commands.txt +12 -0
package/wasm/xgboost/xgboost/README.txt +1 -0
package/wasm/xgboost/xgboost-api.cpp +134 -0
package/wasm/xgbooster.js +161 -0
package/dist/317.js +0 -2
package/dist/317.js.map +0 -1

package/package.json CHANGED Viewed

@@ -1,94 +1,95 @@
 {
-    "name": "@datagrok/eda",
-    "friendlyName": "EDA",
-    "version": "1.1.30",
-    "description": "Exploratory Data Analysis Tools",
-    "dependencies": {
-        "@datagrok-libraries/math": "^1.1.10",
-        "@datagrok-libraries/ml": "^6.6.13",
-        "@datagrok-libraries/tutorials": "^1.3.6",
-        "@datagrok-libraries/utils": "^4.1.44",
-        "@keckelt/tsne": "^1.0.2",
-        "@webgpu/types": "^0.1.40",
-        "cash-dom": "^8.1.1",
-        "datagrok-api": "^1.20.0",
-        "dayjs": "^1.11.9",
-        "jstat": "^1.9.6",
-        "source-map-loader": "^4.0.1",
-        "umap-js": "^1.3.3",
-        "worker-loader": "latest"
-    },
-    "author": {
-        "name": "Viktor Makarichev",
-        "email": "vmakarichev@datagrok.ai"
-    },
-    "devDependencies": {
-        "@typescript-eslint/eslint-plugin": "^5.32.0",
-        "@typescript-eslint/parser": "^5.32.0",
-        "css-loader": "latest",
-        "eslint": "^8.21.0",
-        "eslint-config-google": "^0.14.0",
-        "style-loader": "latest",
-        "ts-loader": "latest",
-        "typescript": "latest",
-        "webpack": "latest",
-        "webpack-cli": "latest"
-    },
-    "scripts": {
-        "link-all": "npm link datagrok-api @datagrok-libraries/utils @datagrok-libraries/tutorials",
-        "debug-eda": "webpack && grok publish",
-        "release-eda": "webpack && grok publish --release",
-        "build-eda": "webpack",
-        "build": "webpack",
-        "debug-eda-dev": "webpack && grok publish dev",
-        "release-eda-dev": "webpack && grok publish dev --release",
-        "debug-eda-local": "webpack && grok publish local",
-        "release-eda-local": "webpack && grok publish local --release",
-        "build-all": "npm --prefix ./../../js-api run build && npm --prefix ./../../libraries/utils run build && npm --prefix ./../../libraries/tutorials run build && npm run build"
-    },
-    "canEdit": [
-        "Developers"
-    ],
-    "canView": [
-        "All users"
-    ],
-    "repository": {
-        "type": "git",
-        "url": "https://github.com/datagrok-ai/public.git",
-        "directory": "packages/EDA"
-    },
-    "category": "Machine Learning",
-    "sources": [
-        "wasm/EDA.js"
-    ],
-    "meta": {
-        "menu": {
-            "ML": {
-                "Tools": {
-                    "Impute Missing Values...": null,
-                    "Random Data...": null
-                },
-                "Cluster": {
-                    "Cluster...": null,
-                    "DBSCAN...": null
-                },
-                "Notebooks": {
-                    "Browse Notebooks": null,
-                    "Open in Notebook": null,
-                    "New Notebook": null
-                },
-                "Models": {
-                    "Browse Models": null,
-                    "Train Model...": null,
-                    "Apply Model...": null
-                },
-                "Analyse": {
-                    "PCA...": null,
-                    "ANOVA...": null,
-                    "Multivariate Analysis...": null
-                },
-                "Reduce Dimensionality": null
-            }
-        }
+  "name": "@datagrok/eda",
+  "friendlyName": "EDA",
+  "version": "1.1.32",
+  "description": "Exploratory Data Analysis Tools",
+  "dependencies": {
+    "@datagrok-libraries/math": "^1.1.11",
+    "@datagrok-libraries/ml": "^6.6.15",
+    "@datagrok-libraries/tutorials": "^1.3.13",
+    "@datagrok-libraries/utils": "^4.2.20",
+    "@keckelt/tsne": "^1.0.2",
+    "@webgpu/types": "^0.1.40",
+    "cash-dom": "^8.1.1",
+    "datagrok-api": "^1.20.1",
+    "dayjs": "^1.11.9",
+    "jstat": "^1.9.6",
+    "source-map-loader": "^4.0.1",
+    "umap-js": "^1.3.3",
+    "worker-loader": "latest"
+  },
+  "author": {
+    "name": "Viktor Makarichev",
+    "email": "vmakarichev@datagrok.ai"
+  },
+  "devDependencies": {
+    "@typescript-eslint/eslint-plugin": "^5.32.0",
+    "@typescript-eslint/parser": "^5.32.0",
+    "css-loader": "latest",
+    "eslint": "^8.21.0",
+    "eslint-config-google": "^0.14.0",
+    "style-loader": "latest",
+    "ts-loader": "latest",
+    "typescript": "latest",
+    "webpack": "latest",
+    "webpack-cli": "latest"
+  },
+  "scripts": {
+    "link-all": "npm link datagrok-api @datagrok-libraries/utils @datagrok-libraries/tutorials",
+    "debug-eda": "webpack && grok publish",
+    "release-eda": "webpack && grok publish --release",
+    "build-eda": "webpack",
+    "build": "webpack",
+    "debug-eda-dev": "webpack && grok publish dev",
+    "release-eda-dev": "webpack && grok publish dev --release",
+    "debug-eda-local": "webpack && grok publish local",
+    "release-eda-local": "webpack && grok publish local --release",
+    "build-all": "npm --prefix ./../../js-api run build && npm --prefix ./../../libraries/utils run build && npm --prefix ./../../libraries/tutorials run build && npm run build"
+  },
+  "canEdit": [
+    "Developers"
+  ],
+  "canView": [
+    "All users"
+  ],
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/datagrok-ai/public.git",
+    "directory": "packages/EDA"
+  },
+  "category": "Machine Learning",
+  "sources": [
+    "wasm/EDA.js",
+    "wasm/XGBoostAPI.js"
+  ],
+  "meta": {
+    "menu": {
+      "ML": {
+        "Tools": {
+          "Impute Missing Values...": null,
+          "Random Data...": null
+        },
+        "Cluster": {
+          "Cluster...": null,
+          "DBSCAN...": null
+        },
+        "Notebooks": {
+          "Browse Notebooks": null,
+          "Open in Notebook": null,
+          "New Notebook": null
+        },
+        "Models": {
+          "Browse Models": null,
+          "Train Model...": null,
+          "Apply Model...": null
+        },
+        "Analyse": {
+          "PCA...": null,
+          "ANOVA...": null,
+          "Multivariate Analysis...": null
+        },
+        "Reduce Dimensionality": null
+      }
     }
+  }
 }

package/src/missing-values-imputation/ui.ts CHANGED Viewed

@@ -117,15 +117,15 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
   // Target columns components (cols with missing values to be imputed)
   let targetColNames = colsWithMissingVals.map((col) => col.name);
-  const targetColInput = ui.input.columns(TITLE.COLUMNS, {table: df, onValueChanged: () => {
+  const targetColInput = ui.input.columns(TITLE.COLUMNS, {table: df, value: df.columns.byNames(availableTargetColsNames), onValueChanged: () => {
     targetColNames = targetColInput.value.map((col) => col.name);
     checkApplicability();
-  }, available: availableTargetColsNames, checked: availableTargetColsNames});
+  }, available: availableTargetColsNames});
   targetColInput.setTooltip(HINT.TARGET);
   // Feature columns components
   let selectedFeatureColNames = availableFeatureColsNames as string[];
-  const featuresInput = ui.input.columns(TITLE.FEATURES, {table: df, onValueChanged: () => {
+  const featuresInput = ui.input.columns(TITLE.FEATURES, {value: df.columns.byNames(availableFeatureColsNames), table: df, onValueChanged: () => {
     selectedFeatureColNames = featuresInput.value.map((col) => col.name);
     if (selectedFeatureColNames.length > 0) {
@@ -133,7 +133,7 @@ export async function runKNNImputer(df?: DG.DataFrame): Promise<void> {
       metricInfoInputs.forEach((div, name) => div.hidden = !selectedFeatureColNames.includes(name));
     } else
       hideWidgets();
-  }, available: availableFeatureColsNames, checked: availableFeatureColsNames});
+  }, available: availableFeatureColsNames});
   featuresInput.setTooltip(HINT.FEATURES);
   /** Hide widgets (use if run is not applicable) */

package/src/package-test.ts CHANGED Viewed

@@ -1,6 +1,8 @@
 import * as DG from 'datagrok-api/dg';
 import {runTests, tests, TestContext} from '@datagrok-libraries/utils/src/test';
 import './tests/dim-reduction-tests';
+import './tests/linear-methods-tests';
+import './tests/classifiers-tests';
 export const _package = new DG.Package();
 export {tests};

package/src/package.ts CHANGED Viewed

@@ -34,6 +34,9 @@ import {getLinearRegressionParams, getPredictionByLinearRegression} from './regr
 import {PlsModel} from './pls/pls-ml';
 import {SoftmaxClassifier} from './softmax-classifier';
+import {initXgboost} from '../wasm/xgbooster';
+import {XGBooster} from './xgbooster';
 export const _package = new DG.Package();
 //name: info
@@ -44,6 +47,7 @@ export function info() {
 //tags: init
 export async function init(): Promise<void> {
   await _initEDAAPI();
+  await initXgboost();
 }
 //top-menu: ML | Cluster | DBSCAN...
@@ -193,7 +197,7 @@ export function GetMCLEditor(call: DG.FuncCall): void {
           df: params.table, cols: params.columns, metrics: params.distanceMetrics,
           weights: params.weights, aggregationMethod: params.aggreaggregationMethod, preprocessingFuncs: params.preprocessingFunctions,
           preprocessingFuncArgs: params.preprocessingFuncArgs, threshold: params.threshold, maxIterations: params.maxIterations,
-          useWebGPU: params.useWebGPU, inflate: params.inflateFactor,
+          useWebGPU: params.useWebGPU, inflate: params.inflateFactor, minClusterSize: params.minClusterSize,
         }).call(true);
       }).show();
   } catch (err: any) {
@@ -219,10 +223,12 @@ export function GetMCLEditor(call: DG.FuncCall): void {
 //input: int maxIterations = 10
 //input: bool useWebGPU = false
 //input: double inflate = 2
+//input: int minClusterSize = 5
 //editor: EDA: GetMCLEditor
 export async function MCL(df: DG.DataFrame, cols: DG.Column[], metrics: KnownMetrics[],
   weights: number[], aggregationMethod: DistanceAggregationMethod, preprocessingFuncs: (DG.Func | null | undefined)[],
   preprocessingFuncArgs: any[], threshold: number = 80, maxIterations: number = 10, useWebGPU: boolean = false, inflate: number = 0,
+  minClusterSize: number = 5,
 ): Promise< DG.ScatterPlotViewer | undefined> {
   const tv = grok.shell.tableView(df.name) ?? grok.shell.addTableView(df);
   const serializedOptions: string = JSON.stringify({
@@ -236,6 +242,7 @@ export async function MCL(df: DG.DataFrame, cols: DG.Column[], metrics: KnownMet
     maxIterations: maxIterations,
     useWebGPU: useWebGPU,
     inflate: inflate,
+    minClusterSize: minClusterSize ?? 5,
   } satisfies MCLSerializableOptions);
   df.setTag(MCL_OPTIONS_TAG, serializedOptions);
@@ -255,9 +262,12 @@ export async function MCLInitializationFunction(sc: DG.ScatterPlotViewer) {
   const options: MCLSerializableOptions = JSON.parse(mclTag);
   const cols = options.cols.map((colName) => df.columns.byName(colName));
   const preprocessingFuncs = options.preprocessingFuncs.map((funcName) => funcName ? DG.Func.byName(funcName) : null);
+  // let presetMatrix = null;
+  // if (df.temp['sparseMatrix'])
+  //   presetMatrix = df.temp['sparseMatrix'];
   const res = await markovCluster(df, cols, options.metrics, options.weights,
     options.aggregationMethod, preprocessingFuncs, options.preprocessingFuncArgs, options.threshold,
-    options.maxIterations, options.useWebGPU, options.inflate, sc);
+    options.maxIterations, options.useWebGPU, options.inflate, options.minClusterSize, sc /**presetMatrix */);
   return res?.sc;
 }
@@ -297,7 +307,7 @@ export async function MVA(): Promise<void> {
 //description: Multidimensional data analysis using partial least squares (PLS) regression. It identifies latent factors and constructs a linear model based on them.
 //meta.demoPath: Compute | Multivariate analysis
 export async function demoMultivariateAnalysis(): Promise<any> {
-  runDemoMVA();
+  await runDemoMVA();
 }
 //name: trainLinearKernelSVM
@@ -734,3 +744,55 @@ export async function visualizePLSRegression(df: DG.DataFrame, targetColumn: DG.
 export function isInteractivePLSRegression(df: DG.DataFrame, predictColumn: DG.Column): boolean {
   return PlsModel.isInteractive(df.columns, predictColumn);
 }
+//name: trainXGBooster
+//meta.mlname: XGBoost
+//meta.mlrole: train
+//input: dataframe df
+//input: column predictColumn
+//input: int iterations = 20 {min: 1; max: 100} [Number of training iterations]
+//input: double eta = 0.3 {caption: Rate; min: 0; max: 1} [Learning rate]
+//input: int maxDepth = 6 {min: 0; max: 20} [Maximum depth of a tree]
+//input: double lambda = 1 {min: 0; max: 100} [L2 regularization term]
+//input: double alpha = 0 {min: 0; max: 100} [L1 regularization term]
+//output: dynamic model
+export async function trainXGBooster(df: DG.DataFrame, predictColumn: DG.Column,
+  iterations: number, eta: number, maxDepth: number, lambda: number, alpha: number): Promise<Uint8Array> {
+  // Fit a fresh booster and return it in the serialized form produced by toBytes().
+  // NOTE(review): every column of df is passed as a feature; presumably the caller
+  // supplies df without the target column - confirm.
+  const model = new XGBooster();
+  await model.fit(df.columns, predictColumn, iterations, eta, maxDepth, lambda, alpha);
+  const packedModel = model.toBytes();
+  return packedModel;
+}
+//name: applyXGBooster
+//meta.mlname: XGBoost
+//meta.mlrole: apply
+//input: dataframe df
+//input: dynamic model
+//output: dataframe table
+export function applyXGBooster(df: DG.DataFrame, model: any): DG.DataFrame {
+  // Rebuild the booster from the passed model, predict on the dataframe's columns
+  // and wrap the result in a new dataframe.
+  const prediction = new XGBooster(model).predict(df.columns);
+  return DG.DataFrame.fromColumns([prediction]);
+}
+//name: isInteractiveXGBooster
+//meta.mlname: XGBoost
+//meta.mlrole: isInteractive
+//input: dataframe df
+//input: column predictColumn
+//output: bool result
+export function isInteractiveXGBooster(df: DG.DataFrame, predictColumn: DG.Column): boolean {
+  // The decision is delegated entirely to the XGBooster static check.
+  const interactive = XGBooster.isInteractive(df.columns, predictColumn);
+  return interactive;
+}
+//name: isApplicableXGBooster
+//meta.mlname: XGBoost
+//meta.mlrole: isApplicable
+//input: dataframe df
+//input: column predictColumn
+//output: bool result
+export function isApplicableXGBooster(df: DG.DataFrame, predictColumn: DG.Column): boolean {
+  // The decision is delegated entirely to the XGBooster static check.
+  const applicable = XGBooster.isApplicable(df.columns, predictColumn);
+  return applicable;
+}

package/src/pls/pls-constants.ts CHANGED Viewed

@@ -35,6 +35,7 @@ export enum TITLE {
   EXPL_VAR = 'Explained Variance',
   EXPLORE = 'Explore',
   FEATURES = 'Feature names',
+  BROWSE = 'Browse',
 }
 /** Tooltips */
@@ -115,11 +116,26 @@ The method finds the latent factors that
 /** Description of demo results: wizard components */
 export const DEMO_RESULTS = [
-  {caption: TITLE.MODEL, text: 'Closer to the line means better price prediction.'},
-  {caption: TITLE.SCORES, text: 'The latent factor values for each data sample reflect the similarities and dissimilarities among observations.'},
-  {caption: TITLE.LOADINGS, text: 'The impact of each feature on the latent factors: higher loading means stronger influence.'},
-  {caption: TITLE.REGR_COEFS, text: 'Parameters of the obtained linear model: features make different contribution to the prediction.'},
-  {caption: TITLE.EXPL_VAR, text: 'How well the latent components fit source data: closer to one means better fit.'},
+  {
+    caption: TITLE.MODEL,
+    text: 'Closer to the line means better price prediction.',
+  },
+  {
+    caption: TITLE.SCORES,
+    text: 'The latent factor values for each sample reflect the similarities and dissimilarities among observations.',
+  },
+  {
+    caption: TITLE.LOADINGS,
+    text: 'The impact of each feature on the latent factors: higher loading means stronger influence.',
+  },
+  {
+    caption: TITLE.REGR_COEFS,
+    text: 'Parameters of the obtained linear model: features make different contribution to the prediction.',
+  },
+  {
+    caption: TITLE.EXPL_VAR,
+    text: 'How well the latent components fit source data: closer to one means better fit.',
+  },
 ];
 /** Form results markdown for demo app */

package/src/pls/pls-tools.ts CHANGED Viewed

@@ -110,7 +110,11 @@ async function performMVA(input: PlsInput, analysisType: PLS_ANALYSIS): Promise<
   if (analysisType === PLS_ANALYSIS.COMPUTE_COMPONENTS)
     return;
-  const view = grok.shell.tableView(input.table.name);
+  //const view = grok.shell.tableView(input.table.name);
+  const view = (analysisType === PLS_ANALYSIS.DEMO) ?
+    (grok.shell.view(TITLE.BROWSE) as DG.BrowseView).preview as DG.TableView :
+    grok.shell.tableView(input.table.name);
   // 0.1 Buffer table
   const buffer = DG.DataFrame.fromColumns([
@@ -248,7 +252,9 @@ async function performMVA(input: PlsInput, analysisType: PLS_ANALYSIS): Promise<
 /** Run multivariate analysis (PLS) */
 export async function runMVA(analysisType: PLS_ANALYSIS): Promise<void> {
-  const table = grok.shell.t;
+  const table = (analysisType === PLS_ANALYSIS.DEMO) ?
+    ((grok.shell.view(TITLE.BROWSE) as DG.BrowseView).preview as DG.TableView).table :
+    grok.shell.t;
   if (table === null) {
     grok.shell.warning(ERROR_MSG.NO_DF);

package/src/tests/classifiers-tests.ts ADDED Viewed

@@ -0,0 +1,114 @@
+// Tests for classifiers
+import * as grok from 'datagrok-api/grok';
+import * as ui from 'datagrok-api/ui';
+import * as DG from 'datagrok-api/dg';
+import {_package} from '../package-test';
+import {category, expect, test} from '@datagrok-libraries/utils/src/test';
+import {classificationDataset, accuracy} from './utils';
+import {SoftmaxClassifier} from '../softmax-classifier';
+import {XGBooster} from '../xgbooster';
+// Row counts: performance tests use ROWS_K * 1000 rows, correctness tests use ROWS_K rows
+const ROWS_K = 50;
+// Feature counts: MIN_COLS for correctness tests, COLS for performance tests
+const MIN_COLS = 2;
+const COLS = 100;
+// Value passed as the `timeout` option of every test in this file
+const TIMEOUT = 8000;
+// Correctness tests pass only when accuracy is strictly greater than this threshold
+const MIN_ACCURACY = 0.9;
+category('Softmax', () => {
+  test(`Performance: ${ROWS_K}K samples, ${COLS} features`, async () => {
+    // Data: the column at index COLS is taken as the target and removed from the features
+    const df = classificationDataset(ROWS_K * 1000, COLS, false);
+    const features = df.columns;
+    const target = features.byIndex(COLS);
+    features.remove(target.name);
+    // Fit & pack trained model
+    const model = new SoftmaxClassifier({
+      classesCount: target.categories.length,
+      featuresCount: features.length,
+    });
+    await model.fit(features, target);
+    const modelBytes = model.toBytes();
+    // Unpack & apply model (the prediction itself is not checked in the benchmark)
+    const unpackedModel = new SoftmaxClassifier(undefined, modelBytes);
+    unpackedModel.predict(features);
+  }, {timeout: TIMEOUT, benchmark: true});
+  test('Correctness', async () => {
+    // Prepare data: the column at index MIN_COLS is taken as the target and removed from the features
+    const df = classificationDataset(ROWS_K, MIN_COLS, true);
+    const features = df.columns;
+    const target = features.byIndex(MIN_COLS);
+    features.remove(target.name);
+    // Fit & pack trained model
+    const model = new SoftmaxClassifier({
+      classesCount: target.categories.length,
+      featuresCount: features.length,
+    });
+    await model.fit(features, target);
+    const modelBytes = model.toBytes();
+    // Unpack & apply model: the round trip through bytes is part of what is tested
+    const unpackedModel = new SoftmaxClassifier(undefined, modelBytes);
+    const prediction = unpackedModel.predict(features);
+    // Evaluate accuracy; the message states the same condition the assertion checks
+    const acc = accuracy(target, prediction);
+    expect(
+      acc > MIN_ACCURACY,
+      true,
+      `Softmax failed, too small accuracy: ${acc}; expected: > ${MIN_ACCURACY}`,
+    );
+  }, {timeout: TIMEOUT});
+}); // Softmax
+category('XGBoost', () => {
+  test(`Performance: ${ROWS_K}K samples, ${COLS} features`, async () => {
+    // Data: the column at index COLS is taken as the target and removed from the features
+    const df = classificationDataset(ROWS_K * 1000, COLS, false);
+    const features = df.columns;
+    const target = features.byIndex(COLS);
+    features.remove(target.name);
+    // Fit & pack trained model (fit is called without explicit hyperparameters)
+    const model = new XGBooster();
+    await model.fit(features, target);
+    const modelBytes = model.toBytes();
+    // Unpack & apply model (the prediction itself is not checked in the benchmark)
+    const unpackedModel = new XGBooster(modelBytes);
+    unpackedModel.predict(features);
+  }, {timeout: TIMEOUT, benchmark: true});
+  test('Correctness', async () => {
+    // Prepare data: the column at index MIN_COLS is taken as the target and removed from the features
+    const df = classificationDataset(ROWS_K, MIN_COLS, true);
+    const features = df.columns;
+    const target = features.byIndex(MIN_COLS);
+    features.remove(target.name);
+    // Fit & pack trained model (fit is called without explicit hyperparameters)
+    const model = new XGBooster();
+    await model.fit(features, target);
+    const modelBytes = model.toBytes();
+    // Unpack & apply model: the round trip through bytes is part of what is tested
+    const unpackedModel = new XGBooster(modelBytes);
+    const prediction = unpackedModel.predict(features);
+    // Evaluate accuracy; the message states the same condition the assertion checks
+    const acc = accuracy(target, prediction);
+    expect(
+      acc > MIN_ACCURACY,
+      true,
+      `XGBoost failed, too small accuracy: ${acc}; expected: > ${MIN_ACCURACY}`,
+    );
+  }, {timeout: TIMEOUT});
+}); // XGBoost