npm - @datagrok/eda - Versions diffs - 1.4.11 → 1.4.13 - Mend

@datagrok/eda 1.4.11 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

package/.eslintrc.json +0 -1
package/CHANGELOG.md +15 -0
package/CLAUDE.md +185 -0
package/README.md +8 -0
package/css/pmpo.css +35 -0
package/dist/package-test.js +1 -1
package/dist/package-test.js.map +1 -1
package/dist/package.js +1 -1
package/dist/package.js.map +1 -1
package/eslintrc.json +45 -0
package/files/drugs-props-test.csv +126 -0
package/files/drugs-props-train-scores.csv +664 -0
package/files/drugs-props-train.csv +664 -0
package/package.json +9 -3
package/src/anova/anova-tools.ts +1 -1
package/src/anova/anova-ui.ts +1 -1
package/src/package-api.ts +18 -0
package/src/package-test.ts +4 -1
package/src/package.g.ts +25 -0
package/src/package.ts +55 -15
package/src/pareto-optimization/pareto-computations.ts +6 -0
package/src/pareto-optimization/utils.ts +6 -4
package/src/probabilistic-scoring/data-generator.ts +157 -0
package/src/probabilistic-scoring/nelder-mead.ts +204 -0
package/src/probabilistic-scoring/pmpo-defs.ts +218 -0
package/src/probabilistic-scoring/pmpo-utils.ts +603 -0
package/src/probabilistic-scoring/prob-scoring.ts +991 -0
package/src/probabilistic-scoring/stat-tools.ts +303 -0
package/src/softmax-classifier.ts +1 -1
package/src/tests/anova-tests.ts +1 -1
package/src/tests/classifiers-tests.ts +1 -1
package/src/tests/dim-reduction-tests.ts +1 -1
package/src/tests/linear-methods-tests.ts +1 -1
package/src/tests/mis-vals-imputation-tests.ts +1 -1
package/src/tests/pareto-tests.ts +253 -0
package/src/tests/pmpo-tests.ts +157 -0
package/test-console-output-1.log +175 -209
package/test-record-1.mp4 +0 -0

package/package.json CHANGED Viewed

@@ -1,11 +1,12 @@
 {
   "name": "@datagrok/eda",
   "friendlyName": "EDA",
-  "version": "1.4.11",
+  "version": "1.4.13",
   "description": "Exploratory Data Analysis Tools",
   "dependencies": {
     "@datagrok-libraries/math": "^1.2.6",
     "@datagrok-libraries/ml": "^6.10.8",
+    "@datagrok-libraries/statistics": "^1.10.0",
     "@datagrok-libraries/tutorials": "^1.7.4",
     "@datagrok-libraries/utils": "^4.6.5",
     "@keckelt/tsne": "^1.0.2",
@@ -14,10 +15,12 @@
     "datagrok-api": "^1.26.3",
     "dayjs": "^1.11.9",
     "jstat": "^1.9.6",
+    "mathjs": "^15.1.0",
     "source-map-loader": "^4.0.1",
     "umap-js": "^1.3.3",
     "worker-loader": "^3.0.8",
-    "wu": "^2.1.0"
+    "wu": "^2.1.0",
+    "@datagrok-libraries/test": "^1.1.0"
   },
   "author": {
     "name": "Viktor Makarichev",
@@ -27,7 +30,7 @@
     "@typescript-eslint/eslint-plugin": "^5.32.0",
     "@typescript-eslint/parser": "^5.32.0",
     "css-loader": "^7.1.2",
-    "datagrok-tools": "^4.14.55",
+    "datagrok-tools": "^5.1.5",
     "eslint": "^8.21.0",
     "eslint-config-google": "^0.14.0",
     "style-loader": "^4.0.0",
@@ -95,5 +98,8 @@
         "Reduce Dimensionality": null
       }
     }
+  },
+  "overrides": {
+    "datagrok-api": "$datagrok-api"
   }
 }

package/src/anova/anova-tools.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-// Analysis of Variances (ANOVA): computations
+// Analysis of Variances (ANOVA) - computations
 /* REFERENCES

package/src/anova/anova-ui.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-// Analysis of Variances (ANOVA): UI
+// Analysis of Variances (ANOVA) - UI
 import * as grok from 'datagrok-api/grok';
 import * as ui from 'datagrok-api/ui';

package/src/package-api.ts CHANGED Viewed

@@ -274,4 +274,22 @@ export namespace funcs {
   export async function paretoFrontViewer(): Promise<any> {
     return await grok.functions.call('EDA:ParetoFrontViewer', {});
   }
+  /**
+  Train probabilistic multi-parameter optimization (pMPO) model
+  */
+  export async function trainPmpo(): Promise<void> {
+    return await grok.functions.call('EDA:TrainPmpo', {});
+  }
+  export async function getPmpoAppItems(view: DG.View ): Promise<any> {
+    return await grok.functions.call('EDA:GetPmpoAppItems', { view });
+  }
+  /**
+  Generates a synthetic dataset oriented toward pMPO modeling
+  */
+  export async function generatePmpoDataset(samples: number ): Promise<DG.DataFrame> {
+    return await grok.functions.call('EDA:GeneratePmpoDataset', { samples });
+  }
 }

package/src/package-test.ts CHANGED Viewed

@@ -1,10 +1,13 @@
 import * as DG from 'datagrok-api/dg';
-import {runTests, tests, TestContext, initAutoTests as initTests} from '@datagrok-libraries/utils/src/test';
+import {runTests, tests, TestContext, initAutoTests as initTests} from '@datagrok-libraries/test/src/test';
 import './tests/dim-reduction-tests';
 import './tests/linear-methods-tests';
 import './tests/classifiers-tests';
 import './tests/mis-vals-imputation-tests';
 import './tests/anova-tests';
+import './tests/pmpo-tests';
+import './tests/pareto-tests';
 export const _package = new DG.Package();
 export {tests};

package/src/package.g.ts CHANGED Viewed

@@ -7,6 +7,7 @@ export function info() : void {
 }
 //tags: init
+//meta.role: init
 export async function init() : Promise<void> {
   await PackageFunctions.init();
 }
@@ -43,6 +44,7 @@ export async function PCA(table: DG.DataFrame, features: DG.ColumnList, componen
 //input: double epsilon = 0.01 { description: Minimum distance between two points to be considered as in the same neighborhood. }
 //input: int minimumPoints = 5 { description: Minimum number of points to form a dense region. }
 //meta.defaultPostProcessingFunction: true
+//meta.role: dim-red-postprocessing-function
 export async function dbscanPostProcessingFunction(col1: DG.Column, col2: DG.Column, epsilon: number, minimumPoints: number) : Promise<void> {
   await PackageFunctions.dbscanPostProcessingFunction(col1, col2, epsilon, minimumPoints);
 }
@@ -54,6 +56,7 @@ export async function dbscanPostProcessingFunction(col1: DG.Column, col2: DG.Col
 //output: object result
 //meta.supportedTypes: int,float,double,qnum
 //meta.supportedDistanceFunctions: Difference
+//meta.role: dim-red-preprocessing-function
 export function numberPreprocessingFunction(col: DG.Column, _metric: string) {
   return PackageFunctions.numberPreprocessingFunction(col, _metric);
 }
@@ -65,6 +68,7 @@ export function numberPreprocessingFunction(col: DG.Column, _metric: string) {
 //output: object result
 //meta.supportedTypes: string
 //meta.supportedDistanceFunctions: One-Hot,Levenshtein,Hamming
+//meta.role: dim-red-preprocessing-function
 export function stringPreprocessingFunction(col: DG.Column, _metric: string) {
   return PackageFunctions.stringPreprocessingFunction(col, _metric);
 }
@@ -77,6 +81,7 @@ export async function reduceDimensionality() : Promise<void> {
 //tags: editor
 //input: funccall call
+//meta.role: editor
 export function GetMCLEditor(call: DG.FuncCall) : void {
   PackageFunctions.GetMCLEditor(call);
 }
@@ -105,6 +110,7 @@ export async function MCLClustering(df: DG.DataFrame, cols: DG.Column[], metrics
 //tags: viewer
 //output: viewer result
 //meta.showInGallery: false
+//meta.role: viewer
 export function markovClusteringViewer() : any {
   return PackageFunctions.markovClusteringViewer();
 }
@@ -535,6 +541,25 @@ export function paretoFront() : void {
 //tags: viewer
 //output: viewer result
 //meta.icon: icons/pareto-front-viewer.svg
+//meta.role: viewer
 export function paretoFrontViewer() : any {
   return PackageFunctions.paretoFrontViewer();
 }
+//description: Train probabilistic multi-parameter optimization (pMPO) model
+export function trainPmpo() : void {
+  PackageFunctions.trainPmpo();
+}
+//input: view view
+//output: object result
+export function getPmpoAppItems(view: any) : any {
+  return PackageFunctions.getPmpoAppItems(view);
+}
+//description: Generates syntethetic dataset oriented on the pMPO modeling
+//input: int samples
+//output: dataframe Synthetic
+export async function generatePmpoDataset(samples: number) : Promise<any> {
+  return await PackageFunctions.generatePmpoDataset(samples);
+}

package/src/package.ts CHANGED Viewed

@@ -36,9 +36,13 @@ import {SoftmaxClassifier} from './softmax-classifier';
 import {initXgboost} from '../wasm/xgbooster';
 import {XGBooster} from './xgbooster';
 import {ParetoOptimizer} from './pareto-optimization/pareto-optimizer';
 import {ParetoFrontViewer} from './pareto-optimization/pareto-front-viewer';
+import {Pmpo} from './probabilistic-scoring/prob-scoring';
+import {getSynteticPmpoData} from './probabilistic-scoring/data-generator';
 export const _package = new DG.Package();
 export * from './package.g';
@@ -51,7 +55,7 @@ export class PackageFunctions {
   }
-  @grok.decorators.init({})
+  @grok.decorators.init({tags: ['init']})
   static async init(): Promise<void> {
     await _initEDAAPI();
     await initXgboost();
@@ -113,13 +117,9 @@ export class PackageFunctions {
   @grok.decorators.func({
-    'meta': {
-      'defaultPostProcessingFunction': 'true',
-    },
-    'tags': [
-      'dim-red-postprocessing-function',
-    ],
+    'meta': {'defaultPostProcessingFunction': 'true', 'role': 'dim-red-postprocessing-function'},
     'name': 'DBSCAN clustering',
+    'tags': ['dim-red-postprocessing-function'],
   })
   static async dbscanPostProcessingFunction(
     col1: DG.Column,
@@ -148,9 +148,10 @@ export class PackageFunctions {
     'meta': {
       'supportedTypes': 'int,float,double,qnum',
       'supportedDistanceFunctions': 'Difference',
+      'role': 'dim-red-preprocessing-function',
     },
-    'tags': ['dim-red-preprocessing-function'],
     'name': 'None (number)',
+    'tags': ['dim-red-preprocessing-function'],
     'outputs': [{name: 'result', type: 'object'}],
   })
   static numberPreprocessingFunction(
@@ -166,6 +167,7 @@ export class PackageFunctions {
     'meta': {
       'supportedTypes': 'string',
       'supportedDistanceFunctions': 'One-Hot,Levenshtein,Hamming',
+      'role': 'dim-red-preprocessing-function',
     },
     'tags': ['dim-red-preprocessing-function'],
     'name': 'None (string)',
@@ -222,7 +224,7 @@ export class PackageFunctions {
   }
-  @grok.decorators.editor()
+  @grok.decorators.editor({tags: ['editor']})
   static GetMCLEditor(
     call: DG.FuncCall): void {
     try {
@@ -289,10 +291,8 @@ export class PackageFunctions {
   @grok.decorators.func({
     'outputs': [{'name': 'result', 'type': 'viewer'}],
-    'tags': [
-      'viewer',
-    ],
-    'meta': {showInGallery: 'false'},
+    'meta': {showInGallery: 'false', role: 'viewer'},
+    'tags': ['viewer'],
     'name': 'MCL',
     'description': 'Markov clustering viewer',
   })
@@ -984,11 +984,51 @@ export class PackageFunctions {
   @grok.decorators.func({
     'name': 'Pareto front',
     'description': 'Pareto front viewer',
-    'tags': ['viewer'],
     'outputs': [{'name': 'result', 'type': 'viewer'}],
-    'meta': {'icon': 'icons/pareto-front-viewer.svg'},
+    'meta': {'icon': 'icons/pareto-front-viewer.svg', 'role': 'viewer'},
+    'tags': ['viewer'],
   })
   static paretoFrontViewer(): DG.Viewer {
     return new ParetoFrontViewer();
   }
+  @grok.decorators.func({
+    'name': 'trainPmpo',
+    'description': 'Train probabilistic multi-parameter optimization (pMPO) model',
+  })
+  static trainPmpo(): void {
+    const df = grok.shell.t;
+    if (df === null) {
+      grok.shell.warning('No dataframe is opened');
+      return;
+    }
+    if (!Pmpo.isTableValid(df))
+      return;
+    const pMPO = new Pmpo(df);
+    pMPO.runTrainingApp();
+  }
+  @grok.decorators.func({'name': 'getPmpoAppItems', 'outputs': [{name: 'result', type: 'object'}]})
+  static getPmpoAppItems(@grok.decorators.param({type: 'view'}) view: DG.TableView): any | null {
+    const df = view.dataFrame;
+    if (!Pmpo.isTableValid(df))
+      return null;
+    const pMPO = new Pmpo(df, view);
+    return pMPO.getPmpoAppItems();
+  }
+  @grok.decorators.func({
+    'name': 'generatePmpoDataset',
+    'description': 'Generates syntethetic dataset oriented on the pMPO modeling',
+    'outputs': [{name: 'Synthetic', type: 'dataframe'}],
+  })
+  static async generatePmpoDataset(@grok.decorators.param({'type': 'int'}) samples: number): Promise<DG.DataFrame> {
+    const df = await getSynteticPmpoData(samples);
+    df.name = 'Synthetic';
+    return df;
+  }
 }

package/src/pareto-optimization/pareto-computations.ts CHANGED Viewed

@@ -2,6 +2,12 @@
 import {NumericArray, OPT_TYPE} from './defs';
+/** Computes the Pareto front mask for a given dataset and optimization sense
+ * @param rawData Array of numeric arrays representing the dataset (each array corresponds to a feature/dimension)
+ * @param sense Array of optimization types (OPT_TYPE.MIN or OPT_TYPE.MAX) for each dimension
+ * @param nPoints Number of data points in the dataset
+ * @param nullIndices Optional set of indices corresponding to missing values (these points will be marked as non-optimal)
+ * @returns Boolean array where true indicates that the point is on the Pareto front */
 export function getParetoMask(rawData: NumericArray[], sense: OPT_TYPE[], nPoints: number,
   nullIndices?: Set<number>): boolean[] {
   if (nPoints === 0)

package/src/pareto-optimization/utils.ts CHANGED Viewed

@@ -6,6 +6,7 @@ import {OPT_TYPE} from './defs';
 export const PALETTE = [DG.Color.darkGreen, DG.Color.yellow, DG.Color.darkRed];
+/** Return output color palette w.r.t. the specified type of optimization */
 export function getOutputPalette(type: OPT_TYPE): number[] {
   if (type === OPT_TYPE.MIN)
     return [...PALETTE];
@@ -13,13 +14,14 @@ export function getOutputPalette(type: OPT_TYPE): number[] {
   return [...PALETTE].reverse();
 }
-export function getColorScaleDiv(type: OPT_TYPE): HTMLElement {
+/** Return div with color scale description */
+export function getColorScaleDiv(type: OPT_TYPE, useMinMax: boolean = true): HTMLElement {
   const scale = ui.label('Color scale:');
   scale.style.paddingRight = '7px';
   const elems = [scale];
-  const minLbl = ui.label('min');
+  const minLbl = ui.label(useMinMax ? 'min' : 'worst');
   const midLbl = ui.label('. . .');
-  const maxLbl = ui.label('max');
+  const maxLbl = ui.label(useMinMax ? 'max' : 'best');
   const palette = getOutputPalette(type);
   const colorElems = [minLbl, midLbl, maxLbl].map((el, idx) => {
@@ -36,4 +38,4 @@ export function getColorScaleDiv(type: OPT_TYPE): HTMLElement {
   elems.push(...colorElems);
   return ui.divH(elems);
-}
+} // getColorScaleDiv

package/src/probabilistic-scoring/data-generator.ts ADDED Viewed

@@ -0,0 +1,157 @@
+import * as grok from 'datagrok-api/grok';
+import * as ui from 'datagrok-api/ui';
+import * as DG from 'datagrok-api/dg';
+import {DescriptorStatistics, SOURCE_PATH, SYNTHETIC_DRUG_NAME} from './pmpo-defs';
+import {getDescriptorStatistics, getDesiredTables} from './stat-tools';
+//@ts-ignore: no types
+import * as jStat from 'jstat';
+/** Generates synthetic data for pMPO model training and testing
+ * @param samplesCount Number of samples to generate
+ * @returns DataFrame with generated data */
+export async function getSynteticPmpoData(samplesCount: number): Promise<DG.DataFrame> {
+  const df = await grok.dapi.files.readCsv(SOURCE_PATH);
+  const generator = new PmpoDataGenerator(df, 'Drug', 'CNS', 'Smiles');
+  return generator.getGenerated(samplesCount);
+}
+/** Class for generating synthetic data for pMPO model training and testing */
+export class PmpoDataGenerator {
+  private sourceDf: DG.DataFrame;
+  private drugName: string;
+  private desirabilityColName: string;
+  private smilesColName: string;
+  private desiredProbability: number;
+  private descriptorStats: Map<string, DescriptorStatistics>;
+  constructor(df: DG.DataFrame, drugName: string, desirabilityColName: string, smilesColName: string) {
+    this.sourceDf = df;
+    this.drugName = drugName;
+    this.desirabilityColName = desirabilityColName;
+    this.smilesColName = smilesColName;
+    const descriptorNames = df.columns.toList().filter((col) => col.isNumerical).map((col) => col.name);
+    const {desired, nonDesired} = getDesiredTables(df, df.col(desirabilityColName)!);
+    // Compute descriptors' statistics
+    this.descriptorStats = new Map<string, DescriptorStatistics>();
+    descriptorNames.forEach((name) => {
+      this.descriptorStats.set(name, getDescriptorStatistics(desired.col(name)!, nonDesired.col(name)!));
+    });
+    // Probability of desired class
+    this.desiredProbability = desired.rowCount / df.rowCount;
+  } // constructor
+  /** Generates synthetic data for pMPO model training and testing
+   * @param samplesCount Number of samples to generate
+   * @returns DataFrame with generated data */
+  public getGenerated(samplesCount: number): DG.DataFrame {
+    if (samplesCount <= 1)
+      throw new Error('Failed to generate pMPO data: sample count must be greater than 1.');
+    let result: DG.DataFrame;
+    /* Use rows from the source dataframe if the requested sample count
+       is less than or equal to the source dataframe row count */
+    if (samplesCount <= this.sourceDf.rowCount) {
+      const rowMask = DG.BitSet.create(this.sourceDf.rowCount);
+      for (let i = 0; i < samplesCount; ++i)
+        rowMask.set(i, true);
+      result = this.sourceDf.clone(rowMask);
+    } else {
+      const cloneDf = this.getClonedSourceDfWithFloatNumericCols();
+      result = cloneDf.append(this.getSyntheticTable(samplesCount - this.sourceDf.rowCount));
+    }
+    // Check boolean columns and ensure non-zero stdev
+    for (const col of result.columns) {
+      if (col.type === DG.COLUMN_TYPE.BOOL && col.stats.stdev === 0) {
+        // All values are the same: invert the first two values (rows 0 and 1)
+        let value = col.get(0);
+        col.set(0, !value);
+        value = col.get(1);
+        col.set(1, !value);
+      }
+    }
+    return result;
+  } // getGenerated
+  /** Generates a synthetic data table
+   * @param samplesCount Number of samples to generate
+   * @returns DataFrame with synthetic data */
+  private getSyntheticTable(samplesCount: number): DG.DataFrame {
+    const desirabilityRaw = new Array<boolean>(samplesCount);
+    for (let i = 0; i < samplesCount; ++i)
+      desirabilityRaw[i] = (Math.random() < this.desiredProbability);
+    const cols = [
+      this.getDrugColumn(samplesCount),
+      this.getSmilesColumn(samplesCount),
+      DG.Column.fromList(DG.COLUMN_TYPE.BOOL, this.desirabilityColName, desirabilityRaw),
+    ];
+    this.descriptorStats.forEach((stat, name) => {
+      const arr = new Float32Array(samplesCount);
+      for (let i = 0; i < samplesCount; ++i) {
+        if (desirabilityRaw[i])
+          arr[i] = jStat.normal.sample(stat.desAvg, stat.desStd);
+        else
+          arr[i] = jStat.normal.sample(stat.nonDesAvg, stat.nonDesStd);
+      }
+      // @ts-ignore
+      cols.push(DG.Column.fromFloat32Array(name, arr));
+    });
+    return DG.DataFrame.fromColumns(cols);
+  } // getSyntheticTable
+  /** Generates a column with synthetic drug names
+   * @param samplesCount Number of samples to generate
+   * @returns Column with synthetic drug names */
+  private getDrugColumn(samplesCount: number): DG.Column<string> {
+    return DG.Column.fromList(
+      DG.COLUMN_TYPE.STRING,
+      this.drugName,
+      Array.from({length: samplesCount}, (_, i) => `${SYNTHETIC_DRUG_NAME} ${i + 1}`));
+  }
+  /** Generates a column with synthetic SMILES strings
+   * @param samplesCount Number of samples to generate
+   * @returns Column with synthetic SMILES strings */
+  private getSmilesColumn(samplesCount: number): DG.Column<string> {
+    return DG.Column.fromList(
+      DG.COLUMN_TYPE.STRING,
+      this.smilesColName,
+      Array.from({length: samplesCount}, () => 'C'));
+  }
+  /** Clones the source dataframe converting numerical columns to Float type
+   * @returns Cloned dataframe */
+  private getClonedSourceDfWithFloatNumericCols(): DG.DataFrame {
+    const cols: DG.Column[] = [];
+    this.sourceDf.columns.toList().forEach((col) => {
+      if (col.isNumerical)
+        cols.push(col.clone().convertTo(DG.COLUMN_TYPE.FLOAT));
+      else
+        cols.push(col.clone());
+    });
+    const clone = DG.DataFrame.fromColumns(cols);
+    clone.name = this.sourceDf.name;
+    return clone;
+  }
+} // PmpoDataGenerator