@datagrok/eda 1.4.11 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +0 -1
- package/CHANGELOG.md +15 -0
- package/CLAUDE.md +185 -0
- package/README.md +8 -0
- package/css/pmpo.css +35 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/eslintrc.json +45 -0
- package/files/drugs-props-test.csv +126 -0
- package/files/drugs-props-train-scores.csv +664 -0
- package/files/drugs-props-train.csv +664 -0
- package/package.json +9 -3
- package/src/anova/anova-tools.ts +1 -1
- package/src/anova/anova-ui.ts +1 -1
- package/src/package-api.ts +18 -0
- package/src/package-test.ts +4 -1
- package/src/package.g.ts +25 -0
- package/src/package.ts +55 -15
- package/src/pareto-optimization/pareto-computations.ts +6 -0
- package/src/pareto-optimization/utils.ts +6 -4
- package/src/probabilistic-scoring/data-generator.ts +157 -0
- package/src/probabilistic-scoring/nelder-mead.ts +204 -0
- package/src/probabilistic-scoring/pmpo-defs.ts +218 -0
- package/src/probabilistic-scoring/pmpo-utils.ts +603 -0
- package/src/probabilistic-scoring/prob-scoring.ts +991 -0
- package/src/probabilistic-scoring/stat-tools.ts +303 -0
- package/src/softmax-classifier.ts +1 -1
- package/src/tests/anova-tests.ts +1 -1
- package/src/tests/classifiers-tests.ts +1 -1
- package/src/tests/dim-reduction-tests.ts +1 -1
- package/src/tests/linear-methods-tests.ts +1 -1
- package/src/tests/mis-vals-imputation-tests.ts +1 -1
- package/src/tests/pareto-tests.ts +253 -0
- package/src/tests/pmpo-tests.ts +157 -0
- package/test-console-output-1.log +175 -209
- package/test-record-1.mp4 +0 -0
package/package.json
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@datagrok/eda",
|
|
3
3
|
"friendlyName": "EDA",
|
|
4
|
-
"version": "1.4.
|
|
4
|
+
"version": "1.4.13",
|
|
5
5
|
"description": "Exploratory Data Analysis Tools",
|
|
6
6
|
"dependencies": {
|
|
7
7
|
"@datagrok-libraries/math": "^1.2.6",
|
|
8
8
|
"@datagrok-libraries/ml": "^6.10.8",
|
|
9
|
+
"@datagrok-libraries/statistics": "^1.10.0",
|
|
9
10
|
"@datagrok-libraries/tutorials": "^1.7.4",
|
|
10
11
|
"@datagrok-libraries/utils": "^4.6.5",
|
|
11
12
|
"@keckelt/tsne": "^1.0.2",
|
|
@@ -14,10 +15,12 @@
|
|
|
14
15
|
"datagrok-api": "^1.26.3",
|
|
15
16
|
"dayjs": "^1.11.9",
|
|
16
17
|
"jstat": "^1.9.6",
|
|
18
|
+
"mathjs": "^15.1.0",
|
|
17
19
|
"source-map-loader": "^4.0.1",
|
|
18
20
|
"umap-js": "^1.3.3",
|
|
19
21
|
"worker-loader": "^3.0.8",
|
|
20
|
-
"wu": "^2.1.0"
|
|
22
|
+
"wu": "^2.1.0",
|
|
23
|
+
"@datagrok-libraries/test": "^1.1.0"
|
|
21
24
|
},
|
|
22
25
|
"author": {
|
|
23
26
|
"name": "Viktor Makarichev",
|
|
@@ -27,7 +30,7 @@
|
|
|
27
30
|
"@typescript-eslint/eslint-plugin": "^5.32.0",
|
|
28
31
|
"@typescript-eslint/parser": "^5.32.0",
|
|
29
32
|
"css-loader": "^7.1.2",
|
|
30
|
-
"datagrok-tools": "^
|
|
33
|
+
"datagrok-tools": "^5.1.5",
|
|
31
34
|
"eslint": "^8.21.0",
|
|
32
35
|
"eslint-config-google": "^0.14.0",
|
|
33
36
|
"style-loader": "^4.0.0",
|
|
@@ -95,5 +98,8 @@
|
|
|
95
98
|
"Reduce Dimensionality": null
|
|
96
99
|
}
|
|
97
100
|
}
|
|
101
|
+
},
|
|
102
|
+
"overrides": {
|
|
103
|
+
"datagrok-api": "$datagrok-api"
|
|
98
104
|
}
|
|
99
105
|
}
|
package/src/anova/anova-tools.ts
CHANGED
package/src/anova/anova-ui.ts
CHANGED
package/src/package-api.ts
CHANGED
|
@@ -274,4 +274,22 @@ export namespace funcs {
|
|
|
274
274
|
export async function paretoFrontViewer(): Promise<any> {
|
|
275
275
|
return await grok.functions.call('EDA:ParetoFrontViewer', {});
|
|
276
276
|
}
|
|
277
|
+
|
|
278
|
+
/**
 * Trains a probabilistic multi-parameter optimization (pMPO) model.
 * Forwards to the platform function 'EDA:TrainPmpo' with an empty argument set.
 */
export async function trainPmpo(): Promise<void> {
  const outcome = await grok.functions.call('EDA:TrainPmpo', {});
  return outcome;
}
|
|
284
|
+
|
|
285
|
+
/**
 * Calls the platform function 'EDA:GetPmpoAppItems' for the given view.
 * @param view View passed to the function as its `view` argument
 * @returns Whatever the platform function call resolves to
 */
export async function getPmpoAppItems(view: DG.View): Promise<any> {
  const callArgs = {view};
  return await grok.functions.call('EDA:GetPmpoAppItems', callArgs);
}
|
|
288
|
+
|
|
289
|
+
/**
 * Generates a synthetic dataset oriented on the pMPO modeling.
 * Forwards to the platform function 'EDA:GeneratePmpoDataset'.
 * @param samples Value passed to the function as its `samples` argument
 * @returns Dataframe the platform function call resolves to
 */
export async function generatePmpoDataset(samples: number): Promise<DG.DataFrame> {
  const callArgs = {samples};
  return await grok.functions.call('EDA:GeneratePmpoDataset', callArgs);
}
|
|
277
295
|
}
|
package/src/package-test.ts
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import * as DG from 'datagrok-api/dg';
|
|
2
|
-
import {runTests, tests, TestContext, initAutoTests as initTests} from '@datagrok-libraries/
|
|
2
|
+
import {runTests, tests, TestContext, initAutoTests as initTests} from '@datagrok-libraries/test/src/test';
|
|
3
3
|
import './tests/dim-reduction-tests';
|
|
4
4
|
import './tests/linear-methods-tests';
|
|
5
5
|
import './tests/classifiers-tests';
|
|
6
6
|
import './tests/mis-vals-imputation-tests';
|
|
7
7
|
import './tests/anova-tests';
|
|
8
|
+
import './tests/pmpo-tests';
|
|
9
|
+
import './tests/pareto-tests';
|
|
10
|
+
|
|
8
11
|
export const _package = new DG.Package();
|
|
9
12
|
export {tests};
|
|
10
13
|
|
package/src/package.g.ts
CHANGED
|
@@ -7,6 +7,7 @@ export function info() : void {
|
|
|
7
7
|
}
|
|
8
8
|
|
|
9
9
|
//tags: init
|
|
10
|
+
//meta.role: init
|
|
10
11
|
export async function init() : Promise<void> {
|
|
11
12
|
await PackageFunctions.init();
|
|
12
13
|
}
|
|
@@ -43,6 +44,7 @@ export async function PCA(table: DG.DataFrame, features: DG.ColumnList, componen
|
|
|
43
44
|
//input: double epsilon = 0.01 { description: Minimum distance between two points to be considered as in the same neighborhood. }
|
|
44
45
|
//input: int minimumPoints = 5 { description: Minimum number of points to form a dense region. }
|
|
45
46
|
//meta.defaultPostProcessingFunction: true
|
|
47
|
+
//meta.role: dim-red-postprocessing-function
|
|
46
48
|
export async function dbscanPostProcessingFunction(col1: DG.Column, col2: DG.Column, epsilon: number, minimumPoints: number) : Promise<void> {
|
|
47
49
|
await PackageFunctions.dbscanPostProcessingFunction(col1, col2, epsilon, minimumPoints);
|
|
48
50
|
}
|
|
@@ -54,6 +56,7 @@ export async function dbscanPostProcessingFunction(col1: DG.Column, col2: DG.Col
|
|
|
54
56
|
//output: object result
|
|
55
57
|
//meta.supportedTypes: int,float,double,qnum
|
|
56
58
|
//meta.supportedDistanceFunctions: Difference
|
|
59
|
+
//meta.role: dim-red-preprocessing-function
|
|
57
60
|
export function numberPreprocessingFunction(col: DG.Column, _metric: string) {
|
|
58
61
|
return PackageFunctions.numberPreprocessingFunction(col, _metric);
|
|
59
62
|
}
|
|
@@ -65,6 +68,7 @@ export function numberPreprocessingFunction(col: DG.Column, _metric: string) {
|
|
|
65
68
|
//output: object result
|
|
66
69
|
//meta.supportedTypes: string
|
|
67
70
|
//meta.supportedDistanceFunctions: One-Hot,Levenshtein,Hamming
|
|
71
|
+
//meta.role: dim-red-preprocessing-function
|
|
68
72
|
export function stringPreprocessingFunction(col: DG.Column, _metric: string) {
|
|
69
73
|
return PackageFunctions.stringPreprocessingFunction(col, _metric);
|
|
70
74
|
}
|
|
@@ -77,6 +81,7 @@ export async function reduceDimensionality() : Promise<void> {
|
|
|
77
81
|
|
|
78
82
|
//tags: editor
|
|
79
83
|
//input: funccall call
|
|
84
|
+
//meta.role: editor
|
|
80
85
|
export function GetMCLEditor(call: DG.FuncCall) : void {
|
|
81
86
|
PackageFunctions.GetMCLEditor(call);
|
|
82
87
|
}
|
|
@@ -105,6 +110,7 @@ export async function MCLClustering(df: DG.DataFrame, cols: DG.Column[], metrics
|
|
|
105
110
|
//tags: viewer
|
|
106
111
|
//output: viewer result
|
|
107
112
|
//meta.showInGallery: false
|
|
113
|
+
//meta.role: viewer
|
|
108
114
|
export function markovClusteringViewer() : any {
|
|
109
115
|
return PackageFunctions.markovClusteringViewer();
|
|
110
116
|
}
|
|
@@ -535,6 +541,25 @@ export function paretoFront() : void {
|
|
|
535
541
|
//tags: viewer
|
|
536
542
|
//output: viewer result
|
|
537
543
|
//meta.icon: icons/pareto-front-viewer.svg
|
|
544
|
+
//meta.role: viewer
|
|
538
545
|
export function paretoFrontViewer() : any {
|
|
539
546
|
return PackageFunctions.paretoFrontViewer();
|
|
540
547
|
}
|
|
548
|
+
|
|
549
|
+
//description: Train probabilistic multi-parameter optimization (pMPO) model
export function trainPmpo() : void {
  // Thin wrapper: all work is delegated to the PackageFunctions static method
  return PackageFunctions.trainPmpo();
}
|
|
553
|
+
|
|
554
|
+
//input: view view
//output: object result
export function getPmpoAppItems(view: any) : any {
  // Thin wrapper: the view is passed through unchanged to PackageFunctions
  const items = PackageFunctions.getPmpoAppItems(view);
  return items;
}
|
|
559
|
+
|
|
560
|
+
//description: Generates syntethetic dataset oriented on the pMPO modeling
//input: int samples
//output: dataframe Synthetic
export async function generatePmpoDataset(samples: number) : Promise<any> {
  // Thin wrapper: awaits the PackageFunctions implementation and returns its result
  const synthetic = await PackageFunctions.generatePmpoDataset(samples);
  return synthetic;
}
|
package/src/package.ts
CHANGED
|
@@ -36,9 +36,13 @@ import {SoftmaxClassifier} from './softmax-classifier';
|
|
|
36
36
|
|
|
37
37
|
import {initXgboost} from '../wasm/xgbooster';
|
|
38
38
|
import {XGBooster} from './xgbooster';
|
|
39
|
+
|
|
39
40
|
import {ParetoOptimizer} from './pareto-optimization/pareto-optimizer';
|
|
40
41
|
import {ParetoFrontViewer} from './pareto-optimization/pareto-front-viewer';
|
|
41
42
|
|
|
43
|
+
import {Pmpo} from './probabilistic-scoring/prob-scoring';
|
|
44
|
+
import {getSynteticPmpoData} from './probabilistic-scoring/data-generator';
|
|
45
|
+
|
|
42
46
|
export const _package = new DG.Package();
|
|
43
47
|
export * from './package.g';
|
|
44
48
|
|
|
@@ -51,7 +55,7 @@ export class PackageFunctions {
|
|
|
51
55
|
}
|
|
52
56
|
|
|
53
57
|
|
|
54
|
-
@grok.decorators.init({})
|
|
58
|
+
@grok.decorators.init({tags: ['init']})
|
|
55
59
|
static async init(): Promise<void> {
|
|
56
60
|
await _initEDAAPI();
|
|
57
61
|
await initXgboost();
|
|
@@ -113,13 +117,9 @@ export class PackageFunctions {
|
|
|
113
117
|
|
|
114
118
|
|
|
115
119
|
@grok.decorators.func({
|
|
116
|
-
'meta': {
|
|
117
|
-
'defaultPostProcessingFunction': 'true',
|
|
118
|
-
},
|
|
119
|
-
'tags': [
|
|
120
|
-
'dim-red-postprocessing-function',
|
|
121
|
-
],
|
|
120
|
+
'meta': {'defaultPostProcessingFunction': 'true', 'role': 'dim-red-postprocessing-function'},
|
|
122
121
|
'name': 'DBSCAN clustering',
|
|
122
|
+
'tags': ['dim-red-postprocessing-function'],
|
|
123
123
|
})
|
|
124
124
|
static async dbscanPostProcessingFunction(
|
|
125
125
|
col1: DG.Column,
|
|
@@ -148,9 +148,10 @@ export class PackageFunctions {
|
|
|
148
148
|
'meta': {
|
|
149
149
|
'supportedTypes': 'int,float,double,qnum',
|
|
150
150
|
'supportedDistanceFunctions': 'Difference',
|
|
151
|
+
'role': 'dim-red-preprocessing-function',
|
|
151
152
|
},
|
|
152
|
-
'tags': ['dim-red-preprocessing-function'],
|
|
153
153
|
'name': 'None (number)',
|
|
154
|
+
'tags': ['dim-red-preprocessing-function'],
|
|
154
155
|
'outputs': [{name: 'result', type: 'object'}],
|
|
155
156
|
})
|
|
156
157
|
static numberPreprocessingFunction(
|
|
@@ -166,6 +167,7 @@ export class PackageFunctions {
|
|
|
166
167
|
'meta': {
|
|
167
168
|
'supportedTypes': 'string',
|
|
168
169
|
'supportedDistanceFunctions': 'One-Hot,Levenshtein,Hamming',
|
|
170
|
+
'role': 'dim-red-preprocessing-function',
|
|
169
171
|
},
|
|
170
172
|
'tags': ['dim-red-preprocessing-function'],
|
|
171
173
|
'name': 'None (string)',
|
|
@@ -222,7 +224,7 @@ export class PackageFunctions {
|
|
|
222
224
|
}
|
|
223
225
|
|
|
224
226
|
|
|
225
|
-
@grok.decorators.editor()
|
|
227
|
+
@grok.decorators.editor({tags: ['editor']})
|
|
226
228
|
static GetMCLEditor(
|
|
227
229
|
call: DG.FuncCall): void {
|
|
228
230
|
try {
|
|
@@ -289,10 +291,8 @@ export class PackageFunctions {
|
|
|
289
291
|
|
|
290
292
|
@grok.decorators.func({
|
|
291
293
|
'outputs': [{'name': 'result', 'type': 'viewer'}],
|
|
292
|
-
'
|
|
293
|
-
|
|
294
|
-
],
|
|
295
|
-
'meta': {showInGallery: 'false'},
|
|
294
|
+
'meta': {showInGallery: 'false', role: 'viewer'},
|
|
295
|
+
'tags': ['viewer'],
|
|
296
296
|
'name': 'MCL',
|
|
297
297
|
'description': 'Markov clustering viewer',
|
|
298
298
|
})
|
|
@@ -984,11 +984,51 @@ export class PackageFunctions {
|
|
|
984
984
|
@grok.decorators.func({
|
|
985
985
|
'name': 'Pareto front',
|
|
986
986
|
'description': 'Pareto front viewer',
|
|
987
|
-
'tags': ['viewer'],
|
|
988
987
|
'outputs': [{'name': 'result', 'type': 'viewer'}],
|
|
989
|
-
'meta': {'icon': 'icons/pareto-front-viewer.svg'},
|
|
988
|
+
'meta': {'icon': 'icons/pareto-front-viewer.svg', 'role': 'viewer'},
|
|
989
|
+
'tags': ['viewer'],
|
|
990
990
|
})
|
|
991
991
|
static paretoFrontViewer(): DG.Viewer {
|
|
992
992
|
return new ParetoFrontViewer();
|
|
993
993
|
}
|
|
994
|
+
|
|
995
|
+
/** Launches pMPO model training for the table returned by `grok.shell.t`.
 * Shows a warning and does nothing when that table is null; silently does nothing
 * when `Pmpo.isTableValid` rejects the table. */
@grok.decorators.func({
  'name': 'trainPmpo',
  'description': 'Train probabilistic multi-parameter optimization (pMPO) model',
})
static trainPmpo(): void {
  const table = grok.shell.t;

  if (table === null) {
    grok.shell.warning('No dataframe is opened');
    return;
  }

  // Training starts only for tables accepted by the validity check
  if (Pmpo.isTableValid(table))
    new Pmpo(table).runTrainingApp();
}
|
|
1012
|
+
|
|
1013
|
+
/** Returns pMPO application items for the dataframe of the given table view.
 * @param view Table view whose dataframe is checked and passed to the pMPO model
 * @returns Result of `Pmpo.getPmpoAppItems()`, or null when `Pmpo.isTableValid` rejects the dataframe */
@grok.decorators.func({'name': 'getPmpoAppItems', 'outputs': [{name: 'result', type: 'object'}]})
static getPmpoAppItems(@grok.decorators.param({type: 'view'}) view: DG.TableView): any | null {
  const table = view.dataFrame;

  return Pmpo.isTableValid(table) ? new Pmpo(table, view).getPmpoAppItems() : null;
}
|
|
1023
|
+
|
|
1024
|
+
/** Produces a synthetic dataset for pMPO modeling and names it 'Synthetic'.
 * @param samples Sample count forwarded to `getSynteticPmpoData`
 * @returns The generated dataframe */
@grok.decorators.func({
  'name': 'generatePmpoDataset',
  'description': 'Generates syntethetic dataset oriented on the pMPO modeling',
  'outputs': [{name: 'Synthetic', type: 'dataframe'}],
})
static async generatePmpoDataset(@grok.decorators.param({'type': 'int'}) samples: number): Promise<DG.DataFrame> {
  const synthetic = await getSynteticPmpoData(samples);
  synthetic.name = 'Synthetic';

  return synthetic;
}
|
|
994
1034
|
}
|
|
@@ -2,6 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
import {NumericArray, OPT_TYPE} from './defs';
|
|
4
4
|
|
|
5
|
+
/** Computes the Pareto front mask for a given dataset and optimization sense
|
|
6
|
+
* @param rawData Array of numeric arrays representing the dataset (each array corresponds to a feature/dimension)
|
|
7
|
+
* @param sense Array of optimization types (OPT_TYPE.MIN or OPT_TYPE.MAX) for each dimension
|
|
8
|
+
* @param nPoints Number of data points in the dataset
|
|
9
|
+
* @param nullIndices Optional set of indices corresponding to missing values (these points will be marked as non-optimal)
|
|
10
|
+
* @returns Boolean array where true indicates that the point is on the Pareto front */
|
|
5
11
|
export function getParetoMask(rawData: NumericArray[], sense: OPT_TYPE[], nPoints: number,
|
|
6
12
|
nullIndices?: Set<number>): boolean[] {
|
|
7
13
|
if (nPoints === 0)
|
|
@@ -6,6 +6,7 @@ import {OPT_TYPE} from './defs';
|
|
|
6
6
|
|
|
7
7
|
export const PALETTE = [DG.Color.darkGreen, DG.Color.yellow, DG.Color.darkRed];
|
|
8
8
|
|
|
9
|
+
/** Return output color palette w.r.t. the specified type of optimization */
|
|
9
10
|
export function getOutputPalette(type: OPT_TYPE): number[] {
|
|
10
11
|
if (type === OPT_TYPE.MIN)
|
|
11
12
|
return [...PALETTE];
|
|
@@ -13,13 +14,14 @@ export function getOutputPalette(type: OPT_TYPE): number[] {
|
|
|
13
14
|
return [...PALETTE].reverse();
|
|
14
15
|
}
|
|
15
16
|
|
|
16
|
-
|
|
17
|
+
/** Return div with color scale description */
|
|
18
|
+
export function getColorScaleDiv(type: OPT_TYPE, useMinMax: boolean = true): HTMLElement {
|
|
17
19
|
const scale = ui.label('Color scale:');
|
|
18
20
|
scale.style.paddingRight = '7px';
|
|
19
21
|
const elems = [scale];
|
|
20
|
-
const minLbl = ui.label('min');
|
|
22
|
+
const minLbl = ui.label(useMinMax ? 'min' : 'worst');
|
|
21
23
|
const midLbl = ui.label('. . .');
|
|
22
|
-
const maxLbl = ui.label('max');
|
|
24
|
+
const maxLbl = ui.label(useMinMax ? 'max' : 'best');
|
|
23
25
|
const palette = getOutputPalette(type);
|
|
24
26
|
|
|
25
27
|
const colorElems = [minLbl, midLbl, maxLbl].map((el, idx) => {
|
|
@@ -36,4 +38,4 @@ export function getColorScaleDiv(type: OPT_TYPE): HTMLElement {
|
|
|
36
38
|
elems.push(...colorElems);
|
|
37
39
|
|
|
38
40
|
return ui.divH(elems);
|
|
39
|
-
}
|
|
41
|
+
} // getColorScaleDiv
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import * as grok from 'datagrok-api/grok';
|
|
2
|
+
import * as ui from 'datagrok-api/ui';
|
|
3
|
+
import * as DG from 'datagrok-api/dg';
|
|
4
|
+
|
|
5
|
+
import {DescriptorStatistics, SOURCE_PATH, SYNTHETIC_DRUG_NAME} from './pmpo-defs';
|
|
6
|
+
import {getDescriptorStatistics, getDesiredTables} from './stat-tools';
|
|
7
|
+
|
|
8
|
+
//@ts-ignore: no types
|
|
9
|
+
import * as jStat from 'jstat';
|
|
10
|
+
|
|
11
|
+
/** Builds a pMPO training/testing dataset from the CSV stored at SOURCE_PATH.
 * The CSV is read on every call and handed to a PmpoDataGenerator configured with
 * the 'Drug', 'CNS' and 'Smiles' column names.
 * @param samplesCount Number of samples to generate
 * @returns DataFrame produced by the generator */
export async function getSynteticPmpoData(samplesCount: number): Promise<DG.DataFrame> {
  const sourceTable = await grok.dapi.files.readCsv(SOURCE_PATH);

  return new PmpoDataGenerator(sourceTable, 'Drug', 'CNS', 'Smiles').getGenerated(samplesCount);
}
|
|
20
|
+
|
|
21
|
+
/** Class for generating synthetic data for pMPO model training and testing.
 *
 * The constructor computes, for every numerical column of the source dataframe,
 * descriptor statistics over the "desired" and "non-desired" tables returned by
 * `getDesiredTables`. Synthetic rows are then drawn from normal distributions
 * parameterized by those statistics. */
export class PmpoDataGenerator {
  private readonly sourceDf: DG.DataFrame;
  private readonly drugName: string;
  private readonly desirabilityColName: string;
  private readonly smilesColName: string;
  /** Share of source rows that fall into the desired table */
  private readonly desiredProbability: number;
  /** Per-descriptor statistics keyed by column name */
  private readonly descriptorStats: Map<string, DescriptorStatistics>;

  /** Creates a generator based on the given source dataframe
   * @param df Source dataframe
   * @param drugName Name of the column with drug names
   * @param desirabilityColName Name of the column with desirability labels
   * @param smilesColName Name of the column with SMILES strings
   * @throws Error if the desirability column is not present in the dataframe */
  constructor(df: DG.DataFrame, drugName: string, desirabilityColName: string, smilesColName: string) {
    // Fail early with a clear message instead of passing null further down
    const desirabilityCol = df.col(desirabilityColName);
    if (desirabilityCol === null)
      throw new Error(`Failed to create pMPO data generator: column "${desirabilityColName}" is missing.`);

    this.sourceDf = df;
    this.drugName = drugName;
    this.desirabilityColName = desirabilityColName;
    this.smilesColName = smilesColName;

    const descriptorNames = df.columns.toList().filter((col) => col.isNumerical).map((col) => col.name);
    const {desired, nonDesired} = getDesiredTables(df, desirabilityCol);

    // Compute descriptors' statistics
    this.descriptorStats = new Map<string, DescriptorStatistics>();
    for (const name of descriptorNames)
      this.descriptorStats.set(name, getDescriptorStatistics(desired.col(name)!, nonDesired.col(name)!));

    // Probability of desired class
    this.desiredProbability = desired.rowCount / df.rowCount;
  } // constructor

  /** Generates synthetic data for pMPO model training and testing
   * @param samplesCount Number of samples to generate
   * @returns DataFrame with generated data
   * @throws Error if the sample count is not greater than 1 (NaN included) */
  public getGenerated(samplesCount: number): DG.DataFrame {
    // The negated form also rejects NaN, which would otherwise reach `new Array(NaN)`
    if (!(samplesCount > 1))
      throw new Error('Failed to generate pMPO data: sample count must be greater than 1.');

    let result: DG.DataFrame;

    /* Use rows from the source dataframe if the requested sample count
       is less than or equal to the source dataframe row count */
    if (samplesCount <= this.sourceDf.rowCount) {
      const rowMask = DG.BitSet.create(this.sourceDf.rowCount);

      for (let i = 0; i < samplesCount; ++i)
        rowMask.set(i, true);

      result = this.sourceDf.clone(rowMask);
    } else {
      // Source rows (numeric columns converted to float) followed by the synthetic remainder
      const cloneDf = this.getClonedSourceDfWithFloatNumericCols();
      result = cloneDf.append(this.getSyntheticTable(samplesCount - this.sourceDf.rowCount));
    }

    // Check boolean columns and ensure non-zero stdev
    for (const col of result.columns) {
      if (col.type !== DG.COLUMN_TYPE.BOOL || col.stats.stdev !== 0)
        continue;

      // All values are the same: flip the first value
      col.set(0, !col.get(0));

      // Flip the second value too, unless that would make a 2-row column constant again
      if (result.rowCount > 2)
        col.set(1, !col.get(1));
    }

    return result;
  } // getGenerated

  /** Generates a synthetic data table
   * @param samplesCount Number of samples to generate
   * @returns DataFrame with synthetic data */
  private getSyntheticTable(samplesCount: number): DG.DataFrame {
    const desirabilityRaw = new Array<boolean>(samplesCount);

    // Each row is labeled desired with the probability observed in the source data
    for (let i = 0; i < samplesCount; ++i)
      desirabilityRaw[i] = (Math.random() < this.desiredProbability);

    // Explicit element type lets columns of any value type be appended below
    const cols: DG.Column[] = [
      this.getDrugColumn(samplesCount),
      this.getSmilesColumn(samplesCount),
      DG.Column.fromList(DG.COLUMN_TYPE.BOOL, this.desirabilityColName, desirabilityRaw),
    ];

    this.descriptorStats.forEach((stat, name) => {
      const arr = new Float32Array(samplesCount);

      // Sample from the normal distribution of the row's class
      for (let i = 0; i < samplesCount; ++i) {
        arr[i] = desirabilityRaw[i] ?
          jStat.normal.sample(stat.desAvg, stat.desStd) :
          jStat.normal.sample(stat.nonDesAvg, stat.nonDesStd);
      }

      cols.push(DG.Column.fromFloat32Array(name, arr));
    });

    return DG.DataFrame.fromColumns(cols);
  } // getSyntheticTable

  /** Generates a column with synthetic drug names
   * @param samplesCount Number of samples to generate
   * @returns Column with names of the form `${SYNTHETIC_DRUG_NAME} <1-based index>` */
  private getDrugColumn(samplesCount: number): DG.Column<string> {
    return DG.Column.fromList(
      DG.COLUMN_TYPE.STRING,
      this.drugName,
      Array.from({length: samplesCount}, (_, i) => `${SYNTHETIC_DRUG_NAME} ${i + 1}`));
  }

  /** Generates a column with synthetic SMILES strings
   * @param samplesCount Number of samples to generate
   * @returns Column where every value is the placeholder SMILES 'C' */
  private getSmilesColumn(samplesCount: number): DG.Column<string> {
    return DG.Column.fromList(
      DG.COLUMN_TYPE.STRING,
      this.smilesColName,
      Array.from({length: samplesCount}, () => 'C'));
  }

  /** Clones the source dataframe converting numerical columns to Float type
   * @returns Cloned dataframe carrying the source dataframe's name */
  private getClonedSourceDfWithFloatNumericCols(): DG.DataFrame {
    const cols: DG.Column[] = this.sourceDf.columns.toList().map((col) =>
      col.isNumerical ? col.clone().convertTo(DG.COLUMN_TYPE.FLOAT) : col.clone());

    const clone = DG.DataFrame.fromColumns(cols);
    clone.name = this.sourceDf.name;

    return clone;
  }
} // PmpoDataGenerator
|