@datagrok/eda 1.4.11 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +0 -1
- package/CHANGELOG.md +15 -0
- package/CLAUDE.md +185 -0
- package/README.md +8 -0
- package/css/pmpo.css +35 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/eslintrc.json +45 -0
- package/files/drugs-props-test.csv +126 -0
- package/files/drugs-props-train-scores.csv +664 -0
- package/files/drugs-props-train.csv +664 -0
- package/package.json +9 -3
- package/src/anova/anova-tools.ts +1 -1
- package/src/anova/anova-ui.ts +1 -1
- package/src/package-api.ts +18 -0
- package/src/package-test.ts +4 -1
- package/src/package.g.ts +25 -0
- package/src/package.ts +55 -15
- package/src/pareto-optimization/pareto-computations.ts +6 -0
- package/src/pareto-optimization/utils.ts +6 -4
- package/src/probabilistic-scoring/data-generator.ts +157 -0
- package/src/probabilistic-scoring/nelder-mead.ts +204 -0
- package/src/probabilistic-scoring/pmpo-defs.ts +218 -0
- package/src/probabilistic-scoring/pmpo-utils.ts +603 -0
- package/src/probabilistic-scoring/prob-scoring.ts +991 -0
- package/src/probabilistic-scoring/stat-tools.ts +303 -0
- package/src/softmax-classifier.ts +1 -1
- package/src/tests/anova-tests.ts +1 -1
- package/src/tests/classifiers-tests.ts +1 -1
- package/src/tests/dim-reduction-tests.ts +1 -1
- package/src/tests/linear-methods-tests.ts +1 -1
- package/src/tests/mis-vals-imputation-tests.ts +1 -1
- package/src/tests/pareto-tests.ts +253 -0
- package/src/tests/pmpo-tests.ts +157 -0
- package/test-console-output-1.log +175 -209
- package/test-record-1.mp4 +0 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
// Tests for Probabilistic MPO (pMPO)
|
|
2
|
+
// Reference scores are pre-computed and stored in the 'drugs-props-train-scores.csv' file.
|
|
3
|
+
// These scores were computed using the library: https://github.com/Merck/pmpo
|
|
4
|
+
|
|
5
|
+
import * as grok from 'datagrok-api/grok';
|
|
6
|
+
import * as ui from 'datagrok-api/ui';
|
|
7
|
+
import * as DG from 'datagrok-api/dg';
|
|
8
|
+
import {_package} from '../package-test';
|
|
9
|
+
|
|
10
|
+
import {category, expect, test} from '@datagrok-libraries/test/src/test';
|
|
11
|
+
|
|
12
|
+
import {Pmpo} from '../probabilistic-scoring/prob-scoring';
|
|
13
|
+
import {P_VAL_TRES_DEFAULT, Q_CUTOFF_DEFAULT, R2_DEFAULT, SCORES_PATH,
|
|
14
|
+
SOURCE_PATH} from '../probabilistic-scoring/pmpo-defs';
|
|
15
|
+
import {getSynteticPmpoData} from '../probabilistic-scoring/data-generator';
|
|
16
|
+
|
|
17
|
+
const TIMEOUT = 10000; // Passed as the `timeout` option of every test below (presumably milliseconds - TODO confirm)
|
|
18
|
+
const MAD_THRESH = 1E-6; // Correctness tests require the max absolute deviation of scores to be below this value
|
|
19
|
+
|
|
20
|
+
const DESIRABILITY_COL_NAME = 'CNS'; // Name of the column handed to Pmpo.fit as the desirability column
|
|
21
|
+
const DESCRIPTOR_NAMES = ['TPSA', 'TPSA_S', 'HBA', 'HBD', 'MW', 'nAtoms',
|
|
22
|
+
'cLogD_ACD_v15', 'mapKa', 'cLogP_Biobyte', 'mbpKa', 'cLogP_ACD_v15', 'ALogP98']; // Columns fetched by name and handed to Pmpo.fit as descriptors
|
|
23
|
+
const SCORES_NAME = 'Score'; // Last argument of Pmpo.predict (presumably the name of the produced scores column - TODO confirm)
|
|
24
|
+
const DRUG = 'Drug'; // Name of the column used to match rows of the source data with rows of the reference data
|
|
25
|
+
|
|
26
|
+
const SIGMOIDAL = 'Sigmoidal'; // Reference scores column; tests for it call Pmpo.predict with useSigmoid = true
|
|
27
|
+
const GAUSSIAN = 'Gaussian'; // Reference scores column; tests for it call Pmpo.predict with useSigmoid = false
|
|
28
|
+
const PMPO_MODES = [SIGMOIDAL, GAUSSIAN]; // One correctness test is registered per entry
|
|
29
|
+
|
|
30
|
+
const SAMPLES_K = 100; // Size of the synthetic dataset in thousands of rows (used in the performance test name)
|
|
31
|
+
const SAMPLES_COUNT = 1000 * SAMPLES_K; // Row count passed to getSynteticPmpoData in the performance test
|
|
32
|
+
|
|
33
|
+
/**
 * Computes the maximum absolute deviation between pMPO scores in two data frames.
 *
 * Rows are matched by drug name: for every item of `sourceDrugCol`, the row of the reference data
 * with the same name is located and the corresponding raw score values are compared.
 * If a name occurs several times in the reference data, its first occurrence is used.
 *
 * @param sourceDrugCol - column with drug names of the source data
 * @param sourceScores - column with scores of the source data (same row order as `sourceDrugCol`)
 * @param referenceDrugCol - column with drug names of the reference data
 * @param referenceScores - column with reference scores (same row order as `referenceDrugCol`)
 * @returns the largest absolute difference between matched scores; 0 if the source data is empty
 * @throws Error if a drug of the source data is missing in the reference data
 */
function getScoreMaxDeviation(sourceDrugCol: DG.Column, sourceScores: DG.Column,
  referenceDrugCol: DG.Column, referenceScores: DG.Column): number {
  const sourceDrugList = sourceDrugCol.toList();
  const referenceDrugList = referenceDrugCol.toList();

  const sourceScoresRaw = sourceScores.getRawData();
  const referenceScoresRaw = referenceScores.getRawData();

  // Index the reference rows by drug name once, instead of scanning the whole list for every source row.
  // Only the first occurrence of a name is stored, which is what a linear search would return.
  const referenceIndexByDrug = new Map<unknown, number>();
  referenceDrugList.forEach((name, idx) => {
    if (!referenceIndexByDrug.has(name))
      referenceIndexByDrug.set(name, idx);
  });

  let mad = 0;

  sourceDrugList.forEach((name, idx) => {
    const refIdx = referenceIndexByDrug.get(name);

    if (refIdx === undefined)
      throw new Error(`Failed to compare pMPO scores: the "${name}" drug is missing in the reference data.`);

    mad = Math.max(mad, Math.abs(sourceScoresRaw[idx] - referenceScoresRaw[refIdx]));
  });

  return mad;
} // getScoreMaxDeviation
|
|
55
|
+
|
|
56
|
+
category('Probabilistic MPO', () => {
|
|
57
|
+
// Correctness tests: compare pMPO scores with reference scores
|
|
58
|
+
PMPO_MODES.forEach((refScoreName) => {
  // Flag forwarded to Pmpo.predict: true only when comparing against the 'Sigmoidal' reference column
  const useSigmoid = (refScoreName === SIGMOIDAL);

  // Trains pMPO on the source data, scores the same data and compares the result with
  // the pre-computed reference column named `refScoreName`.
  test('Correctness: ' + refScoreName, async () => {
    let sourceDf: DG.DataFrame | null = null;
    let referenceDf: DG.DataFrame | null = null;
    let desirability: DG.Column | null = null;
    let descriptors: DG.Column[] = [];
    let sourceDrugCol: DG.Column | null = null;
    let referenceDrugCol: DG.Column | null = null;
    let referencePrediction: DG.Column | null = null;
    let mad: number | null = null;
    let failure: string | null = null; // message of an exception caught below, if any

    try {
      // Load data
      sourceDf = await grok.dapi.files.readCsv(SOURCE_PATH);
      referenceDf = await grok.dapi.files.readCsv(SCORES_PATH);

      // Resolve all required columns BEFORE training, so that a failure of fit/predict
      // is not later misreported as a missing column
      desirability = sourceDf.col(DESIRABILITY_COL_NAME);
      descriptors = sourceDf.columns.byNames(DESCRIPTOR_NAMES);
      sourceDrugCol = sourceDf.col(DRUG);
      referenceDrugCol = referenceDf.col(DRUG);
      referencePrediction = referenceDf.col(refScoreName);

      if (desirability == null || sourceDrugCol == null || referenceDrugCol == null || referencePrediction == null)
        throw new Error('Required columns are missing in the source or reference data');

      // Train pMPO model
      const trainRes = Pmpo.fit(
        sourceDf,
        DG.DataFrame.fromColumns(descriptors).columns,
        desirability,
        P_VAL_TRES_DEFAULT,
        R2_DEFAULT,
        Q_CUTOFF_DEFAULT,
      );

      // Apply pMPO
      const prediction = Pmpo.predict(sourceDf, trainRes.params, useSigmoid, SCORES_NAME);

      // Compare with reference scores
      mad = getScoreMaxDeviation(sourceDrugCol, prediction, referenceDrugCol, referencePrediction);
    } catch (error) {
      failure = (error instanceof Error) ? error.message : String(error);
      grok.shell.error(failure);
    }

    // Input checks first: they give the most specific diagnostics
    expect(sourceDf !== null, true, 'Failed to load the source data: ' + SOURCE_PATH);
    expect(referenceDf !== null, true, 'Failed to load the scores data: ' + SCORES_PATH);
    expect(desirability !== null, true, 'Inconsistent source data: no column ' + DESIRABILITY_COL_NAME);
    expect(descriptors.length, DESCRIPTOR_NAMES.length, 'Inconsistent source data: no enough of columns');
    expect(sourceDrugCol !== null, true, 'Inconsistent source data: no column ' + DRUG);
    expect(referenceDrugCol !== null, true, 'Inconsistent reference data: no column ' + DRUG);
    expect(referencePrediction !== null, true, 'Inconsistent reference data: no column ' + refScoreName);

    // Inputs are fine, so any caught exception comes from training, scoring or comparison
    expect(failure === null, true, 'pMPO computation failed: ' + failure);
    expect(mad !== null, true, 'Failed to compare pMPO scores with the reference data');
    expect(mad! < MAD_THRESH, true, `Max absolute deviation of pMPO scores exceeds the threshold (${MAD_THRESH})`);
  }, {timeout: TIMEOUT});
});
|
|
119
|
+
|
|
120
|
+
// Performance tests: measure time of pMPO training
|
|
121
|
+
// Trains and applies pMPO on a synthetic dataset of SAMPLES_COUNT rows; the run is bounded by TIMEOUT.
test('Performance: ' + SAMPLES_K + 'K drugs, ' + DESCRIPTOR_NAMES.length + ' descriptors', async () => {
  let sourceDf: DG.DataFrame | null = null;
  let desirability: DG.Column | null = null;
  let descriptors: DG.Column[] = [];
  let failure: string | null = null; // message of an exception caught below, if any

  try {
    // Generate synthetic data
    sourceDf = await getSynteticPmpoData(SAMPLES_COUNT);

    // Extract training items
    desirability = sourceDf.col(DESIRABILITY_COL_NAME);
    descriptors = sourceDf.columns.byNames(DESCRIPTOR_NAMES);

    if (desirability == null)
      throw new Error('No column ' + DESIRABILITY_COL_NAME + ' in the synthetic data');

    // Train pMPO model
    const trainRes = Pmpo.fit(
      sourceDf,
      DG.DataFrame.fromColumns(descriptors).columns,
      desirability,
      P_VAL_TRES_DEFAULT,
      R2_DEFAULT,
      Q_CUTOFF_DEFAULT,
    );

    // Apply pMPO
    Pmpo.predict(sourceDf, trainRes.params, true, SCORES_NAME);
  } catch (error) {
    failure = (error instanceof Error) ? error.message : String(error);
    grok.shell.error(failure);
  }

  // Input checks first: they give the most specific diagnostics
  expect(sourceDf !== null, true, 'Failed to generate the synthetic data');
  expect(desirability !== null, true, 'Inconsistent source data: no column ' + DESIRABILITY_COL_NAME);
  expect(descriptors.length, DESCRIPTOR_NAMES.length, 'Inconsistent source data: no enough of columns');

  // Without this check an exception thrown by fit/predict would be swallowed and the test would pass
  expect(failure === null, true, 'pMPO training or scoring failed: ' + failure);
}, {timeout: TIMEOUT});
|
|
157
|
+
});
|