@datagrok/bio 2.4.18 → 2.4.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +2 -8
- package/dist/864.js +1 -1
- package/dist/864.js.map +1 -1
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/files/data/sample_FASTA_PT_activity.csv +100 -0
- package/files/tests/to-atomic-level-dna-fasta-input.csv +11 -0
- package/files/tests/to-atomic-level-dna-output.csv +15299 -0
- package/files/tests/to-atomic-level-msa-output.csv +3594 -0
- package/files/tests/to-atomic-level-msa-separator-input.csv +12 -0
- package/files/tests/to-atomic-level-peptides-fasta-input.csv +65 -0
- package/files/tests/to-atomic-level-peptides-output.csv +34901 -0
- package/package.json +4 -4
- package/src/demo/bio01-similarity-diversity.ts +15 -9
- package/src/demo/bio01a-hierarchical-clustering-and-sequence-space.ts +18 -10
- package/src/demo/bio01b-hierarchical-clustering-and-activity-cliffs.ts +16 -14
- package/src/demo/bio03-atomic-level.ts +25 -3
- package/src/demo/bio05-helm-msa-sequence-space.ts +10 -8
- package/src/demo/utils.ts +0 -12
- package/src/package-test.ts +2 -0
- package/src/package.ts +18 -8
- package/src/tests/converters-test.ts +24 -24
- package/src/tests/mm-distance-tests.ts +138 -0
- package/src/tests/to-atomic-level-tests.ts +187 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import * as grok from 'datagrok-api/grok';
|
|
2
|
+
import * as ui from 'datagrok-api/ui';
|
|
3
|
+
import * as DG from 'datagrok-api/dg';
|
|
4
|
+
|
|
5
|
+
import {category, expect, test} from '@datagrok-libraries/utils/src/test';
|
|
6
|
+
import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
|
|
7
|
+
import {MmDistanceFunctionsNames, mmDistanceFunctions}
|
|
8
|
+
from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
9
|
+
|
|
10
|
+
category('Distance', async () => {
|
|
11
|
+
const scoringMatrix = [
|
|
12
|
+
[1, 0, 0, 0],
|
|
13
|
+
[0, 1, 0, 0],
|
|
14
|
+
[0, 0, 1, 0],
|
|
15
|
+
[0, 0, 0, 1],
|
|
16
|
+
];
|
|
17
|
+
|
|
18
|
+
const alphabetIndexes = {'F': 0, 'W': 1, 'R': 2, 'Y': 3};
|
|
19
|
+
|
|
20
|
+
const prot1 = 'FWRWY';
|
|
21
|
+
const prot2 = 'FWRWW';
|
|
22
|
+
|
|
23
|
+
const prot3 = 'FWY';
|
|
24
|
+
const prot4 = 'FWRWY';
|
|
25
|
+
|
|
26
|
+
const prot5 = 'FWY';
|
|
27
|
+
const prot6 = 'FWRRRRY';
|
|
28
|
+
|
|
29
|
+
const protTable = `seq
|
|
30
|
+
FWRWYVKHP
|
|
31
|
+
YNRWYVKHP
|
|
32
|
+
MWRSWYCKHP`;
|
|
33
|
+
|
|
34
|
+
const DNATable = `seq
|
|
35
|
+
ATAACG
|
|
36
|
+
ATCGA
|
|
37
|
+
ATCGA`;
|
|
38
|
+
|
|
39
|
+
const MSATable = `seq
|
|
40
|
+
ATAAC
|
|
41
|
+
ATCGA
|
|
42
|
+
ATCGA`;
|
|
43
|
+
// Each test builds a macromolecule column from one of the CSV fixtures above and checks
// which distance function name its UnitsHandler reports.
test('protein-distance-function', async () => {
  const handler = await _initMacromoleculeColumn(protTable);
  expect(handler.getDistanceFunctionName(), MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH);
});

test('DNA-distance-function', async () => {
  const handler = await _initMacromoleculeColumn(DNATable);
  expect(handler.getDistanceFunctionName(), MmDistanceFunctionsNames.LEVENSHTEIN);
});

test('MSA-distance-function', async () => {
  const handler = await _initMacromoleculeColumn(MSATable);
  expect(handler.getDistanceFunctionName(), MmDistanceFunctionsNames.HAMMING);
});
|
|
60
|
+
|
|
61
|
+
// prot1 and prot2 have equal length and differ only in the last residue.
test('levenstein-sub', async () => {
  const levenshtein = mmDistanceFunctions[MmDistanceFunctionsNames.LEVENSHTEIN]();
  _testDistance(prot1, prot2, levenshtein, 1);
});
// prot3 is two residues shorter than prot4.
test('levenstein-del', async () => {
  const levenshtein = mmDistanceFunctions[MmDistanceFunctionsNames.LEVENSHTEIN]();
  _testDistance(prot3, prot4, levenshtein, 2);
});

// NOTE(review): prot3 and prot4 differ in length; the expected value 3 pins whatever the
// Hamming implementation does for unequal lengths - confirm this is intended.
test('hamming', async () => {
  const hamming = mmDistanceFunctions[MmDistanceFunctionsNames.HAMMING]();
  _testDistance(prot3, prot4, hamming, 3);
});
|
|
74
|
+
|
|
75
|
+
// Note that here the result is actually an inverted value of alignment score, which is correlated with distance
|
|
76
|
+
// tests using default BLOSUM62 matrix are in agreement with the results of the online tool
|
|
77
|
+
/** Needleman-Wunsch options that combine the custom scoring matrix and alphabet declared above
 * with the given gap penalties. */
const customNwOptions = (gapOpen: number, gapExtend: number) =>
  ({scoringMatrix, alphabetIndexes, gapOpen, gapExtend});

// Default options (no arguments passed to the factory).
test('needleman-blosum62', async () => {
  const distance = mmDistanceFunctions[MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH]();
  _testDistance(prot1, prot2, distance, -35);
});

test('needleman-blosum62-del', async () => {
  const distance = mmDistanceFunctions[MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH]();
  _testDistance(prot3, prot4, distance, -14);
});

// Custom scoring matrix, gap open = gap extend = 1.
test('needleman-custom-sub', async () => {
  const distance = mmDistanceFunctions[MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH](
    customNwOptions(1, 1)
  );
  _testDistance(prot1, prot2, distance, -4);
});

test('needleman-custom-del', async () => {
  const distance = mmDistanceFunctions[MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH](
    customNwOptions(1, 1)
  );
  _testDistance(prot3, prot4, distance, -1);
});

// Same pair of sequences (prot5, prot6) under three different gap penalty settings.
test('needleman-custom-zero-extend', async () => {
  const distance = mmDistanceFunctions[MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH](
    customNwOptions(1, 0)
  );
  _testDistance(prot5, prot6, distance, -2);
});

test('needleman-custom-half-extend', async () => {
  const distance = mmDistanceFunctions[MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH](
    customNwOptions(2, 1)
  );
  _testDistance(prot5, prot6, distance, 2);
});

test('needleman-custom-same-extend', async () => {
  const distance = mmDistanceFunctions[MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH](
    customNwOptions(1, 1)
  );
  _testDistance(prot5, prot6, distance, 1);
});
|
|
121
|
+
});
|
|
122
|
+
|
|
123
|
+
/**
 * Builds a data frame from `csv`, assigns to its `seq` column the semantic type returned by
 * `Bio:detectMacromolecule` (when one is returned), runs semantic type detection on the frame
 * and wraps the column in a UnitsHandler.
 * @param {string} csv  CSV text; must contain a `seq` column
 * @return {Promise<UnitsHandler>} handler constructed over the `seq` column
 */
async function _initMacromoleculeColumn(csv: string): Promise<UnitsHandler> {
  const srcDf: DG.DataFrame = DG.DataFrame.fromCsv(csv);
  const seqCol = srcDf.col('seq');
  // Fail with a readable message instead of handing a missing column to the detector and UnitsHandler.
  if (seqCol == null)
    throw new Error('Test CSV must contain a \'seq\' column');
  const semType: string = await grok.functions
    .call('Bio:detectMacromolecule', {col: seqCol}) as unknown as string;
  // The detector may return an empty result; only override the semantic type when it reports one.
  if (semType)
    seqCol.semType = semType;
  await grok.data.detectSemanticTypes(srcDf);
  return new UnitsHandler(seqCol);
}
|
|
134
|
+
|
|
135
|
+
/**
 * Applies the distance function to a pair of sequences and asserts the result.
 * @param {string} seq1  first sequence
 * @param {string} seq2  second sequence
 * @param {function(string, string): number} df  distance function under test
 * @param {number} expected  value `df(seq1, seq2)` must equal
 */
function _testDistance(seq1: string, seq2: string, df: (a: string, b: string) => number, expected: number) {
  expect(df(seq1, seq2), expected);
}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/* Do not change these import lines to match external modules in webpack configuration */
|
|
2
|
+
import * as grok from 'datagrok-api/grok';
|
|
3
|
+
import * as ui from 'datagrok-api/ui';
|
|
4
|
+
import * as DG from 'datagrok-api/dg';
|
|
5
|
+
|
|
6
|
+
import {before, after, category, test, expectArray} from '@datagrok-libraries/utils/src/test';
|
|
7
|
+
|
|
8
|
+
import {getMonomerLibHelper, toAtomicLevel} from '../package';
|
|
9
|
+
import {_toAtomicLevel} from '@datagrok-libraries/bio/src/monomer-works/to-atomic-level';
|
|
10
|
+
import {IMonomerLib} from '@datagrok-libraries/bio/src/types/index';
|
|
11
|
+
import {IMonomerLibHelper} from '@datagrok-libraries/bio/src/monomer-works/monomer-utils';
|
|
12
|
+
import {LIB_STORAGE_NAME} from '../utils/monomer-lib';
|
|
13
|
+
|
|
14
|
+
const appPath = 'System:AppData/Bio';
|
|
15
|
+
const fileSource = new DG.FileSource(appPath);
|
|
16
|
+
|
|
17
|
+
const testNames: { [k: string]: string } = {
|
|
18
|
+
PT: 'peptides fasta',
|
|
19
|
+
DNA: 'dna fasta',
|
|
20
|
+
MSA: 'msa separator',
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
const inputPath: { [k: string]: string } = {
|
|
24
|
+
PT: 'tests/to-atomic-level-peptides-fasta-input.csv',
|
|
25
|
+
DNA: 'tests/to-atomic-level-dna-fasta-input.csv',
|
|
26
|
+
MSA: 'tests/to-atomic-level-msa-separator-input.csv',
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
const outputPath: { [k: string]: string } = {
|
|
30
|
+
PT: 'tests/to-atomic-level-peptides-output.csv',
|
|
31
|
+
DNA: 'tests/to-atomic-level-dna-output.csv',
|
|
32
|
+
MSA: 'tests/to-atomic-level-msa-output.csv',
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
const inputColName = 'sequence';
|
|
36
|
+
const outputColName = 'molfile(sequence)';
|
|
37
|
+
|
|
38
|
+
category('toAtomicLevel', async () => {
|
|
39
|
+
const sourceDf: { [key: string]: DG.DataFrame } = {};
|
|
40
|
+
const targetDf: { [key: string]: DG.DataFrame } = {};
|
|
41
|
+
|
|
42
|
+
let monomerLibHelper: IMonomerLibHelper;
|
|
43
|
+
/** Backup actual user's monomer libraries settings */
|
|
44
|
+
let userLibrariesSettings: any = null;
|
|
45
|
+
|
|
46
|
+
before(async () => {
  monomerLibHelper = await getMonomerLibHelper();
  // Keep the currently stored library settings so that after() can put them back.
  userLibrariesSettings = await grok.dapi.userDataStorage.get(LIB_STORAGE_NAME, true);
  // Store empty settings and reload, so the tests run against the default libraries.
  await grok.dapi.userDataStorage.put(LIB_STORAGE_NAME, {}, true);
  await monomerLibHelper.loadLibraries(true);

  // Load the input and expected-output frames for every file-based test.
  for (const name of Object.keys(testNames)) {
    sourceDf[name] = await fileSource.readCsv(inputPath[name]);
    await grok.data.detectSemanticTypes(sourceDf[name]);
    targetDf[name] = await fileSource.readCsv(outputPath[name]);
  }
});

after(async () => {
  // Write back the settings saved in before() and reload the libraries.
  await grok.dapi.userDataStorage.put(LIB_STORAGE_NAME, userLibrariesSettings, true);
  await monomerLibHelper.loadLibraries(true);
});
|
|
64
|
+
|
|
65
|
+
/**
 * Runs toAtomicLevel on the `sequence` column of `source`, then compares the
 * `molfile(sequence)` column read from `source` with the same-named column of `target`.
 * @param {DG.DataFrame} source  frame with the input sequences
 * @param {DG.DataFrame} target  frame with the expected molfiles
 */
async function getTestResult(source: DG.DataFrame, target: DG.DataFrame): Promise<void> {
  await toAtomicLevel(source, source.getCol(inputColName));
  const obtained = Array.from(source.getCol(outputColName).values());
  const expected = Array.from(target.getCol(outputColName).values());
  expectArray(obtained, expected);
}
|
|
74
|
+
|
|
75
|
+
// Register one file-based test per entry of testNames; each is registered with a skip reason.
for (const key of Object.keys(testNames)) {
  const title = testNames[key];
  test(title, async () => {
    await getTestResult(sourceDf[key], targetDf[key]);
  }, {skipReason: 'GROK-13100'});
}
|
|
80
|
+
|
|
81
|
+
enum csvTests {
|
|
82
|
+
fastaDna = 'fastaDna',
|
|
83
|
+
fastaRna = 'fastaRna',
|
|
84
|
+
fastaPt = 'fastaPt',
|
|
85
|
+
|
|
86
|
+
separatorDna = 'separatorDna',
|
|
87
|
+
separatorRna = 'separatorRna',
|
|
88
|
+
separatorPt = 'separatorPt',
|
|
89
|
+
separatorUn = 'separatorUn',
|
|
90
|
+
|
|
91
|
+
helm = 'helm',
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
const csvData: { [key in csvTests]: string } = {
|
|
95
|
+
[csvTests.fastaDna]: `seq
|
|
96
|
+
ACGTC
|
|
97
|
+
CAGTGT
|
|
98
|
+
TTCAAC
|
|
99
|
+
`,
|
|
100
|
+
[csvTests.fastaRna]: `seq
|
|
101
|
+
ACGUC
|
|
102
|
+
CAGUGU
|
|
103
|
+
UUCAAC
|
|
104
|
+
`,
|
|
105
|
+
[csvTests.fastaPt]: `seq
|
|
106
|
+
FWPHEY
|
|
107
|
+
YNRQWYV
|
|
108
|
+
MKPSEYV
|
|
109
|
+
`,
|
|
110
|
+
[csvTests.separatorDna]: `seq
|
|
111
|
+
A/C/G/T/C
|
|
112
|
+
C/A/G/T/G/T
|
|
113
|
+
T/T/C/A/A/C
|
|
114
|
+
`,
|
|
115
|
+
[csvTests.separatorRna]: `seq
|
|
116
|
+
A*C*G*U*C
|
|
117
|
+
C*A*G*U*G*U
|
|
118
|
+
U*U*C*A*A*C
|
|
119
|
+
`,
|
|
120
|
+
[csvTests.separatorPt]: `seq
|
|
121
|
+
F-W-P-H-E-Y
|
|
122
|
+
Y-N-R-Q-W-Y-V
|
|
123
|
+
M-K-P-S-E-Y-V
|
|
124
|
+
`,
|
|
125
|
+
[csvTests.separatorUn]: `seq
|
|
126
|
+
meI-hHis-Aca-N-T-dE-Thr_PO3H2-Aca-D
|
|
127
|
+
meI-hHis-Aca-Cys_SEt-T-dK-Thr_PO3H2-Aca-Tyr_PO3H2
|
|
128
|
+
Lys_Boc-hHis-Aca-Cys_SEt-T-dK-Thr_PO3H2-Aca-Tyr_PO3H2
|
|
129
|
+
`,
|
|
130
|
+
|
|
131
|
+
[csvTests.helm]: `seq
|
|
132
|
+
PEPTIDE1{meI.D-gGlu.Aca.N.T.dE.Thr_PO3H2.Aca.D}$$$
|
|
133
|
+
PEPTIDE1{meI.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2}$$$
|
|
134
|
+
PEPTIDE1{Lys_Boc.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2}$$$
|
|
135
|
+
`,
|
|
136
|
+
};
|
|
137
|
+
|
|
138
|
+
/** Builds a data frame from the inline CSV of the given test and runs semantic type detection on it.
 * @param {string} key  entry of csvData to load
 * @return {Promise<DG.DataFrame>}
 */
async function readCsv(key: csvTests): Promise<DG.DataFrame> {
  // Parse from CSV on every call so each test gets a fresh frame and reproducible detector behavior.
  const frame = DG.DataFrame.fromCsv(csvData[key]);
  await grok.data.detectSemanticTypes(frame);
  return frame;
}
|
|
149
|
+
|
|
150
|
+
// Smoke tests: each loads one inline CSV and runs the atomic-level conversion on its 'seq' column.
test('fastaDna', async () => {
  const df = await readCsv(csvTests.fastaDna);
  await _testToAtomicLevel(df, 'seq', monomerLibHelper);
});

test('fastaRna', async () => {
  const df = await readCsv(csvTests.fastaRna);
  await _testToAtomicLevel(df, 'seq', monomerLibHelper);
});

test('fastaPt', async () => {
  const df = await readCsv(csvTests.fastaPt);
  await _testToAtomicLevel(df, 'seq', monomerLibHelper);
});

test('separatorDna', async () => {
  const df = await readCsv(csvTests.separatorDna);
  await _testToAtomicLevel(df, 'seq', monomerLibHelper);
});
|
|
165
|
+
|
|
166
|
+
// Named after the data it exercises (csvTests.separatorRna); previously registered under the
// duplicate name 'separatorDna', which made the two tests indistinguishable in reports.
test('separatorRna', async () => {
  await _testToAtomicLevel(await readCsv(csvTests.separatorRna), 'seq', monomerLibHelper);
});
|
|
169
|
+
|
|
170
|
+
test('separatorPt', async () => {
  const df = await readCsv(csvTests.separatorPt);
  await _testToAtomicLevel(df, 'seq', monomerLibHelper);
});

test('separatorUn', async () => {
  const df = await readCsv(csvTests.separatorUn);
  await _testToAtomicLevel(df, 'seq', monomerLibHelper);
});

test('helm', async () => {
  const df = await readCsv(csvTests.helm);
  await _testToAtomicLevel(df, 'seq', monomerLibHelper);
});
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
/**
 * Smoke test helper: runs `_toAtomicLevel` on the sequence column of `df` using the library
 * returned by `monomerLibHelper.getBioLib()`. The result is not inspected; the calling test
 * passes as long as the conversion does not throw.
 * @param {DG.DataFrame} df  frame holding the sequence column
 * @param {string} seqColName  name of the sequence column
 * @param {IMonomerLibHelper} monomerLibHelper  source of the monomer library
 */
async function _testToAtomicLevel(df: DG.DataFrame, seqColName: string = 'seq', monomerLibHelper: IMonomerLibHelper) {
  const seqCol: DG.Column<string> = df.getCol(seqColName);
  const monomerLib: IMonomerLib = monomerLibHelper.getBioLib();
  await _toAtomicLevel(df, seqCol, monomerLib);
}
|