@datagrok/bio 2.4.18 → 2.4.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,138 @@
1
+ import * as grok from 'datagrok-api/grok';
2
+ import * as ui from 'datagrok-api/ui';
3
+ import * as DG from 'datagrok-api/dg';
4
+
5
+ import {category, expect, test} from '@datagrok-libraries/utils/src/test';
6
+ import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
7
+ import {MmDistanceFunctionsNames, mmDistanceFunctions}
8
+ from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
9
+
10
/**
 * Tests for macromolecule distance functions: which function name UnitsHandler reports
 * for a detected column, and the values returned by the Levenshtein, Hamming and
 * Needleman-Wunsch implementations for small hand-written sequences.
 */
category('Distance', async () => {
  // Custom scoring passed to Needleman-Wunsch together with alphabetIndexes
  // (presumably the matrix is indexed through alphabetIndexes -- confirm in the ml library).
  const scoringMatrix = [
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
  ];
  const alphabetIndexes = {'F': 0, 'W': 1, 'R': 2, 'Y': 3};

  // Equal-length pair that differs in the last symbol only.
  const subLeft = 'FWRWY';
  const subRight = 'FWRWW';

  // Pair of different lengths: the second has two extra symbols in the middle.
  const delShort = 'FWY';
  const delLong = 'FWRWY';

  // Pair with a four-symbol run missing from the first sequence, used for gap-penalty checks.
  const gapShort = 'FWY';
  const gapLong = 'FWRRRRY';

  const protTable = `seq
FWRWYVKHP
YNRWYVKHP
MWRSWYCKHP`;

  const DNATable = `seq
ATAACG
ATCGA
ATCGA`;

  const MSATable = `seq
ATAAC
ATCGA
ATCGA`;

  // Distance function name expected from UnitsHandler for each CSV fixture.
  const functionNameCases = [
    {title: 'protein-distance-function', csv: protTable, expected: MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH},
    {title: 'DNA-distance-function', csv: DNATable, expected: MmDistanceFunctionsNames.LEVENSHTEIN},
    {title: 'MSA-distance-function', csv: MSATable, expected: MmDistanceFunctionsNames.HAMMING},
  ];

  for (const fnCase of functionNameCases) {
    test(fnCase.title, async () => {
      const handler = await _initMacromoleculeColumn(fnCase.csv);
      expect(handler.getDistanceFunctionName(), fnCase.expected);
    });
  }

  /** Creates a Needleman-Wunsch distance function with the custom scoring above and the given gap penalties. */
  const customNeedleman = (gapOpen: number, gapExtend: number) =>
    mmDistanceFunctions[MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH](
      {scoringMatrix, alphabetIndexes, gapOpen, gapExtend}
    );

  test('levenstein-sub', async () => {
    _testDistance(subLeft, subRight, mmDistanceFunctions[MmDistanceFunctionsNames.LEVENSHTEIN](), 1);
  });
  test('levenstein-del', async () => {
    _testDistance(delShort, delLong, mmDistanceFunctions[MmDistanceFunctionsNames.LEVENSHTEIN](), 2);
  });

  test('hamming', async () => {
    _testDistance(delShort, delLong, mmDistanceFunctions[MmDistanceFunctionsNames.HAMMING](), 3);
  });

  // The Needleman-Wunsch result is an inverted alignment score, which is correlated with distance.
  // The values expected with the default BLOSUM62 matrix agree with the results of the online tool.
  test('needleman-blosum62', async () => {
    _testDistance(subLeft, subRight, mmDistanceFunctions[MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH](), -35);
  });

  test('needleman-blosum62-del', async () => {
    _testDistance(delShort, delLong, mmDistanceFunctions[MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH](), -14);
  });

  test('needleman-custom-sub', async () => {
    _testDistance(subLeft, subRight, customNeedleman(1, 1), -4);
  });

  test('needleman-custom-del', async () => {
    _testDistance(delShort, delLong, customNeedleman(1, 1), -1);
  });

  test('needleman-custom-zero-extend', async () => {
    _testDistance(gapShort, gapLong, customNeedleman(1, 0), -2);
  });

  test('needleman-custom-half-extend', async () => {
    _testDistance(gapShort, gapLong, customNeedleman(2, 1), 2);
  });

  test('needleman-custom-same-extend', async () => {
    _testDistance(gapShort, gapLong, customNeedleman(1, 1), 1);
  });
});
122
+
123
/**
 * Builds a data frame from CSV text and wraps its 'seq' column in a UnitsHandler.
 * The value returned by 'Bio:detectMacromolecule' is assigned as the column's semantic type
 * (when it is non-empty) before semantic type detection runs on the whole data frame.
 * @param {string} csv CSV text containing a 'seq' column
 * @return {Promise<UnitsHandler>} handler constructed over the 'seq' column
 */
async function _initMacromoleculeColumn(csv: string): Promise<UnitsHandler> {
  const table: DG.DataFrame = DG.DataFrame.fromCsv(csv);
  const sequences = table.col('seq')!;
  const detected: string =
    await grok.functions.call('Bio:detectMacromolecule', {col: sequences}) as unknown as string;
  if (detected) sequences.semType = detected;
  await grok.data.detectSemanticTypes(table);
  return new UnitsHandler(sequences);
}
134
+
135
/**
 * Applies a distance function to a pair of sequences and checks the result with expect().
 * @param {string} seq1 first sequence
 * @param {string} seq2 second sequence
 * @param {Function} df distance function under test
 * @param {number} expected value the distance function must return
 */
function _testDistance(seq1: string, seq2: string, df: (a: string, b: string) => number, expected: number) {
  expect(df(seq1, seq2), expected);
}
@@ -0,0 +1,187 @@
1
+ /* Do not change these import lines to match external modules in webpack configuration */
2
+ import * as grok from 'datagrok-api/grok';
3
+ import * as ui from 'datagrok-api/ui';
4
+ import * as DG from 'datagrok-api/dg';
5
+
6
+ import {before, after, category, test, expectArray} from '@datagrok-libraries/utils/src/test';
7
+
8
+ import {getMonomerLibHelper, toAtomicLevel} from '../package';
9
+ import {_toAtomicLevel} from '@datagrok-libraries/bio/src/monomer-works/to-atomic-level';
10
+ import {IMonomerLib} from '@datagrok-libraries/bio/src/types/index';
11
+ import {IMonomerLibHelper} from '@datagrok-libraries/bio/src/monomer-works/monomer-utils';
12
+ import {LIB_STORAGE_NAME} from '../utils/monomer-lib';
13
+
14
// Root folder that the fixture paths below are resolved against.
const appPath = 'System:AppData/Bio';
const fileSource = new DG.FileSource(appPath);

/** Test titles, keyed by fixture id. */
const testNames: Record<string, string> = {
  PT: 'peptides fasta',
  DNA: 'dna fasta',
  MSA: 'msa separator',
};

/** Input CSV paths, keyed by fixture id. */
const inputPath: Record<string, string> = {
  PT: 'tests/to-atomic-level-peptides-fasta-input.csv',
  DNA: 'tests/to-atomic-level-dna-fasta-input.csv',
  MSA: 'tests/to-atomic-level-msa-separator-input.csv',
};

/** Expected-output CSV paths, keyed by fixture id. */
const outputPath: Record<string, string> = {
  PT: 'tests/to-atomic-level-peptides-output.csv',
  DNA: 'tests/to-atomic-level-dna-output.csv',
  MSA: 'tests/to-atomic-level-msa-output.csv',
};

// Column read from the input data frames, and column compared between obtained and expected data frames.
const inputColName = 'sequence';
const outputColName = 'molfile(sequence)';
37
+
38
/**
 * Tests for conversion of macromolecule sequences to atomic level.
 * Two groups of tests are registered:
 *  - file-based comparisons against expected output CSVs (currently skipped, see skipReason);
 *  - checks on inline CSV data that call _toAtomicLevel through _testToAtomicLevel.
 */
category('toAtomicLevel', async () => {
  const sourceDf: { [key: string]: DG.DataFrame } = {};
  const targetDf: { [key: string]: DG.DataFrame } = {};

  let monomerLibHelper: IMonomerLibHelper;
  /** Backup actual user's monomer libraries settings */
  let userLibrariesSettings: any = null;

  before(async () => {
    monomerLibHelper = await getMonomerLibHelper();
    userLibrariesSettings = await grok.dapi.userDataStorage.get(LIB_STORAGE_NAME, true);
    // Clear settings to test default
    await grok.dapi.userDataStorage.put(LIB_STORAGE_NAME, {}, true);
    await monomerLibHelper.loadLibraries(true);

    // Load input and expected-output data frames for the file-based tests.
    for (const key in testNames) {
      sourceDf[key] = await fileSource.readCsv(inputPath[key]);
      await grok.data.detectSemanticTypes(sourceDf[key]);
      targetDf[key] = await fileSource.readCsv(outputPath[key]);
    }
  });

  after(async () => {
    // Put back the settings saved in before() and reload libraries accordingly.
    await grok.dapi.userDataStorage.put(LIB_STORAGE_NAME, userLibrariesSettings, true);
    await monomerLibHelper.loadLibraries(true);
  });

  /** Runs toAtomicLevel on the source data frame and compares the produced column with the target one
   * @param {DG.DataFrame} source data frame with the input sequence column
   * @param {DG.DataFrame} target data frame with the expected output column
   * @return {Promise<void>}
   */
  async function getTestResult(source: DG.DataFrame, target: DG.DataFrame): Promise<void> {
    const inputCol = source.getCol(inputColName);
    await toAtomicLevel(source, inputCol);
    const obtainedCol = source.getCol(outputColName);
    const expectedCol = target.getCol(outputColName);
    const obtainedArray = [...obtainedCol.values()];
    const expectedArray = [...expectedCol.values()];
    expectArray(obtainedArray, expectedArray);
  }

  for (const key in testNames) {
    test(`${testNames[key]}`, async () => {
      await getTestResult(sourceDf[key], targetDf[key]);
    }, {skipReason: 'GROK-13100'});
  }

  enum csvTests {
    fastaDna = 'fastaDna',
    fastaRna = 'fastaRna',
    fastaPt = 'fastaPt',

    separatorDna = 'separatorDna',
    separatorRna = 'separatorRna',
    separatorPt = 'separatorPt',
    separatorUn = 'separatorUn',

    helm = 'helm',
  }

  const csvData: { [key in csvTests]: string } = {
    [csvTests.fastaDna]: `seq
ACGTC
CAGTGT
TTCAAC
`,
    [csvTests.fastaRna]: `seq
ACGUC
CAGUGU
UUCAAC
`,
    [csvTests.fastaPt]: `seq
FWPHEY
YNRQWYV
MKPSEYV
`,
    [csvTests.separatorDna]: `seq
A/C/G/T/C
C/A/G/T/G/T
T/T/C/A/A/C
`,
    [csvTests.separatorRna]: `seq
A*C*G*U*C
C*A*G*U*G*U
U*U*C*A*A*C
`,
    [csvTests.separatorPt]: `seq
F-W-P-H-E-Y
Y-N-R-Q-W-Y-V
M-K-P-S-E-Y-V
`,
    [csvTests.separatorUn]: `seq
meI-hHis-Aca-N-T-dE-Thr_PO3H2-Aca-D
meI-hHis-Aca-Cys_SEt-T-dK-Thr_PO3H2-Aca-Tyr_PO3H2
Lys_Boc-hHis-Aca-Cys_SEt-T-dK-Thr_PO3H2-Aca-Tyr_PO3H2
`,

    [csvTests.helm]: `seq
PEPTIDE1{meI.D-gGlu.Aca.N.T.dE.Thr_PO3H2.Aca.D}$$$
PEPTIDE1{meI.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2}$$$
PEPTIDE1{Lys_Boc.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2}$$$
`,
  };

  /** Also detects semantic types
   * @param {string} key
   * @return {Promise<DG.DataFrame>}
   */
  async function readCsv(key: csvTests): Promise<DG.DataFrame> {
    // Always recreate test data frame from CSV for reproducible detector behavior in tests.
    const csv: string = csvData[key];
    const df: DG.DataFrame = DG.DataFrame.fromCsv(csv);
    await grok.data.detectSemanticTypes(df);
    return df;
  }

  test('fastaDna', async () => {
    await _testToAtomicLevel(await readCsv(csvTests.fastaDna), 'seq', monomerLibHelper);
  });

  test('fastaRna', async () => {
    await _testToAtomicLevel(await readCsv(csvTests.fastaRna), 'seq', monomerLibHelper);
  });

  test('fastaPt', async () => {
    await _testToAtomicLevel(await readCsv(csvTests.fastaPt), 'seq', monomerLibHelper);
  });

  test('separatorDna', async () => {
    await _testToAtomicLevel(await readCsv(csvTests.separatorDna), 'seq', monomerLibHelper);
  });

  // Named after the data it exercises; it was previously registered under a second 'separatorDna' name.
  test('separatorRna', async () => {
    await _testToAtomicLevel(await readCsv(csvTests.separatorRna), 'seq', monomerLibHelper);
  });

  test('separatorPt', async () => {
    await _testToAtomicLevel(await readCsv(csvTests.separatorPt), 'seq', monomerLibHelper);
  });

  test('separatorUn', async () => {
    await _testToAtomicLevel(await readCsv(csvTests.separatorUn), 'seq', monomerLibHelper);
  });

  test('helm', async () => {
    await _testToAtomicLevel(await readCsv(csvTests.helm), 'seq', monomerLibHelper);
  });
});
182
+
183
/**
 * Smoke check for _toAtomicLevel: runs the conversion on the given sequence column with the
 * library obtained from monomerLibHelper.getBioLib(). The returned value is not inspected,
 * so the check only fails when one of the calls throws or rejects.
 * @param {DG.DataFrame} df data frame containing the sequence column
 * @param {string} seqColName name of the sequence column
 * @param {IMonomerLibHelper} monomerLibHelper helper providing the monomer library
 */
async function _testToAtomicLevel(df: DG.DataFrame, seqColName: string = 'seq', monomerLibHelper: IMonomerLibHelper) {
  const sequences: DG.Column<string> = df.getCol(seqColName);
  const lib: IMonomerLib = monomerLibHelper.getBioLib();
  await _toAtomicLevel(df, sequences, lib);
}