@datagrok/bio 1.3.1 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/detectors.js +38 -15
- package/dist/package-test.js +3811 -590
- package/dist/package.js +2172 -54
- package/dist/vendors-node_modules_datagrok-libraries_ml_src_workers_dimensionality-reducer_js.js +8975 -0
- package/files/sample_FASTA.csv +66 -0
- package/files/sample_FASTA_with_activities.csv +66 -0
- package/files/samples/HELMCoreLibrary.json +18218 -0
- package/files/samples/peptides_complex_msa.csv +10275 -0
- package/files/samples/peptides_simple_msa.csv +648 -0
- package/files/samples/sample_FASTA.csv +66 -0
- package/files/samples/sample_FASTA.fasta +196 -0
- package/files/samples/sample_HELM.csv +541 -0
- package/files/samples/sample_MSA.csv +541 -0
- package/package.json +9 -5
- package/setup.cmd +10 -0
- package/src/const.ts +5 -0
- package/src/package-test.ts +3 -1
- package/src/package.ts +116 -0
- package/src/tests/WebLogo-test.ts +6 -3
- package/src/tests/detectors-test.ts +93 -46
- package/src/tests/msa-tests.ts +34 -0
- package/src/tests/sequence-space-test.ts +24 -0
- package/src/tests/utils.ts +39 -0
- package/src/utils/constants.ts +62 -0
- package/src/utils/convert.ts +24 -0
- package/src/utils/multiple-sequence-alignment.ts +76 -0
- package/src/utils/sequence-space.ts +43 -0
- package/test-Bio-69a4761f6044-51a4ab35.html +0 -245
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "@datagrok/bio",
|
|
3
3
|
"beta": false,
|
|
4
4
|
"friendlyName": "Bio",
|
|
5
|
-
"version": "1.3.1",
|
|
5
|
+
"version": "1.4.2",
|
|
6
6
|
"description": "Bio is a [package](https://datagrok.ai/help/develop/develop#packages) for the [Datagrok](https://datagrok.ai) platform",
|
|
7
7
|
"repository": {
|
|
8
8
|
"type": "git",
|
|
@@ -10,10 +10,12 @@
|
|
|
10
10
|
"directory": "packages/Bio"
|
|
11
11
|
},
|
|
12
12
|
"dependencies": {
|
|
13
|
-
"@
|
|
13
|
+
"@biowasm/aioli": ">=2.4.0",
|
|
14
|
+
"@datagrok-libraries/bio": "^2.2.0",
|
|
14
15
|
"@datagrok-libraries/utils": "^0.4.2",
|
|
16
|
+
"@datagrok-libraries/ml": "^2.0.4",
|
|
15
17
|
"cash-dom": "latest",
|
|
16
|
-
"datagrok-api": "^1.4.
|
|
18
|
+
"datagrok-api": "^1.4.11",
|
|
17
19
|
"dayjs": "latest",
|
|
18
20
|
"ts-loader": "^9.2.5",
|
|
19
21
|
"typescript": "^4.4.2"
|
|
@@ -34,7 +36,8 @@
|
|
|
34
36
|
"scripts": {
|
|
35
37
|
"link-api": "npm link datagrok-api",
|
|
36
38
|
"link-bio": "npm link @datagrok-libraries/bio",
|
|
37
|
-
"link-
|
|
39
|
+
"link-ml": "npm link @datagrok-libraries/ml",
|
|
40
|
+
"link-all": "npm link datagrok-api @datagrok-libraries/utils @datagrok-libraries/bio @datagrok-libraries/ml",
|
|
38
41
|
"debug-sequences1": "grok publish --rebuild",
|
|
39
42
|
"release-sequences1": "grok publish --rebuild --release",
|
|
40
43
|
"build-sequences1": "webpack",
|
|
@@ -47,7 +50,8 @@
|
|
|
47
50
|
"lint": "eslint \"./src/**/*.ts\"",
|
|
48
51
|
"lint-fix": "eslint \"./src/**/*.ts\" --fix",
|
|
49
52
|
"test": "jest",
|
|
50
|
-
"test-local": "set HOST=localhost && jest"
|
|
53
|
+
"test-local": "set HOST=localhost && jest",
|
|
54
|
+
"build-bio-local": "npm --prefix ./../../js-api run build && npm --prefix ./../../libraries/utils run build && npm --prefix ./../../libraries/ml run build && npm run build && npm --prefix ./../../libraries/bio run build && npm run build"
|
|
51
55
|
},
|
|
52
56
|
"canEdit": [
|
|
53
57
|
"Developers"
|
package/setup.cmd
ADDED
package/src/const.ts
ADDED
package/src/package-test.ts
CHANGED
|
@@ -5,8 +5,10 @@ import {runTests, tests} from '@datagrok-libraries/utils/src/test';
|
|
|
5
5
|
import './tests/WebLogo-test';
|
|
6
6
|
import './tests/Palettes-test';
|
|
7
7
|
import './tests/detectors-test';
|
|
8
|
+
import './tests/msa-tests';
|
|
9
|
+
import './tests/sequence-space-test';
|
|
8
10
|
|
|
9
|
-
export const
|
|
11
|
+
export const _package = new DG.Package();
|
|
10
12
|
export {tests};
|
|
11
13
|
|
|
12
14
|
/** For the 'test' function argument names are fixed as 'category' and 'test' because of way it is called. */
|
package/src/package.ts
CHANGED
|
@@ -8,6 +8,11 @@ export const _package = new DG.Package();
|
|
|
8
8
|
|
|
9
9
|
import {WebLogo} from '@datagrok-libraries/bio/src/viewers/web-logo';
|
|
10
10
|
import {VdRegionsViewer} from './viewers/vd-regions-viewer';
|
|
11
|
+
import {runKalign, testMSAEnoughMemory} from './utils/multiple-sequence-alignment';
|
|
12
|
+
import {convert} from './utils/convert';
|
|
13
|
+
import {TableView} from 'datagrok-api/dg';
|
|
14
|
+
import { getEmbeddingColsNames, sequenceSpace } from './utils/sequence-space';
|
|
15
|
+
import { AvailableMetrics } from '@datagrok-libraries/ml/src/typed-metrics';
|
|
11
16
|
|
|
12
17
|
//name: sequenceAlignment
|
|
13
18
|
//input: string alignType {choices: ['Local alignment', 'Global alignment']}
|
|
@@ -38,3 +43,114 @@ export function webLogoViewer() {
|
|
|
38
43
|
export function vdRegionViewer() {
|
|
39
44
|
return new VdRegionsViewer();
|
|
40
45
|
}
|
|
46
|
+
|
|
47
|
+
/**
 * Top-menu entry point "Bio | Activity Cliffs...".
 *
 * The body is currently empty: the function is registered through the annotations below
 * but performs no work and resolves immediately.
 * NOTE(review): the sequence column parameter is named `smiles` although its annotation
 * declares `semType: Macromolecule` — confirm the intended name before implementing.
 */
//top-menu: Bio | Activity Cliffs...
//name: Activity Cliffs
//description: detect activity cliffs
//input: dataframe df [Input data table]
//input: column smiles {type:categorical; semType: Macromolecule}
//input: column activities
//input: double similarity = 80 [Similarity cutoff]
//input: string methodName { choices:["UMAP", "t-SNE", "SPE"] }
export async function activityCliffs(df: DG.DataFrame, smiles: DG.Column, activities: DG.Column,
  similarity: number, methodName: string): Promise<void> {
}
|
|
58
|
+
|
|
59
|
+
/**
 * Top-menu entry point "Bio | Sequence Space...".
 *
 * Computes embedding columns for `macroMolecule` via `sequenceSpace`, appends them to `table`
 * and, when `plotEmbeddings` is true, adds a scatter plot of the first two embedding columns
 * to every open table view whose name equals the table name.
 *
 * @param table Table that receives the embedding columns.
 * @param macroMolecule Column with the sequences to embed.
 * @param methodName Dimensionality reduction method name, passed through to `sequenceSpace`.
 * @param similarityMetric Distance metric name, passed through to `sequenceSpace`.
 * @param plotEmbeddings Whether to add a scatter plot of the embeddings.
 */
//top-menu: Bio | Sequence Space...
//name: Sequence Space
//input: dataframe table
//input: column macroMolecule { semType: Macromolecule }
//input: string methodName { choices:["UMAP", "t-SNE", "SPE"] }
//input: string similarityMetric { choices:["Levenshtein", "Tanimoto"] }
//input: bool plotEmbeddings = true
export async function sequenceSpaceTopMenu(table: DG.DataFrame, macroMolecule: DG.Column, methodName: string,
  similarityMetric: string = 'Levenshtein', plotEmbeddings: boolean) : Promise<void> {
  const embedColsNames = getEmbeddingColsNames(table);
  const sequenceSpaceRes = await sequenceSpace(macroMolecule, methodName, similarityMetric, embedColsNames);
  for (const col of sequenceSpaceRes.coordinates)
    table.columns.add(col);

  if (!plotEmbeddings)
    return;

  for (const v of grok.shell.views) {
    // Only table views expose scatterPlot(); a non-table view that happens to share
    // the table's name must be skipped instead of being blindly cast.
    if (v instanceof DG.TableView && v.name === table.name)
      v.scatterPlot({x: embedColsNames[0], y: embedColsNames[1]});
  }
}
|
|
80
|
+
|
|
81
|
+
/**
 * Top-menu entry point "Bio | MSA...": runs `runKalign` on the given column and
 * appends the resulting column to `table`.
 *
 * NOTE(review): the annotation names the column input `sequence` while the parameter is `col` —
 * confirm the platform binds inputs by position.
 */
//top-menu: Bio | MSA...
//name: MSA
//input: dataframe table
//input: column sequence { semType: Macromolecule }
export async function multipleSequenceAlignmentAny(table: DG.DataFrame, col: DG.Column): Promise<void> {
  table.columns.add(await runKalign(col, false));
}
|
|
89
|
+
|
|
90
|
+
/**
 * Top-menu entry point "Bio | Composition Analysis".
 *
 * Looks up the first column of the current table whose semantic type is 'Macromolecule',
 * builds a 'WebLogo' viewer for its data frame and docks it at the bottom of the first
 * table view showing a data frame with the same name. Reports an error through
 * `grok.shell.error` when no such column is available.
 *
 * NOTE(review): the annotation declares `//output: viewer result` but nothing is returned — confirm.
 */
//name: Composition Analysis
//top-menu: Bio | Composition Analysis
//output: viewer result
export async function compositionAnalysis(): Promise<void> {
  // `grok.shell.t` is presumably empty when no table is open; optional chaining turns that case
  // into the same user-facing error instead of a TypeError.
  const col = grok.shell.t?.columns.bySemType('Macromolecule') ?? null; // i.e. DG.SEMTYPE.MACROMOLECULE
  if (col === null) {
    grok.shell.error('Current table does not contain sequences');
    return;
  }

  const wl = await col.dataFrame.plot.fromType('WebLogo', {});

  for (const v of grok.shell.views) {
    // Dock into the first table view that shows the column's data frame, then stop.
    if (v instanceof TableView && v.dataFrame.name === col.dataFrame.name) {
      v.dockManager.dock(wl.root, 'down');
      break;
    }
  }
}
|
|
109
|
+
|
|
110
|
+
/**
 * Helper for importFasta: extracts the text between two offsets of the file
 * and removes every whitespace character (line breaks included) from it.
 *
 * @param fileContent Full text of the FASTA file.
 * @param startOfSequence Offset of the first character of the sequence body.
 * @param endOfSequence Offset just past the last character of the sequence body.
 * @returns The sequence as a single string without whitespace.
 */
function parseMacromolecule(
  fileContent: string,
  startOfSequence: number,
  endOfSequence: number
): string {
  return fileContent.slice(startOfSequence, endOfSequence).replace(/\s/g, '');
}

/**
 * File handler for FASTA files.
 *
 * Every line starting with '>' begins a new record: the rest of that line becomes the
 * description and all following lines up to the next '>' line (or the end of the file)
 * are joined into the sequence. Content without any '>' line yields two empty columns.
 *
 * @param fileContent Full text of the FASTA file.
 * @returns A single-element list with a data frame holding the 'description' and
 *   'sequence' columns; the latter is tagged with the 'Macromolecule' semantic type.
 */
//name: importFasta
//description: Opens FASTA file
//tags: file-handler
//meta.ext: fasta, fna, ffn, faa, frn, fa
//input: string content
//output: list tables
export function importFasta(fileContent: string): DG.DataFrame [] {
  const headerRegex = /^>(.*)$/gm; // match the line starting with >
  const descriptionsArray: string[] = [];
  const sequencesArray: string[] = [];
  let startOfSequence = -1; // -1 until the first header line has been seen
  let match: RegExpExecArray | null;
  while ((match = headerRegex.exec(fileContent)) !== null) {
    descriptionsArray.push(match[1]);
    // The body of the previous record ends where the current header line begins.
    if (startOfSequence !== -1)
      sequencesArray.push(parseMacromolecule(fileContent, startOfSequence, match.index));
    startOfSequence = headerRegex.lastIndex + 1; // skip the line break after the header
  }
  // The last record runs to the very end of the file. Using the real length (instead of -1)
  // keeps the final residue when the file does not end with a line break.
  if (startOfSequence !== -1)
    sequencesArray.push(parseMacromolecule(fileContent, startOfSequence, fileContent.length));

  const descriptionsArrayCol = DG.Column.fromStrings('description', descriptionsArray);
  const sequenceCol = DG.Column.fromStrings('sequence', sequencesArray);
  sequenceCol.semType = 'Macromolecule';
  return [DG.DataFrame.fromColumns([
    descriptionsArrayCol,
    sequenceCol,
  ])];
}
|
|
149
|
+
|
|
150
|
+
/**
 * Panel entry point for Macromolecule columns: delegates to `convert`
 * for the given column.
 */
//name: Bio | Convert
//friendly-name: Bio | Convert
//tags: panel, bio
//input: column col {semType: Macromolecule}
export function convertPanel(col: DG.Column): void {
  convert(col);
}
|
|
@@ -52,6 +52,7 @@ XZJ{}2
|
|
|
52
52
|
`;
|
|
53
53
|
|
|
54
54
|
// anonymous functions specified in test() registering must return Promise<any>
|
|
55
|
+
test('testGetStats', async () => { await _testGetStats(csvDfN1); });
|
|
55
56
|
test('testGetAlphabetSimilarity', async () => { await _testGetAlphabetSimilarity(); });
|
|
56
57
|
|
|
57
58
|
test('testPickupPaletteN1', async () => { await _testPickupPaletteN1(csvDfN1); });
|
|
@@ -61,16 +62,18 @@ XZJ{}2
|
|
|
61
62
|
});
|
|
62
63
|
|
|
63
64
|
|
|
64
|
-
export async function
|
|
65
|
+
/**
 * Checks `WebLogo.getStats` on the 'seq' column of the given CSV: the monomer
 * frequencies must match the expected counts and all sequences must be reported as same-length.
 */
export async function _testGetStats(csvDfN1: string) {
  const table = DG.DataFrame.fromCsv(csvDfN1);
  const sequences = table.col('seq')!;
  const stats = WebLogo.getStats(sequences, 5, WebLogo.splitterAsFasta);

  const expectedFreq = {
    'A': 4,
    'C': 5,
    'G': 3,
    'T': 6
  };
  expectObject(stats.freq, expectedFreq);
  expect(stats.sameLength, true);
}
|
|
75
78
|
|
|
76
79
|
export async function _testGetAlphabetSimilarity() {
|
|
@@ -4,24 +4,33 @@ import * as grok from 'datagrok-api/grok';
|
|
|
4
4
|
import * as ui from 'datagrok-api/ui';
|
|
5
5
|
import * as DG from 'datagrok-api/dg';
|
|
6
6
|
|
|
7
|
+
import {mmSemType} from '../const';
|
|
8
|
+
import {importFasta} from '../package';
|
|
9
|
+
|
|
7
10
|
category('detectors', () => {
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
11
|
+
const csvDf1: string = `col1
|
|
12
|
+
1
|
|
13
|
+
2
|
|
14
|
+
3`;
|
|
15
|
+
|
|
16
|
+
const csvDf2: string = `col1
|
|
17
|
+
4
|
|
18
|
+
5
|
|
19
|
+
6
|
|
20
|
+
7`;
|
|
21
|
+
|
|
22
|
+
const csvDf3: string = `col1
|
|
23
|
+
8
|
|
24
|
+
9
|
|
25
|
+
10
|
|
26
|
+
11
|
|
27
|
+
12`;
|
|
28
|
+
|
|
29
|
+
const csvDfSmiles: string = `col1
|
|
30
|
+
CCCCN1C(=O)CN=C(c2cc(F)ccc12)C3CCCCC3
|
|
31
|
+
C1CCCCC1
|
|
32
|
+
CCCCCC
|
|
33
|
+
`;
|
|
25
34
|
|
|
26
35
|
const csvDfN1: string = `seq
|
|
27
36
|
ACGTC
|
|
@@ -78,36 +87,41 @@ YNR-WYV-KHP
|
|
|
78
87
|
MWRSWY-CKHP
|
|
79
88
|
`;
|
|
80
89
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
90
|
+
test('testDetectorsNegative1', async () => { await _testDetectorsNegative(csvDf1); });
|
|
91
|
+
test('testDetectorsNegative2', async () => { await _testDetectorsNegative(csvDf2); });
|
|
92
|
+
test('testDetectorsNegative3', async () => { await _testDetectorsNegative(csvDf3); });
|
|
93
|
+
test('testDetectorsNegativeSmiles', async () => { await _testDetectorsNegative(csvDfSmiles); });
|
|
84
94
|
|
|
85
95
|
test('testDetectorsN1', async () => { await _testDetectorsN1(csvDfN1); });
|
|
86
96
|
test('testDetectorsAA1', async () => { await _testDetectorsAA1(csvDfAA1); });
|
|
87
97
|
test('testDetectorsMsaN1', async () => { await _testDetectorsMsaN1(csvDfMsaN1); });
|
|
88
98
|
test('testDetectorsMsaAA1', async () => { await _testDetectorsMsaAA1(csvDfMsaAA1); });
|
|
89
99
|
|
|
90
|
-
test('
|
|
91
|
-
test('
|
|
100
|
+
test('testDetectorsSepNt', async () => { await _testDetectorsSepNt(csvDfSepNt, '*'); });
|
|
101
|
+
test('testDetectorsSepPt', async () => { await _testDetectorsSepPt(csvDfSepPt, '-'); });
|
|
102
|
+
test('testDetectorsSepUn1', async () => { await _testDetectorsSepUn(csvDfSepUn1, '-'); });
|
|
103
|
+
test('testDetectorsSepUn2', async () => { await _testDetectorsSepUn(csvDfSepUn2, '/'); });
|
|
92
104
|
|
|
93
105
|
test('testDetectorsSepMsaN1', async () => { await _testDetectorsSepMsaN1(csvDfSepMsaN1); });
|
|
106
|
+
|
|
107
|
+
test('testDetectorsSamplesFastaCsvPt', async () => { await _testDetectorsSamplesFastaCsvPt(); });
|
|
108
|
+
test('testDetectorsSamplesFastaFastaPt', async () => { await _testDetectorsSamplesFastaFastaPt(); });
|
|
94
109
|
});
|
|
95
110
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
// }
|
|
111
|
+
/** Asserts that semantic type detection does NOT mark column 'col1' of the given CSV as a macromolecule. */
export async function _testDetectorsNegative(csvDf: string) {
  const table = DG.DataFrame.fromCsv(csvDf);
  await grok.data.detectSemanticTypes(table);

  const detectedAsMacromolecule = table.col('col1')!.semType == mmSemType;
  expect(detectedAsMacromolecule, false);
}
|
|
104
118
|
|
|
105
119
|
/** Asserts that column 'seq' of the given CSV is detected as a macromolecule with units 'fasta:SEQ:NT'. */
export async function _testDetectorsN1(csvDfN1: string) {
  const table = DG.DataFrame.fromCsv(csvDfN1);
  await grok.data.detectSemanticTypes(table);

  const seqCol = table.col('seq')!;
  expect(seqCol.semType, mmSemType);
  expect(seqCol.getTag(DG.TAGS.UNITS), 'fasta:SEQ:NT');
}
|
|
113
127
|
|
|
@@ -116,7 +130,7 @@ export async function _testDetectorsAA1(csvDfAA1: string) {
|
|
|
116
130
|
await grok.data.detectSemanticTypes(dfAA1);
|
|
117
131
|
|
|
118
132
|
const col: DG.Column = dfAA1.col('seq')!;
|
|
119
|
-
expect(col.semType,
|
|
133
|
+
expect(col.semType, mmSemType);
|
|
120
134
|
expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ:PT');
|
|
121
135
|
}
|
|
122
136
|
|
|
@@ -125,7 +139,7 @@ export async function _testDetectorsMsaN1(csvDfMsaN1: string) {
|
|
|
125
139
|
await grok.data.detectSemanticTypes(dfMsaN1);
|
|
126
140
|
|
|
127
141
|
const col: DG.Column = dfMsaN1.col('seq')!;
|
|
128
|
-
expect(col.semType,
|
|
142
|
+
expect(col.semType, mmSemType);
|
|
129
143
|
expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ.MSA:NT');
|
|
130
144
|
}
|
|
131
145
|
|
|
@@ -134,26 +148,38 @@ export async function _testDetectorsMsaAA1(csvDfMsaAA1: string) {
|
|
|
134
148
|
await grok.data.detectSemanticTypes(dfMsaAA1);
|
|
135
149
|
|
|
136
150
|
const col: DG.Column = dfMsaAA1.col('seq')!;
|
|
137
|
-
expect(col.semType,
|
|
151
|
+
expect(col.semType, mmSemType);
|
|
138
152
|
expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ.MSA:PT');
|
|
139
153
|
}
|
|
140
154
|
|
|
141
|
-
export async function
|
|
142
|
-
const
|
|
143
|
-
await grok.data.detectSemanticTypes(
|
|
155
|
+
/**
 * Asserts that column 'seq' of the given CSV is detected as a macromolecule with units
 * 'separator:SEQ:NT' and that its 'separator' tag equals `separator`.
 */
export async function _testDetectorsSepNt(csv: string, separator: string) {
  const table = DG.DataFrame.fromCsv(csv);
  await grok.data.detectSemanticTypes(table);

  const seqCol = table.col('seq')!;
  expect(seqCol.semType, mmSemType);
  expect(seqCol.getTag(DG.TAGS.UNITS), 'separator:SEQ:NT');
  expect(seqCol.getTag('separator'), separator);
}
|
|
164
|
+
|
|
165
|
+
/**
 * Asserts that column 'seq' of the given CSV is detected as a macromolecule with units
 * 'separator:SEQ:PT' and that its 'separator' tag equals `separator`.
 */
export async function _testDetectorsSepPt(csv: string, separator: string) {
  const table = DG.DataFrame.fromCsv(csv);
  await grok.data.detectSemanticTypes(table);

  const seqCol = table.col('seq')!;
  expect(seqCol.semType, mmSemType);
  expect(seqCol.getTag(DG.TAGS.UNITS), 'separator:SEQ:PT');
  expect(seqCol.getTag('separator'), separator);
}
|
|
149
174
|
|
|
150
|
-
export async function
|
|
151
|
-
const
|
|
152
|
-
await grok.data.detectSemanticTypes(
|
|
175
|
+
/**
 * Asserts that column 'seq' of the given CSV is detected as a macromolecule with units
 * 'separator:SEQ:UN' and that its 'separator' tag equals `separator`.
 */
export async function _testDetectorsSepUn(csv: string, separator: string) {
  const table = DG.DataFrame.fromCsv(csv);
  await grok.data.detectSemanticTypes(table);

  const seqCol = table.col('seq')!;
  expect(seqCol.semType, mmSemType);
  expect(seqCol.getTag(DG.TAGS.UNITS), 'separator:SEQ:UN');
  expect(seqCol.getTag('separator'), separator);
}
|
|
158
184
|
|
|
159
185
|
export async function _testDetectorsSepMsaN1(csvDfSepMsaN1: string) {
|
|
@@ -161,6 +187,27 @@ export async function _testDetectorsSepMsaN1(csvDfSepMsaN1: string) {
|
|
|
161
187
|
await grok.data.detectSemanticTypes(dfSepMsaN1);
|
|
162
188
|
|
|
163
189
|
const col: DG.Column = dfSepMsaN1.col('seq')!;
|
|
164
|
-
expect(col.semType,
|
|
190
|
+
expect(col.semType, mmSemType);
|
|
165
191
|
expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ.MSA:NT');
|
|
166
192
|
}
|
|
193
|
+
|
|
194
|
+
/**
 * Loads the sample FASTA CSV from the package's AppData, runs semantic type detection and asserts
 * that column 'sequence' is a macromolecule with units 'fasta:SEQ:PT' and no 'separator' tag.
 */
export async function _testDetectorsSamplesFastaCsvPt() {
  const csvText = await grok.dapi.files.readAsText('System:AppData/Bio/samples/sample_FASTA.csv');
  const table = DG.DataFrame.fromCsv(csvText);
  await grok.data.detectSemanticTypes(table);

  const seqCol = table.col('sequence')!;
  expect(seqCol.semType, mmSemType);
  expect(seqCol.getTag(DG.TAGS.UNITS), 'fasta:SEQ:PT');
  expect(seqCol.getTag('separator'), null);
}
|
|
204
|
+
|
|
205
|
+
/**
 * Loads the sample .fasta file from the package's AppData, imports it with `importFasta` and asserts
 * that column 'sequence' is a macromolecule with units 'fasta:SEQ:PT' and no 'separator' tag.
 */
export async function _testDetectorsSamplesFastaFastaPt() {
  const fastaText = await grok.dapi.files.readAsText('System:AppData/Bio/samples/sample_FASTA.fasta');
  const [table] = importFasta(fastaText);

  const seqCol = table.col('sequence')!;
  expect(seqCol.semType, mmSemType);
  expect(seqCol.getTag(DG.TAGS.UNITS), 'fasta:SEQ:PT');
  expect(seqCol.getTag('separator'), null);
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import {category, test} from '@datagrok-libraries/utils/src/test';
|
|
2
|
+
import {
|
|
3
|
+
_testMSAIsCorrect,
|
|
4
|
+
_testTableIsNotEmpty,
|
|
5
|
+
} from './utils';
|
|
6
|
+
|
|
7
|
+
import * as DG from 'datagrok-api/dg';
|
|
8
|
+
//import * as grok from 'datagrok-api/grok';
|
|
9
|
+
|
|
10
|
+
export const _package = new DG.Package();
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
// 'MSA' test category: one sanity check on the source table and one check of the
// alignment helper against a pre-aligned column.
category('MSA', async () => {
  //table = await grok.data.files.openTable('Demo:Files/bio/peptides.csv');
  const fromCsv = `seq
FWRWYVKHP
YNRWYVKHP
MWRSWYCKHP`;
  const toCsv = `seq
-F-W-R--W-Y-V-K-H-P
-Y-N-R--W-Y-V-K-H-P
-M-W-R-S-W-Y-C-K-H-P`;

  // Unaligned input table and the column holding the expected alignment.
  const sourceTable = DG.DataFrame.fromCsv(fromCsv);
  const expectedAlignment = DG.DataFrame.fromCsv(toCsv).getCol('seq');

  test('test_table.is_not_empty', async () => {
    await _testTableIsNotEmpty(sourceTable);
  });

  test('is_correct', async () => {
    await _testMSAIsCorrect(expectedAlignment);
  });
});
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import {before, category, test, expect} from '@datagrok-libraries/utils/src/test';
|
|
2
|
+
import * as DG from "datagrok-api/dg";
|
|
3
|
+
import { sequenceSpace } from '../utils/sequence-space';
|
|
4
|
+
import { readDataframe } from './utils';
|
|
5
|
+
//import * as grok from 'datagrok-api/grok';
|
|
6
|
+
|
|
7
|
+
// 'sequenceSpace' test category: loads the sample FASTA table once and checks that
// `sequenceSpace` returns both coordinates and distances for it.
category('sequenceSpace', async () => {
  let fastaTable: DG.DataFrame;

  before(async () => {
    //@ts-ignore
    fastaTable = await readDataframe('sample_FASTA.csv');
  });

  test('sequenceSpaceOpens', async () => {
    const embedColNames = ['Embed_X', 'Embed_Y'];
    //@ts-ignore
    const res = await sequenceSpace(fastaTable.col('Sequence')!, 't-SNE', 'Levenshtein', embedColNames);
    expect(res.coordinates != undefined, true);
    expect(res.distance != undefined, true);
  });
});
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import * as DG from 'datagrok-api/dg';
|
|
2
|
+
import * as grok from "datagrok-api/grok";
|
|
3
|
+
import {expect} from '@datagrok-libraries/utils/src/test';
|
|
4
|
+
import {runKalign} from '../utils/multiple-sequence-alignment';
|
|
5
|
+
import { _package} from '../package-test';
|
|
6
|
+
|
|
7
|
+
/** Reads a file from the package's files as text, via `_package.files.readAsText`. */
export async function loadFileAsText(name: string): Promise<string> {
  const text = await _package.files.readAsText(name);
  return text;
}
|
|
10
|
+
|
|
11
|
+
/**
 * Loads a CSV file from the package's files and parses it into a data frame.
 *
 * @param tableName File name passed to `loadFileAsText`.
 * @returns The parsed data frame, named after the file without its trailing '.csv' extension.
 */
export async function readDataframe(tableName: string): Promise<DG.DataFrame> {
  const file = await loadFileAsText(tableName);
  const df = DG.DataFrame.fromCsv(file);
  // Strip the extension only: a plain string replace would remove the first '.csv'
  // found anywhere in the name (e.g. 'my.csv_data.csv').
  df.name = tableName.replace(/\.csv$/, '');
  return df;
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
/**
 * Asserts that the table has at least one column and at least one row.
 *
 * @param table Table to check.
 */
export function _testTableIsNotEmpty(table: DG.DataFrame): void {
  const isNotEmpty = table.columns.length > 0 && table.rowCount > 0;
  expect(isNotEmpty, true);
}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
/**
 * Runs `runKalign` on the given column and asserts that every produced value is either
 * null or loosely equal to the value at the same row of the input column.
 *
 * @param col Aligned sequences column.
 */
export async function _testMSAIsCorrect(col: DG.Column): Promise<void> {
  const aligned = await runKalign(col, true);
  const allRowsMatch = aligned.toList().every((value, row) => value == col.get(row) || value == null);
  expect(allRowsMatch, true);
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
 * Column names shared across the package.
 * Several values start with '~' — presumably marking helper columns; confirm against the platform conventions.
 */
export enum COLUMNS_NAMES {
  SPLIT_COL = '~split',
  ACTIVITY = '~activity',
  ACTIVITY_SCALED = 'activity_scaled',
  ALIGNED_SEQUENCE = '~aligned_sequence',
  AMINO_ACID_RESIDUE = 'AAR',
  POSITION = 'Pos',
  P_VALUE = 'pValue',
  MEAN_DIFFERENCE = 'Mean difference',
}
|
|
11
|
+
|
|
12
|
+
/** Category labels ('Other' and 'All'); where they are applied is not visible in this file. */
export enum CATEGORIES {
  OTHER = 'Other',
  ALL = 'All',
}
|
|
16
|
+
|
|
17
|
+
/**
 * Tag names used by the package.
 * NOTE(review): SEPARATOR is 'monomer-separator' — confirm it matches the tag name the detectors write.
 */
export enum TAGS {
  AAR = 'AAR',
  POSITION = 'Pos',
  SEPARATOR = 'monomer-separator',
  SELECTION = 'selection',
}
|
|
23
|
+
|
|
24
|
+
/** Semantic type identifiers used by the package. */
export enum SEM_TYPES {
  AMINO_ACIDS = 'aminoAcids',
  ALIGNED_SEQUENCE = 'alignedSequence',
  ALIGNED_SEQUENCE_DIFFERENCE = 'alignedSequenceDifference',
  ACTIVITY = 'activity',
  ACTIVITY_SCALED = 'activityScaled',
  Macro_Molecule = 'Macromolecule',
}
|
|
32
|
+
|
|
33
|
+
// String key 'stats'; its consumers are not visible in this file.
export const STATS = 'stats';

// String key 'embeddingStatus'; its consumers are not visible in this file.
export const EMBEDDING_STATUS = 'embeddingStatus';

// String key 'isPeptidesAnalysis'; its consumers are not visible in this file.
export const PEPTIDES_ANALYSIS = 'isPeptidesAnalysis';
|
|
38
|
+
|
|
39
|
+
/** Flag names used by the package; currently only 'isCellChanging'. */
export enum FLAGS {
  CELL_CHANGING = 'isCellChanging',
}
|
|
42
|
+
|
|
43
|
+
/**
 * Maps a one-letter residue code to a group code (PC, NC, U, SC, H); '-' maps to itself.
 * The group codes are the keys of `groupDescription`.
 */
export const aarGroups = {
  'R': 'PC', 'H': 'PC', 'K': 'PC',
  'D': 'NC', 'E': 'NC',
  'S': 'U', 'T': 'U', 'N': 'U', 'Q': 'U',
  'C': 'SC', 'U': 'SC', 'G': 'SC', 'P': 'SC',
  'A': 'H', 'V': 'H', 'I': 'H', 'L': 'H', 'M': 'H', 'F': 'H', 'Y': 'H', 'W': 'H',
  '-': '-',
};
|
|
51
|
+
|
|
52
|
+
/**
 * Describes each residue group code: a human-readable description and the list of
 * one-letter residue codes that belong to the group.
 */
export const groupDescription: {[key: string]: {'description': string, aminoAcids: string[]}} = {
  'PC': {'description': 'Positive Amino Acids, with Electrically Charged Side Chains', 'aminoAcids': ['R', 'H', 'K']},
  'NC': {'description': 'Negative Amino Acids, with Electrically Charged Side Chains', 'aminoAcids': ['D', 'E']},
  'U': {'description': 'Amino Acids with Polar Uncharged Side Chains', 'aminoAcids': ['S', 'T', 'N', 'Q']},
  'SC': {'description': 'Special Cases', 'aminoAcids': ['C', 'U', 'G', 'P']},
  'H': {
    'description': 'Amino Acids with Hydrophobic Side Chain',
    'aminoAcids': ['A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W'],
  },
  '-': {'description': 'Unknown Amino Acid', 'aminoAcids': ['-']},
};
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import * as DG from 'datagrok-api/dg';
|
|
2
|
+
import * as ui from 'datagrok-api/ui';
|
|
3
|
+
|
|
4
|
+
/**
 * Shows the 'Convert sequence' dialog for a column: displays the column's current units tag
 * and a choice input listing the other known notations. Confirming the dialog does nothing yet.
 *
 * @param col Column whose notation is to be converted.
 */
export function convert(col: DG.Column): void {
  const current = col.tags[DG.TAGS.UNITS];
  //TODO: read all notations
  const notations = ['fasta:SEQ:NT', 'fasta:SEQ:PT', 'fasta:SEQ.MSA:NT', 'fasta:SEQ.MSA:PT', 'HELM'];
  // Offer every notation except the one the column already uses.
  const targetNotations = notations.filter((notation) => notation !== current);
  const choices = ui.choiceInput('convert to', '', targetNotations);

  const onConfirm = (): void => {
    //TODO: create new converted column
    //col.dataFrame.columns.add();
  };

  ui.dialog('Convert sequence')
    .add(ui.div([ui.h1('current notation'), ui.div(current), choices.root]))
    .onOK(onConfirm)
    .show();
}
|