@datagrok/bio 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "@datagrok/bio",
3
3
  "beta": false,
4
4
  "friendlyName": "Bio",
5
- "version": "1.3.0",
5
+ "version": "1.4.1",
6
6
  "description": "Bio is a [package](https://datagrok.ai/help/develop/develop#packages) for the [Datagrok](https://datagrok.ai) platform",
7
7
  "repository": {
8
8
  "type": "git",
@@ -10,10 +10,12 @@
10
10
  "directory": "packages/Bio"
11
11
  },
12
12
  "dependencies": {
13
- "@datagrok-libraries/bio": "^2.1.1",
13
+ "@biowasm/aioli": ">=2.4.0",
14
+ "@datagrok-libraries/bio": "^2.2.0",
14
15
  "@datagrok-libraries/utils": "^0.4.2",
16
+ "@datagrok-libraries/ml": "^2.0.2",
15
17
  "cash-dom": "latest",
16
- "datagrok-api": "^1.3.5",
18
+ "datagrok-api": "^1.4.11",
17
19
  "dayjs": "latest",
18
20
  "ts-loader": "^9.2.5",
19
21
  "typescript": "^4.4.2"
@@ -34,7 +36,8 @@
34
36
  "scripts": {
35
37
  "link-api": "npm link datagrok-api",
36
38
  "link-bio": "npm link @datagrok-libraries/bio",
37
- "link-all": "npm link datagrok-api @datagrok-libraries/bio",
39
+ "link-ml": "npm link @datagrok-libraries/ml",
40
+ "link-all": "npm link datagrok-api @datagrok-libraries/utils @datagrok-libraries/bio @datagrok-libraries/ml",
38
41
  "debug-sequences1": "grok publish --rebuild",
39
42
  "release-sequences1": "grok publish --rebuild --release",
40
43
  "build-sequences1": "webpack",
@@ -47,7 +50,8 @@
47
50
  "lint": "eslint \"./src/**/*.ts\"",
48
51
  "lint-fix": "eslint \"./src/**/*.ts\" --fix",
49
52
  "test": "jest",
50
- "test-local": "set HOST=localhost && jest"
53
+ "test-local": "set HOST=localhost && jest",
54
+ "build-bio-local": "npm --prefix ./../../js-api run build && npm --prefix ./../../libraries/utils run build && npm --prefix ./../../libraries/ml run build && npm run build && npm --prefix ./../../libraries/bio run build && npm run build"
51
55
  },
52
56
  "canEdit": [
53
57
  "Developers"
package/setup.cmd ADDED
@@ -0,0 +1,10 @@
1
+ cd ../../js-api
2
+ call npm install
3
+ call npm link
4
+ cd ../libraries/bio
5
+ call npm install
6
+ call npm link
7
+ cd ../../packages/Bio
8
+ call npm install
9
+ call npm link datagrok-api @datagrok-libraries/bio
10
+ webpack
package/src/const.ts ADDED
@@ -0,0 +1,5 @@
1
+ import * as ui from 'datagrok-api/ui';
2
+ import * as grok from 'datagrok-api/grok';
3
+ import * as DG from 'datagrok-api/dg';
4
+
5
+ export const mmSemType = 'Macromolecule';
@@ -5,8 +5,10 @@ import {runTests, tests} from '@datagrok-libraries/utils/src/test';
5
5
  import './tests/WebLogo-test';
6
6
  import './tests/Palettes-test';
7
7
  import './tests/detectors-test';
8
+ import './tests/msa-tests';
9
+ import './tests/sequence-space-test';
8
10
 
9
- export const _packageTest = new DG.Package();
11
+ export const _package = new DG.Package();
10
12
  export {tests};
11
13
 
12
14
  /** For the 'test' function argument names are fixed as 'category' and 'test' because of way it is called. */
package/src/package.ts CHANGED
@@ -8,6 +8,11 @@ export const _package = new DG.Package();
8
8
 
9
9
  import {WebLogo} from '@datagrok-libraries/bio/src/viewers/web-logo';
10
10
  import {VdRegionsViewer} from './viewers/vd-regions-viewer';
11
+ import {runKalign, testMSAEnoughMemory} from './utils/multiple-sequence-alignment';
12
+ import {convert} from './utils/convert';
13
+ import {TableView} from 'datagrok-api/dg';
14
+ import { getEmbeddingColsNames, sequenceSpace } from './utils/sequence-space';
15
+ import { AvailableMetrics } from '@datagrok-libraries/ml/src/typed-metrics';
11
16
 
12
17
  //name: sequenceAlignment
13
18
  //input: string alignType {choices: ['Local alignment', 'Global alignment']}
@@ -38,3 +43,114 @@ export function webLogoViewer() {
38
43
  export function vdRegionViewer() {
39
44
  return new VdRegionsViewer();
40
45
  }
46
+
47
+ //top-menu: Bio | Activity Cliffs...
48
+ //name: Activity Cliffs
49
+ //description: detect activity cliffs
50
+ //input: dataframe df [Input data table]
51
+ //input: column smiles {type:categorical; semType: Macromolecule}
52
+ //input: column activities
53
+ //input: double similarity = 80 [Similarity cutoff]
54
+ //input: string methodName { choices:["UMAP", "t-SNE", "SPE"] }
55
// Handler registered for the "Bio | Activity Cliffs..." top-menu entry.
// The body is empty: invoking it resolves immediately and none of the parameters are used.
// NOTE(review): looks like a placeholder for a pending implementation — confirm.
+ export async function activityCliffs(df: DG.DataFrame, smiles: DG.Column, activities: DG.Column,
56
+ similarity: number, methodName: string): Promise<void> {
57
+ }
58
+
59
+ //top-menu: Bio | Sequence Space...
60
+ //name: Sequence Space
61
+ //input: dataframe table
62
+ //input: column macroMolecule { semType: Macromolecule }
63
+ //input: string methodName { choices:["UMAP", "t-SNE", "SPE"] }
64
+ //input: string similarityMetric { choices:["Levenshtein", "Tanimoto"] }
65
+ //input: bool plotEmbeddings = true
66
/**
 * Handler for the "Bio | Sequence Space..." top-menu entry.
 *
 * Computes embedding coordinates for the macromolecule column via `sequenceSpace`, appends the
 * returned coordinate columns to `table` and, when requested, adds a scatter plot of the two
 * embedding columns to every open table view whose name equals the table name.
 *
 * @param table Data frame that receives the embedding columns.
 * @param macroMolecule Column with the sequences to embed.
 * @param methodName Dimensionality reduction method name, forwarded to `sequenceSpace`.
 * @param similarityMetric Similarity metric name, forwarded to `sequenceSpace`.
 * @param plotEmbeddings Whether to add a scatter plot of the embeddings.
 */
export async function sequenceSpaceTopMenu(table: DG.DataFrame, macroMolecule: DG.Column, methodName: string,
  similarityMetric: string = 'Levenshtein', plotEmbeddings: boolean): Promise<void> {
  const embedColsNames = getEmbeddingColsNames(table);
  const sequenceSpaceRes = await sequenceSpace(macroMolecule, methodName, similarityMetric, embedColsNames);
  for (const col of sequenceSpaceRes.coordinates)
    table.columns.add(col);

  if (!plotEmbeddings)
    return;

  for (const v of grok.shell.views) {
    // Only table views can host a scatter plot; a non-table view that happens to share the
    // table's name is skipped instead of being blindly cast (same check as compositionAnalysis).
    if (v instanceof DG.TableView && v.name === table.name)
      v.scatterPlot({x: embedColsNames[0], y: embedColsNames[1]});
  }
}
80
+
81
+ //top-menu: Bio | MSA...
82
+ //name: MSA
83
+ //input: dataframe table
84
+ //input: column sequence { semType: Macromolecule }
85
/**
 * Handler for the "Bio | MSA..." top-menu entry: runs kalign on `col` (treated as not yet
 * aligned) and appends the resulting column to `table`.
 *
 * @param table Data frame that receives the aligned column.
 * @param col Column with the sequences to align.
 */
export async function multipleSequenceAlignmentAny(table: DG.DataFrame, col: DG.Column): Promise<void> {
  const alignedColumn: DG.Column = await runKalign(col, false);
  table.columns.add(alignedColumn);
}
89
+
90
+ //name: Composition Analysis
91
+ //top-menu: Bio | Composition Analysis
92
+ //output: viewer result
93
/**
 * Handler for the "Bio | Composition Analysis" top-menu entry.
 *
 * Looks up the first 'Macromolecule' column of the current table, builds a WebLogo viewer for
 * its data frame and docks it at the bottom of the first table view showing a data frame with
 * the same name. Reports an error through the shell when no such column is available.
 *
 * NOTE(review): the function annotation declares `//output: viewer result`, but nothing is
 * returned — confirm which of the two is intended.
 */
export async function compositionAnalysis(): Promise<void> {
  // grok.shell.t can be unset when no table is open, so guard before touching its columns.
  const col = grok.shell.t?.columns.bySemType('Macromolecule');//DG.SEMTYPE.MACROMOLECULE);
  if (col == null) {
    grok.shell.error('Current table does not contain sequences');
    return;
  }

  const df = col.dataFrame;
  const wl = await df.plot.fromType('WebLogo', {});

  for (const v of grok.shell.views) {
    if (v instanceof TableView && v.dataFrame.name === df.name) {
      v.dockManager.dock(wl.root, 'down');
      break; // dock into the first matching view only
    }
  }
}
109
+
110
/**
 * Helper for importFasta: extracts one sequence from the raw file text.
 *
 * Takes the `[startOfSequence, endOfSequence)` slice of `fileContent` (with `String.slice`
 * index semantics) and removes every whitespace character from it, so a sequence spread over
 * several lines is returned as one contiguous string.
 */
function parseMacromolecule(
  fileContent: string,
  startOfSequence: number,
  endOfSequence: number
): string {
  const rawChunk = fileContent.slice(startOfSequence, endOfSequence);
  return rawChunk.replace(/\s/g, '');
}
120
+
121
+ //name: importFasta
122
+ //description: Opens FASTA file
123
+ //tags: file-handler
124
+ //meta.ext: fasta, fna, ffn, faa, frn, fa
125
+ //input: string content
126
+ //output: list tables
127
/**
 * File handler for FASTA files: parses the text into a single data frame with a 'description'
 * column (header lines without the leading '>') and a 'sequence' column (sequence lines of each
 * record joined with all whitespace removed). The sequence column gets the 'Macromolecule'
 * semantic type.
 *
 * Text that precedes the first header line is ignored. Input without any header line yields a
 * data frame with two empty columns.
 *
 * @param fileContent Full text of the FASTA file.
 * @returns A one-element list containing the resulting data frame.
 */
export function importFasta(fileContent: string): DG.DataFrame [] {
  const regex = /^>(.*)$/gm; // match the line starting with >
  const descriptionsArray: string[] = [];
  const sequencesArray: string[] = [];
  let startOfSequence = 0;
  let match: RegExpExecArray | null; // match.index is the beginning of the matched line
  while ((match = regex.exec(fileContent)) !== null) {
    descriptionsArray.push(fileContent.substring(match.index + 1, regex.lastIndex));
    // From the second header on, the previous record's sequence ends where this header starts.
    if (descriptionsArray.length > 1)
      sequencesArray.push(parseMacromolecule(fileContent, startOfSequence, match.index));
    startOfSequence = regex.lastIndex + 1;
  }
  // The last record runs to the end of the text. Using -1 as the end index here would cut off
  // the final residue of files that do not end with a newline. Skipping the push when no header
  // was found keeps both columns the same length.
  if (descriptionsArray.length > 0)
    sequencesArray.push(parseMacromolecule(fileContent, startOfSequence, fileContent.length));

  const descriptionsArrayCol = DG.Column.fromStrings('description', descriptionsArray);
  const sequenceCol = DG.Column.fromStrings('sequence', sequencesArray);
  sequenceCol.semType = 'Macromolecule';
  return [DG.DataFrame.fromColumns([
    descriptionsArrayCol,
    sequenceCol,
  ])];
}
149
+
150
+ //name: Bio | Convert
151
+ //friendly-name: Bio | Convert
152
+ //tags: panel, bio
153
+ //input: column col {semType: Macromolecule}
154
// Panel entry point for Macromolecule columns: passes the column straight to `convert`
// (imported from './utils/convert') and returns nothing.
+ export function convertPanel(col: DG.Column): void {
155
+ convert(col);
156
+ }
@@ -52,6 +52,7 @@ XZJ{}2
52
52
  `;
53
53
 
54
54
  // anonymous functions specified in test() registering must return Promise<any>
55
+ test('testGetStats', async () => { await _testGetStats(csvDfN1); });
55
56
  test('testGetAlphabetSimilarity', async () => { await _testGetAlphabetSimilarity(); });
56
57
 
57
58
  test('testPickupPaletteN1', async () => { await _testPickupPaletteN1(csvDfN1); });
@@ -61,16 +62,18 @@ XZJ{}2
61
62
  });
62
63
 
63
64
 
64
- export async function _testGetAlphabetFreqs(dfN1: DG.DataFrame) {
65
+ export async function _testGetStats(csvDfN1: string) {
66
+ const dfN1: DG.DataFrame = DG.DataFrame.fromCsv(csvDfN1);
65
67
  const seqCol: DG.Column = dfN1.col('seq')!;
66
- const mFreq = WebLogo.getAlphabetFreqs(seqCol);
68
+ const stats = WebLogo.getStats(seqCol, 5, WebLogo.splitterAsFasta);
67
69
 
68
- expectObject(mFreq, {
70
+ expectObject(stats.freq, {
69
71
  'A': 4,
70
72
  'C': 5,
71
73
  'G': 3,
72
74
  'T': 6
73
75
  });
76
+ expect(stats.sameLength, true);
74
77
  }
75
78
 
76
79
  export async function _testGetAlphabetSimilarity() {
@@ -4,7 +4,34 @@ import * as grok from 'datagrok-api/grok';
4
4
  import * as ui from 'datagrok-api/ui';
5
5
  import * as DG from 'datagrok-api/dg';
6
6
 
7
+ import {mmSemType} from '../const';
8
+ import {importFasta} from '../package';
9
+
7
10
  category('detectors', () => {
11
+ const csvDf1: string = `col1
12
+ 1
13
+ 2
14
+ 3`;
15
+
16
+ const csvDf2: string = `col1
17
+ 4
18
+ 5
19
+ 6
20
+ 7`;
21
+
22
+ const csvDf3: string = `col1
23
+ 8
24
+ 9
25
+ 10
26
+ 11
27
+ 12`;
28
+
29
+ const csvDfSmiles: string = `col1
30
+ CCCCN1C(=O)CN=C(c2cc(F)ccc12)C3CCCCC3
31
+ C1CCCCC1
32
+ CCCCCC
33
+ `;
34
+
8
35
  const csvDfN1: string = `seq
9
36
  ACGTC
10
37
  CAGTGT
@@ -60,23 +87,41 @@ YNR-WYV-KHP
60
87
  MWRSWY-CKHP
61
88
  `;
62
89
 
90
+ test('testDetectorsNegative1', async () => { await _testDetectorsNegative(csvDf1); });
91
+ test('testDetectorsNegative2', async () => { await _testDetectorsNegative(csvDf2); });
92
+ test('testDetectorsNegative3', async () => { await _testDetectorsNegative(csvDf3); });
93
+ test('testDetectorsNegativeSmiles', async () => { await _testDetectorsNegative(csvDfSmiles); });
94
+
63
95
  test('testDetectorsN1', async () => { await _testDetectorsN1(csvDfN1); });
64
96
  test('testDetectorsAA1', async () => { await _testDetectorsAA1(csvDfAA1); });
65
97
  test('testDetectorsMsaN1', async () => { await _testDetectorsMsaN1(csvDfMsaN1); });
66
98
  test('testDetectorsMsaAA1', async () => { await _testDetectorsMsaAA1(csvDfMsaAA1); });
67
99
 
68
- test('testDetectorsSepUn1', async () => { await _testDetectorsSepUn1(csvDfSepUn1); });
69
- test('testDetectorsSepUn2', async () => { await _testDetectorsSepUn2(csvDfSepUn2); });
100
+ test('testDetectorsSepNt', async () => { await _testDetectorsSepNt(csvDfSepNt, '*'); });
101
+ test('testDetectorsSepPt', async () => { await _testDetectorsSepPt(csvDfSepPt, '-'); });
102
+ test('testDetectorsSepUn1', async () => { await _testDetectorsSepUn(csvDfSepUn1, '-'); });
103
+ test('testDetectorsSepUn2', async () => { await _testDetectorsSepUn(csvDfSepUn2, '/'); });
70
104
 
71
105
  test('testDetectorsSepMsaN1', async () => { await _testDetectorsSepMsaN1(csvDfSepMsaN1); });
106
+
107
+ test('testDetectorsSamplesFastaCsvPt', async () => { await _testDetectorsSamplesFastaCsvPt(); });
108
+ test('testDetectorsSamplesFastaFastaPt', async () => { await _testDetectorsSamplesFastaFastaPt(); });
72
109
  });
73
110
 
111
/**
 * Negative detector test: parses the CSV, runs semantic type detection and expects that
 * column 'col1' is NOT tagged with the Macromolecule semantic type.
 *
 * @param csvDf CSV text with a 'col1' column.
 */
export async function _testDetectorsNegative(csvDf: string) {
  const parsed: DG.DataFrame = DG.DataFrame.fromCsv(csvDf);
  await grok.data.detectSemanticTypes(parsed);

  const target: DG.Column = parsed.col('col1')!;
  const detectedAsMacromolecule = target.semType == mmSemType;
  expect(detectedAsMacromolecule, false);
}
118
+
74
119
  export async function _testDetectorsN1(csvDfN1: string) {
75
120
  const dfN1: DG.DataFrame = DG.DataFrame.fromCsv(csvDfN1);
76
121
  await grok.data.detectSemanticTypes(dfN1);
77
122
 
78
123
  const col: DG.Column = dfN1.col('seq')!;
79
- expect(col.semType, 'MACROMOLECULE');
124
+ expect(col.semType, mmSemType);
80
125
  expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ:NT');
81
126
  }
82
127
 
@@ -85,7 +130,7 @@ export async function _testDetectorsAA1(csvDfAA1: string) {
85
130
  await grok.data.detectSemanticTypes(dfAA1);
86
131
 
87
132
  const col: DG.Column = dfAA1.col('seq')!;
88
- expect(col.semType, 'MACROMOLECULE');
133
+ expect(col.semType, mmSemType);
89
134
  expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ:PT');
90
135
  }
91
136
 
@@ -94,7 +139,7 @@ export async function _testDetectorsMsaN1(csvDfMsaN1: string) {
94
139
  await grok.data.detectSemanticTypes(dfMsaN1);
95
140
 
96
141
  const col: DG.Column = dfMsaN1.col('seq')!;
97
- expect(col.semType, 'MACROMOLECULE');
142
+ expect(col.semType, mmSemType);
98
143
  expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ.MSA:NT');
99
144
  }
100
145
 
@@ -103,26 +148,38 @@ export async function _testDetectorsMsaAA1(csvDfMsaAA1: string) {
103
148
  await grok.data.detectSemanticTypes(dfMsaAA1);
104
149
 
105
150
  const col: DG.Column = dfMsaAA1.col('seq')!;
106
- expect(col.semType, 'MACROMOLECULE');
151
+ expect(col.semType, mmSemType);
107
152
  expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ.MSA:PT');
108
153
  }
109
154
 
110
- export async function _testDetectorsSepUn1(csvDfSepUn1: string) {
111
- const dfSepUn1: DG.DataFrame = DG.DataFrame.fromCsv(csvDfSepUn1);
112
- await grok.data.detectSemanticTypes(dfSepUn1);
155
+ export async function _testDetectorsSepNt(csv: string, separator: string) {
156
+ const df: DG.DataFrame = DG.DataFrame.fromCsv(csv);
157
+ await grok.data.detectSemanticTypes(df);
113
158
 
114
- const col: DG.Column = dfSepUn1.col('seq')!;
115
- expect(col.semType, 'MACROMOLECULE');
116
- expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ:UN');
159
+ const col: DG.Column = df.col('seq')!;
160
+ expect(col.semType, mmSemType);
161
+ expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ:NT');
162
+ expect(col.getTag('separator'), separator);
117
163
  }
118
164
 
119
- export async function _testDetectorsSepUn2(csvDfSepUn2: string) {
120
- const dfSepUn2: DG.DataFrame = DG.DataFrame.fromCsv(csvDfSepUn2);
121
- await grok.data.detectSemanticTypes(dfSepUn2);
165
+ export async function _testDetectorsSepPt(csv: string, separator: string) {
166
+ const df: DG.DataFrame = DG.DataFrame.fromCsv(csv);
167
+ await grok.data.detectSemanticTypes(df);
122
168
 
123
- const col: DG.Column = dfSepUn2.col('seq')!;
124
- expect(col.semType, 'MACROMOLECULE');
169
+ const col: DG.Column = df.col('seq')!;
170
+ expect(col.semType, mmSemType);
171
+ expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ:PT');
172
+ expect(col.getTag('separator'), separator);
173
+ }
174
+
175
+ export async function _testDetectorsSepUn(csv: string, separator: string) {
176
+ const df: DG.DataFrame = DG.DataFrame.fromCsv(csv);
177
+ await grok.data.detectSemanticTypes(df);
178
+
179
+ const col: DG.Column = df.col('seq')!;
180
+ expect(col.semType, mmSemType);
125
181
  expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ:UN');
182
+ expect(col.getTag('separator'), separator);
126
183
  }
127
184
 
128
185
  export async function _testDetectorsSepMsaN1(csvDfSepMsaN1: string) {
@@ -130,6 +187,27 @@ export async function _testDetectorsSepMsaN1(csvDfSepMsaN1: string) {
130
187
  await grok.data.detectSemanticTypes(dfSepMsaN1);
131
188
 
132
189
  const col: DG.Column = dfSepMsaN1.col('seq')!;
133
- expect(col.semType, 'MACROMOLECULE');
190
+ expect(col.semType, mmSemType);
134
191
  expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ.MSA:NT');
135
192
  }
193
+
194
/**
 * Reads the sample_FASTA.csv sample, runs semantic type detection and expects the 'sequence'
 * column to be a Macromolecule with units 'fasta:SEQ:PT' and no 'separator' tag.
 */
export async function _testDetectorsSamplesFastaCsvPt() {
  const samplePath = 'System:AppData/Bio/samples/sample_FASTA.csv';
  const csvText: string = await grok.dapi.files.readAsText(samplePath);
  const sample: DG.DataFrame = DG.DataFrame.fromCsv(csvText);
  await grok.data.detectSemanticTypes(sample);

  const seqCol: DG.Column = sample.col('sequence')!;
  expect(seqCol.semType, mmSemType);
  expect(seqCol.getTag(DG.TAGS.UNITS), 'fasta:SEQ:PT');
  expect(seqCol.getTag('separator'), null);
}
204
+
205
/**
 * Reads the sample_FASTA.fasta sample, imports it with importFasta and expects the 'sequence'
 * column to be a Macromolecule with units 'fasta:SEQ:PT' and no 'separator' tag.
 *
 * NOTE(review): no semantic type detection is run here and importFasta, as visible in
 * package.ts, only assigns semType — confirm where the units tag is expected to come from.
 */
export async function _testDetectorsSamplesFastaFastaPt() {
  const samplePath = 'System:AppData/Bio/samples/sample_FASTA.fasta';
  const fastaText: string = await grok.dapi.files.readAsText(samplePath);
  const [imported]: DG.DataFrame[] = importFasta(fastaText);

  const seqCol: DG.Column = imported.col('sequence')!;
  expect(seqCol.semType, mmSemType);
  expect(seqCol.getTag(DG.TAGS.UNITS), 'fasta:SEQ:PT');
  expect(seqCol.getTag('separator'), null);
}
@@ -0,0 +1,34 @@
1
+ import {category, test} from '@datagrok-libraries/utils/src/test';
2
+ import {
3
+ _testMSAIsCorrect,
4
+ _testTableIsNotEmpty,
5
+ } from './utils';
6
+
7
+ import * as DG from 'datagrok-api/dg';
8
+ //import * as grok from 'datagrok-api/grok';
9
+
10
+ export const _package = new DG.Package();
11
+
12
+
13
+ category('MSA', async () => {
14
+ //table = await grok.data.files.openTable('Demo:Files/bio/peptides.csv');
15
+ const fromCsv = `seq
16
+ FWRWYVKHP
17
+ YNRWYVKHP
18
+ MWRSWYCKHP`;
19
+ const toCsv = `seq
20
+ -F-W-R--W-Y-V-K-H-P
21
+ -Y-N-R--W-Y-V-K-H-P
22
+ -M-W-R-S-W-Y-C-K-H-P`;
23
+ const table: DG.DataFrame = DG.DataFrame.fromCsv(fromCsv);
24
+ const toTable: DG.DataFrame = DG.DataFrame.fromCsv(toCsv);
25
+ const alignedSequencesColumn = toTable.getCol('seq');
26
+
27
+ test('test_table.is_not_empty', async () => {
28
+ await _testTableIsNotEmpty(table);
29
+ });
30
+
31
+ test('is_correct', async () => {
32
+ await _testMSAIsCorrect(alignedSequencesColumn);
33
+ });
34
+ });
@@ -0,0 +1,24 @@
1
+ import {before, category, test, expect} from '@datagrok-libraries/utils/src/test';
2
+ import * as DG from "datagrok-api/dg";
3
+ import { sequenceSpace } from '../utils/sequence-space';
4
+ import { readDataframe } from './utils';
5
+ //import * as grok from 'datagrok-api/grok';
6
+
7
+ category('sequenceSpace', async () => {
8
+
9
+ let testFastaDf: DG.DataFrame;
10
+
11
+ before(async () => {
12
+ //@ts-ignore
13
+ testFastaDf = await readDataframe('sample_FASTA.csv');
14
+ });
15
+
16
+
17
+ test('sequenceSpaceOpens', async () => {
18
+ //@ts-ignore
19
+ const res = await sequenceSpace(testFastaDf.col('Sequence')!, 't-SNE', 'Levenshtein', ['Embed_X', 'Embed_Y']);
20
+ expect(res.coordinates != undefined, true);
21
+ expect(res.distance != undefined, true);
22
+ });
23
+
24
+ });
@@ -0,0 +1,39 @@
1
+ import * as DG from 'datagrok-api/dg';
2
+ import * as grok from "datagrok-api/grok";
3
+ import {expect} from '@datagrok-libraries/utils/src/test';
4
+ import {runKalign} from '../utils/multiple-sequence-alignment';
5
+ import { _package} from '../package-test';
6
+
7
/**
 * Reads a file through `_package.files.readAsText` and resolves with its text.
 *
 * @param name File name passed to `readAsText`.
 */
export async function loadFileAsText(name: string): Promise<string> {
  const text = await _package.files.readAsText(name);
  return text;
}
10
+
11
/**
 * Loads a CSV file with loadFileAsText, parses it into a data frame and names the frame after
 * the file (the first '.csv' occurrence in `tableName` is removed).
 *
 * @param tableName Name of the CSV file to load.
 */
export async function readDataframe(tableName: string): Promise<DG.DataFrame> {
  const csvText = await loadFileAsText(tableName);
  const frame = DG.DataFrame.fromCsv(csvText);
  const frameName = tableName.replace('.csv', '');
  frame.name = frameName;
  return frame;
}
17
+
18
+
19
/**
 * Asserts that the table has at least one column and at least one row.
 *
 * @param {DG.DataFrame} table Table to check.
 */
export function _testTableIsNotEmpty(table: DG.DataFrame): void {
  // Row count only matters once at least one column exists
  const isPopulated = table.columns.length > 0 ? table.rowCount > 0 : false;
  expect(isPopulated, true);
}
27
+
28
+
29
/**
 * Re-aligns an already aligned column with kalign and asserts that every produced value is
 * either equal to the value at the same index of the input column or null.
 *
 * @export
 * @param {DG.Column} col Aligned sequences column.
 */
export async function _testMSAIsCorrect(col: DG.Column): Promise<void> {
  const realigned = await runKalign(col, true);
  const produced = realigned.toList();
  // A mismatch is a non-null value that differs from the input at the same position
  const hasMismatch = produced.some((v, i) => v != col.get(i) && v != null);
  expect(!hasMismatch, true);
}
@@ -0,0 +1,62 @@
1
+ export enum COLUMNS_NAMES {
2
+ SPLIT_COL = '~split',
3
+ ACTIVITY = '~activity',
4
+ ACTIVITY_SCALED = 'activity_scaled',
5
+ ALIGNED_SEQUENCE = '~aligned_sequence',
6
+ AMINO_ACID_RESIDUE = 'AAR',
7
+ POSITION = 'Pos',
8
+ P_VALUE = 'pValue',
9
+ MEAN_DIFFERENCE = 'Mean difference',
10
+ }
11
+
12
+ export enum CATEGORIES {
13
+ OTHER = 'Other',
14
+ ALL = 'All',
15
+ }
16
+
17
+ export enum TAGS {
18
+ AAR = 'AAR',
19
+ POSITION = 'Pos',
20
+ SEPARATOR = 'monomer-separator',
21
+ SELECTION = 'selection',
22
+ }
23
+
24
// String identifiers for semantic types.
// `Macro_Molecule` is the member that runKalign assigns to the semType of the aligned column;
// its value equals `mmSemType` ('Macromolecule') exported from const.ts.
+ export enum SEM_TYPES {
25
+ AMINO_ACIDS = 'aminoAcids',
26
+ ALIGNED_SEQUENCE = 'alignedSequence',
27
+ ALIGNED_SEQUENCE_DIFFERENCE = 'alignedSequenceDifference',
28
+ ACTIVITY = 'activity',
29
+ ACTIVITY_SCALED = 'activityScaled',
30
+ Macro_Molecule = 'Macromolecule',
31
+ }
32
+
33
+ export const STATS = 'stats';
34
+
35
+ export const EMBEDDING_STATUS = 'embeddingStatus';
36
+
37
+ export const PEPTIDES_ANALYSIS = 'isPeptidesAnalysis';
38
+
39
+ export enum FLAGS {
40
+ CELL_CHANGING = 'isCellChanging',
41
+ }
42
+
43
// Maps a one-letter residue code (plus the '-' placeholder) to a short group code.
// Every group code used as a value here is a key of `groupDescription`.
+ export const aarGroups = {
44
+ 'R': 'PC', 'H': 'PC', 'K': 'PC',
45
+ 'D': 'NC', 'E': 'NC',
46
+ 'S': 'U', 'T': 'U', 'N': 'U', 'Q': 'U',
47
+ 'C': 'SC', 'U': 'SC', 'G': 'SC', 'P': 'SC',
48
+ 'A': 'H', 'V': 'H', 'I': 'H', 'L': 'H', 'M': 'H', 'F': 'H', 'Y': 'H', 'W': 'H',
49
+ '-': '-',
50
+ };
51
+
52
// For each group code (the values of `aarGroups`): a human-readable description and the list
// of residue codes that belong to the group.
+ export const groupDescription: {[key: string]: {'description': string, aminoAcids: string[]}} = {
53
+ 'PC': {'description': 'Positive Amino Acids, with Electrically Charged Side Chains', 'aminoAcids': ['R', 'H', 'K']},
54
+ 'NC': {'description': 'Negative Amino Acids, with Electrically Charged Side Chains', 'aminoAcids': ['D', 'E']},
55
+ 'U': {'description': 'Amino Acids with Polar Uncharged Side Chains', 'aminoAcids': ['S', 'T', 'N', 'Q']},
56
+ 'SC': {'description': 'Special Cases', 'aminoAcids': ['C', 'U', 'G', 'P']},
57
+ 'H': {
58
+ 'description': 'Amino Acids with Hydrophobic Side Chain',
59
+ 'aminoAcids': ['A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W'],
60
+ },
61
+ '-': {'description': 'Unknown Amino Acid', 'aminoAcids': ['-']},
62
+ };
@@ -0,0 +1,24 @@
1
+ import * as DG from 'datagrok-api/dg';
2
+ import * as ui from 'datagrok-api/ui';
3
+
4
/**
 * Shows the 'Convert sequence' dialog for a column: displays the column's current units tag
 * and a choice input listing the other known notations. Confirming the dialog does nothing
 * yet (see the TODO in the OK handler).
 *
 * @param col Column whose notation is shown.
 */
export function convert(col: DG.Column): void {
  const currentNotation = col.tags[DG.TAGS.UNITS];
  //TODO: read all notations
  const knownNotations = ['fasta:SEQ:NT', 'fasta:SEQ:PT', 'fasta:SEQ.MSA:NT', 'fasta:SEQ.MSA:PT', 'HELM'];
  const targetNotations = knownNotations.filter((notation) => notation !== currentNotation);
  const targetInput = ui.choiceInput("convert to", "", targetNotations);

  const dialog = ui.dialog('Convert sequence');
  const content = ui.div([
    ui.h1('current notation'),
    ui.div(currentNotation),
    targetInput.root
  ]);

  dialog
    .add(content)
    .onOK(() => {
      //TODO: create new converted column
      //col.dataFrame.columns.add();
    })
    .show();
}
@@ -0,0 +1,76 @@
1
+ import * as DG from 'datagrok-api/dg';
2
+
3
+ //@ts-ignore
4
+ import Aioli from '@biowasm/aioli';
5
+
6
+ import {AlignedSequenceEncoder} from '@datagrok-libraries/bio/src/sequence-encoder';
7
+ import * as C from './constants';
8
+
9
/**
 * Serializes sequences as FASTA text: each sequence becomes a `>sampleN` header line
 * (N is the 1-based position) followed by the sequence on its own line.
 *
 * @param {string[]} sequences Input list of sequences.
 * @return {string} Fasta-formatted string.
 */
function _stringsToFasta(sequences: string[]): string {
  const records = sequences.map((seq, idx) => `>sample${idx + 1}\n${seq}\n`);
  return records.join('');
}
18
+
19
/**
 * Extracts the sequences from FASTA text, one array entry per record, in file order.
 *
 * A record starts at a line beginning with '>' (the header text itself is ignored) and its
 * sequence is the concatenation of all following lines up to the next header. This keeps a
 * sequence that the writer wrapped over several lines as a single entry; splitting on every
 * newline would turn each wrapped line into its own entry and shift all later rows.
 * '\n', '\r\n' and '\r' line endings are accepted. Non-empty lines that appear before the
 * first header are returned as separate entries.
 *
 * @param {string} fasta Fasta-formatted string.
 * @return {string[]} Output list of sequences.
 */
function _fastaToStrings(fasta: string): string[] {
  const records: string[] = [];
  let current: string | null = null; // null until the first header line is seen

  for (const line of fasta.split(/\r\n|\r|\n/)) {
    if (line.startsWith('>')) {
      if (current !== null)
        records.push(current);
      current = '';
    } else if (current !== null) {
      current += line;
    } else if (line !== '') {
      records.push(line);
    }
  }
  if (current !== null)
    records.push(current);

  return records;
}
28
+
29
/**
 * Aligns the sequences of a column with the kalign tool running in an Aioli environment.
 *
 * The sequences are written as FASTA to 'input.fa', kalign is executed with FASTA output to
 * 'result.fasta', and the parsed result (truncated to the number of input sequences) is
 * returned as a new column named `msa(<source column name>)` with an empty units tag and the
 * Macromolecule semantic type.
 *
 * @param {DG.Column} col Column with sequences.
 * @param {boolean} isAligned Whether the column is aligned; if so, each value is cleaned with
 *   AlignedSequenceEncoder.clean and stripped of '-' characters before alignment.
 * @return {Promise<DG.Column>} Aligned sequences.
 */
export async function runKalign(col: DG.Column, isAligned = false) : Promise<DG.Column> {
  const rawSequences = col.toList();
  const sequences = isAligned ?
    rawSequences.map((seq: string) => AlignedSequenceEncoder.clean(seq).replace(/\-/g, '')) :
    rawSequences;

  const fasta = _stringsToFasta(sequences);
  const aioliOptions = {
    tool: 'kalign',
    version: '3.3.1',
    reinit: true,
  };
  const CLI = await new Aioli(aioliOptions);

  console.log(['fasta.length =', fasta.length]);

  await CLI.fs.writeFile('input.fa', fasta);
  const output = await CLI.exec('kalign input.fa -f fasta -o result.fasta');
  const buf = await CLI.cat('result.fasta');

  console.warn(output);

  // Keep exactly one aligned value per input sequence
  const aligned = _fastaToStrings(buf).slice(0, sequences.length);
  const alignedCol = DG.Column.fromStrings(`msa(${col.name})`, aligned);
  alignedCol.setTag(DG.TAGS.UNITS, '');
  alignedCol.semType = C.SEM_TYPES.Macro_Molecule;
  return alignedCol;
}
63
+
64
/**
 * Diagnostic helper: runs kalign on growing prefixes of the column (steps of about 1% of the
 * row count, stopping before the full column) and logs for each prefix size whether the run
 * succeeded or failed. Failures are logged, not rethrown, so the probing continues.
 *
 * @param col Column with the sequences to probe with.
 */
export async function testMSAEnoughMemory(col: DG.Column): Promise<void> {
  // Read the values once instead of on every iteration.
  // NOTE(review): assumes the column is not modified while the probe runs — confirm.
  const sequences = col.toList();
  const sequencesCount = col.length;
  const delta = sequencesCount / 100;
  let previousSize = 0;

  for (let i = delta; i < sequencesCount; i += delta) {
    const size = Math.round(i);
    // With fewer than 100 rows the step is fractional, so rounding yields 0 and repeated
    // sizes; skip those instead of running kalign on an empty or already-tested prefix.
    if (size === previousSize)
      continue;
    previousSize = size;

    try {
      await runKalign(DG.Column.fromStrings(col.name, sequences.slice(0, size)));
      console.log(`runKalign succeeded on ${size}`);
    } catch (error) {
      console.log(`runKalign failed on ${size} with '${error}'`);
    }
  }
}