@datagrok/bio 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/package.ts CHANGED
@@ -2,12 +2,18 @@
2
2
  import * as grok from 'datagrok-api/grok';
3
3
  import * as ui from 'datagrok-api/ui';
4
4
  import * as DG from 'datagrok-api/dg';
5
+
5
6
  import {SequenceAlignment, Aligned} from './seq_align';
6
7
 
7
8
  export const _package = new DG.Package();
8
9
 
9
10
  import {WebLogo} from '@datagrok-libraries/bio/src/viewers/web-logo';
10
11
  import {VdRegionsViewer} from './viewers/vd-regions-viewer';
12
+ import {runKalign, testMSAEnoughMemory} from './utils/multiple-sequence-alignment';
13
+ import {TableView} from 'datagrok-api/dg';
14
+ import {mmSemType} from './const';
15
+ import {Nucleotides} from '@datagrok-libraries/bio/src/nucleotides';
16
+ import {Aminoacids} from '@datagrok-libraries/bio/src/aminoacids';
11
17
 
12
18
  //name: sequenceAlignment
13
19
  //input: string alignType {choices: ['Local alignment', 'Global alignment']}
@@ -38,3 +44,97 @@ export function webLogoViewer() {
38
44
  export function vdRegionViewer() {
39
45
  return new VdRegionsViewer();
40
46
  }
47
+
48
+ //top-menu: Bio | Activity Cliffs...
49
+ //name: Activity Cliffs
50
+ //description: detect activity cliffs
51
+ //input: dataframe df [Input data table]
52
+ //input: column smiles {type:categorical; semType: Macromolecule}
53
+ //input: column activities
54
+ //input: double similarity = 80 [Similarity cutoff]
55
+ //input: string methodName { choices:["UMAP", "t-SNE", "SPE"] }
56
+ export async function activityCliffs(df: DG.DataFrame, smiles: DG.Column, activities: DG.Column,
57
+ similarity: number, methodName: string): Promise<void> {
// NOTE(review): stub — the body is empty, so the declared inputs are accepted but nothing is computed yet.
// NOTE(review): the `smiles` column is annotated with semType Macromolecule; the parameter name looks
// carried over from a chemistry function — confirm before renaming, since the annotation references it.
58
+ }
59
+
60
+ //top-menu: Bio | Sequence Space...
61
+ //name: Sequence Space
62
+ //input: dataframe table
63
+ //input: column smiles { semType: Macromolecule }
64
+ //input: string methodName { choices:["UMAP", "t-SNE", "SPE", "pSPE", "OriginalSPE"] }
65
+ //input: string similarityMetric { choices:["Tanimoto", "Asymmetric", "Cosine", "Sokal"] }
66
+ //input: bool plotEmbeddings = true
67
+ export async function chemSpaceTopMenu(table: DG.DataFrame, smiles: DG.Column, methodName: string,
68
+ similarityMetric: string = 'Tanimoto', plotEmbeddings: boolean): Promise<void> {
// NOTE(review): stub — the body is empty, so the "Sequence Space" menu item currently computes nothing.
// NOTE(review): `similarityMetric` has a default but precedes the required `plotEmbeddings`, so the
// default only takes effect when `undefined` is passed explicitly for it.
// NOTE(review): the function is registered as "Sequence Space" but is still named chemSpaceTopMenu.
69
+ };
70
+
71
//top-menu: Bio | MSA...
//name: MSA
//input: dataframe table
//input: column sequence { semType: Macromolecule }
export async function multipleSequenceAlignmentAny(table: DG.DataFrame, col: DG.Column): Promise<void> {
  // Run kalign on the column as-is (isAligned = false) and append the resulting column to the table.
  table.columns.add(await runKalign(col, false));
}
79
+
80
//name: Composition Analysis
//top-menu: Bio | Composition Analysis
//output: viewer result
export async function compositionAnalysis(): Promise<void> {
  // Builds a WebLogo viewer for the first Macromolecule column of the current table and docks it
  // below the table view whose data frame has the same name.
  // NOTE(review): the annotation declares `viewer result`, but nothing is returned — confirm intent.

  // Guard against no table being open: presumably grok.shell.t is empty then (verify against the
  // Datagrok API); that case is reported the same way as a table without sequences.
  const col = grok.shell.t?.columns.bySemType('Macromolecule');//DG.SEMTYPE.MACROMOLECULE);
  if (col == null) {
    grok.shell.error('Current table does not contain sequences');
    return;
  }

  const wl = await col.dataFrame.plot.fromType('WebLogo', {});

  for (const v of grok.shell.views) {
    // `instanceof` already narrows `v` to TableView, so no cast is needed.
    if (v instanceof TableView && v.dataFrame.name === col.dataFrame.name) {
      v.dockManager.dock(wl.root, 'down');
      break; // dock into the first matching view only
    }
  }
}
99
+
100
//name: importFasta
//description: Opens FASTA file
//tags: file-handler
//meta.ext: fasta, fna, ffn, faa, frn, fa
//input: string content
//output: list tables
export function importFasta(content: string): DG.DataFrame [] {
  // Parses FASTA text into a single data frame with a 'description' column (header lines without
  // the leading '>') and a 'sequence' column, then tags the sequence column with the detected
  // notation (SEQ / SEQ.MSA) and alphabet (NT / PT / UN).

  // Matches every header line, i.e. a line that starts with '>'.
  const headerRe = /^>(.*)$/gm;
  // A record body may be wrapped over several lines; join it into one residue string.
  const joinLines = (chunk: string): string => chunk.replace(/[\r\n]+/g, '');

  const descriptions: string[] = [];
  const sequences: string[] = [];
  let bodyStart = -1; // offset where the current record's sequence starts; -1 until a header is seen
  let match: RegExpExecArray | null;
  while ((match = headerRe.exec(content)) !== null) {
    // The previous record ends where this header BEGINS (match.index). Slicing up to the end of
    // the header (lastIndex) would append the next record's header line to the sequence.
    if (bodyStart >= 0)
      sequences.push(joinLines(content.substring(bodyStart, match.index)));
    // `.` also matches '\r', so drop it for files with CRLF line endings.
    descriptions.push(match[1].replace(/\r$/, ''));
    bodyStart = headerRe.lastIndex + 1;
  }
  if (bodyStart < 0) {
    // No header at all: treat the whole content as one unnamed record so that both columns keep
    // the same length.
    descriptions.push('');
    bodyStart = 0;
  }
  sequences.push(joinLines(content.substring(bodyStart)));

  const descriptionsCol = DG.Column.fromStrings('description', descriptions);
  const sequenceCol = DG.Column.fromStrings('sequence', sequences);

  const stats: { freq: { [m: string]: number }, sameLength: boolean } = WebLogo.getStats(sequenceCol, 5, WebLogo.splitterAsFasta);
  const seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';
  const alphabetCandidates: [string, Set<string>][] = [
    ['NT', new Set(Object.keys(Nucleotides.Names))],
    ['PT', new Set(Object.keys(Aminoacids.Names))],
  ];
  // Calculate likelihoods for alphabet_candidates
  const alphabetCandidatesSim: number[] = alphabetCandidates.map(
    (c) => WebLogo.getAlphabetSimilarity(stats.freq, c[1]));
  const maxCos = Math.max(...alphabetCandidatesSim);
  // Below the 0.65 similarity cutoff the alphabet is reported as unknown ('UN').
  const alphabet = maxCos > 0.65 ? alphabetCandidates[alphabetCandidatesSim.indexOf(maxCos)][0] : 'UN';
  sequenceCol.semType = mmSemType;
  sequenceCol.setTag(DG.TAGS.UNITS, `fasta:${seqType}:${alphabet}`);

  return [DG.DataFrame.fromColumns([
    descriptionsCol,
    sequenceCol,
  ])];
}
@@ -52,6 +52,7 @@ XZJ{}2
52
52
  `;
53
53
 
54
54
  // anonymous functions specified in test() registering must return Promise<any>
55
+ test('testGetStats', async () => { await _testGetStats(csvDfN1); });
55
56
  test('testGetAlphabetSimilarity', async () => { await _testGetAlphabetSimilarity(); });
56
57
 
57
58
  test('testPickupPaletteN1', async () => { await _testPickupPaletteN1(csvDfN1); });
@@ -61,16 +62,18 @@ XZJ{}2
61
62
  });
62
63
 
63
64
 
64
- export async function _testGetAlphabetFreqs(dfN1: DG.DataFrame) {
65
+ export async function _testGetStats(csvDfN1: string) {
66
+ const dfN1: DG.DataFrame = DG.DataFrame.fromCsv(csvDfN1);
65
67
  const seqCol: DG.Column = dfN1.col('seq')!;
66
- const mFreq = WebLogo.getAlphabetFreqs(seqCol);
68
+ const stats = WebLogo.getStats(seqCol, 5, WebLogo.splitterAsFasta);
67
69
 
68
- expectObject(mFreq, {
70
+ expectObject(stats.freq, {
69
71
  'A': 4,
70
72
  'C': 5,
71
73
  'G': 3,
72
74
  'T': 6
73
75
  });
76
+ expect(stats.sameLength, true);
74
77
  }
75
78
 
76
79
  export async function _testGetAlphabetSimilarity() {
@@ -4,24 +4,33 @@ import * as grok from 'datagrok-api/grok';
4
4
  import * as ui from 'datagrok-api/ui';
5
5
  import * as DG from 'datagrok-api/dg';
6
6
 
7
+ import {mmSemType} from '../const';
8
+ import {importFasta} from '../package';
9
+
7
10
  category('detectors', () => {
8
- // const csvDf1: string = `col1
9
- // 1
10
- // 2
11
- // 3`;
12
- //
13
- // const csvDf2: string = `col1
14
- // 4
15
- // 5
16
- // 6
17
- // 7`;
18
- //
19
- // const csvDf3: string = `col1
20
- // 8
21
- // 9
22
- // 10
23
- // 11
24
- // 12`;
11
+ const csvDf1: string = `col1
12
+ 1
13
+ 2
14
+ 3`;
15
+
16
+ const csvDf2: string = `col1
17
+ 4
18
+ 5
19
+ 6
20
+ 7`;
21
+
22
+ const csvDf3: string = `col1
23
+ 8
24
+ 9
25
+ 10
26
+ 11
27
+ 12`;
28
+
29
+ const csvDfSmiles: string = `col1
30
+ CCCCN1C(=O)CN=C(c2cc(F)ccc12)C3CCCCC3
31
+ C1CCCCC1
32
+ CCCCCC
33
+ `;
25
34
 
26
35
  const csvDfN1: string = `seq
27
36
  ACGTC
@@ -78,36 +87,41 @@ YNR-WYV-KHP
78
87
  MWRSWY-CKHP
79
88
  `;
80
89
 
81
- // test('testDetectors1', async () => { await _testDetectors(csvDf1); });
82
- // test('testDetectors2', async () => { await _testDetectors(csvDf2); });
83
- // test('testDetectors3', async () => { await _testDetectors(csvDf3); });
90
+ test('testDetectorsNegative1', async () => { await _testDetectorsNegative(csvDf1); });
91
+ test('testDetectorsNegative2', async () => { await _testDetectorsNegative(csvDf2); });
92
+ test('testDetectorsNegative3', async () => { await _testDetectorsNegative(csvDf3); });
93
+ test('testDetectorsNegativeSmiles', async () => { await _testDetectorsNegative(csvDfSmiles); });
84
94
 
85
95
  test('testDetectorsN1', async () => { await _testDetectorsN1(csvDfN1); });
86
96
  test('testDetectorsAA1', async () => { await _testDetectorsAA1(csvDfAA1); });
87
97
  test('testDetectorsMsaN1', async () => { await _testDetectorsMsaN1(csvDfMsaN1); });
88
98
  test('testDetectorsMsaAA1', async () => { await _testDetectorsMsaAA1(csvDfMsaAA1); });
89
99
 
90
- test('testDetectorsSepUn1', async () => { await _testDetectorsSepUn1(csvDfSepUn1); });
91
- test('testDetectorsSepUn2', async () => { await _testDetectorsSepUn2(csvDfSepUn2); });
100
+ test('testDetectorsSepNt', async () => { await _testDetectorsSepNt(csvDfSepNt, '*'); });
101
+ test('testDetectorsSepPt', async () => { await _testDetectorsSepPt(csvDfSepPt, '-'); });
102
+ test('testDetectorsSepUn1', async () => { await _testDetectorsSepUn(csvDfSepUn1, '-'); });
103
+ test('testDetectorsSepUn2', async () => { await _testDetectorsSepUn(csvDfSepUn2, '/'); });
92
104
 
93
105
  test('testDetectorsSepMsaN1', async () => { await _testDetectorsSepMsaN1(csvDfSepMsaN1); });
106
+
107
+ test('testDetectorsSamplesFastaCsvPt', async () => { await _testDetectorsSamplesFastaCsvPt(); });
108
+ test('testDetectorsSamplesFastaFastaPt', async () => { await _testDetectorsSamplesFastaFastaPt(); });
94
109
  });
95
110
 
96
- // export async function _testDetectors(csvDf: string) {
97
- // const df: DG.DataFrame = DG.DataFrame.fromCsv(csvDf);
98
- // await grok.data.detectSemanticTypes(df);
99
- //
100
- // const col1: DG.Column = df.col('col1')!;
101
- // expect(col1.semType, null);
102
- // expect(col1.getTag(DG.TAGS.UNITS), null);
103
- // }
111
/**
 * Runs semantic type detection on a CSV table and expects that its 'col1' column
 * is NOT detected as a macromolecule.
 *
 * @param {string} csvDf CSV text with a 'col1' column.
 */
export async function _testDetectorsNegative(csvDf: string) {
  const table = DG.DataFrame.fromCsv(csvDf);
  await grok.data.detectSemanticTypes(table);

  const firstCol = table.col('col1')!;
  const detectedAsMacromolecule = firstCol.semType == mmSemType;
  expect(detectedAsMacromolecule, false);
}
104
118
 
105
119
  export async function _testDetectorsN1(csvDfN1: string) {
106
120
  const dfN1: DG.DataFrame = DG.DataFrame.fromCsv(csvDfN1);
107
121
  await grok.data.detectSemanticTypes(dfN1);
108
122
 
109
123
  const col: DG.Column = dfN1.col('seq')!;
110
- expect(col.semType, 'MACROMOLECULE');
124
+ expect(col.semType, mmSemType);
111
125
  expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ:NT');
112
126
  }
113
127
 
@@ -116,7 +130,7 @@ export async function _testDetectorsAA1(csvDfAA1: string) {
116
130
  await grok.data.detectSemanticTypes(dfAA1);
117
131
 
118
132
  const col: DG.Column = dfAA1.col('seq')!;
119
- expect(col.semType, 'MACROMOLECULE');
133
+ expect(col.semType, mmSemType);
120
134
  expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ:PT');
121
135
  }
122
136
 
@@ -125,7 +139,7 @@ export async function _testDetectorsMsaN1(csvDfMsaN1: string) {
125
139
  await grok.data.detectSemanticTypes(dfMsaN1);
126
140
 
127
141
  const col: DG.Column = dfMsaN1.col('seq')!;
128
- expect(col.semType, 'MACROMOLECULE');
142
+ expect(col.semType, mmSemType);
129
143
  expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ.MSA:NT');
130
144
  }
131
145
 
@@ -134,26 +148,38 @@ export async function _testDetectorsMsaAA1(csvDfMsaAA1: string) {
134
148
  await grok.data.detectSemanticTypes(dfMsaAA1);
135
149
 
136
150
  const col: DG.Column = dfMsaAA1.col('seq')!;
137
- expect(col.semType, 'MACROMOLECULE');
151
+ expect(col.semType, mmSemType);
138
152
  expect(col.getTag(DG.TAGS.UNITS), 'fasta:SEQ.MSA:PT');
139
153
  }
140
154
 
141
- export async function _testDetectorsSepUn1(csvDfSepUn1: string) {
142
- const dfSepUn1: DG.DataFrame = DG.DataFrame.fromCsv(csvDfSepUn1);
143
- await grok.data.detectSemanticTypes(dfSepUn1);
155
+ export async function _testDetectorsSepNt(csv: string, separator: string) {
156
+ const df: DG.DataFrame = DG.DataFrame.fromCsv(csv);
157
+ await grok.data.detectSemanticTypes(df);
144
158
 
145
- const col: DG.Column = dfSepUn1.col('seq')!;
146
- expect(col.semType, 'MACROMOLECULE');
147
- expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ:UN');
159
+ const col: DG.Column = df.col('seq')!;
160
+ expect(col.semType, mmSemType);
161
+ expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ:NT');
162
+ expect(col.getTag('separator'), separator);
163
+ }
164
+
165
+ export async function _testDetectorsSepPt(csv: string, separator: string) {
166
+ const df: DG.DataFrame = DG.DataFrame.fromCsv(csv);
167
+ await grok.data.detectSemanticTypes(df);
168
+
169
+ const col: DG.Column = df.col('seq')!;
170
+ expect(col.semType, mmSemType);
171
+ expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ:PT');
172
+ expect(col.getTag('separator'), separator);
148
173
  }
149
174
 
150
- export async function _testDetectorsSepUn2(csvDfSepUn2: string) {
151
- const dfSepUn2: DG.DataFrame = DG.DataFrame.fromCsv(csvDfSepUn2);
152
- await grok.data.detectSemanticTypes(dfSepUn2);
175
+ export async function _testDetectorsSepUn(csv: string, separator: string) {
176
+ const df: DG.DataFrame = DG.DataFrame.fromCsv(csv);
177
+ await grok.data.detectSemanticTypes(df);
153
178
 
154
- const col: DG.Column = dfSepUn2.col('seq')!;
155
- expect(col.semType, 'MACROMOLECULE');
179
+ const col: DG.Column = df.col('seq')!;
180
+ expect(col.semType, mmSemType);
156
181
  expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ:UN');
182
+ expect(col.getTag('separator'), separator);
157
183
  }
158
184
 
159
185
  export async function _testDetectorsSepMsaN1(csvDfSepMsaN1: string) {
@@ -161,6 +187,27 @@ export async function _testDetectorsSepMsaN1(csvDfSepMsaN1: string) {
161
187
  await grok.data.detectSemanticTypes(dfSepMsaN1);
162
188
 
163
189
  const col: DG.Column = dfSepMsaN1.col('seq')!;
164
- expect(col.semType, 'MACROMOLECULE');
190
+ expect(col.semType, mmSemType);
165
191
  expect(col.getTag(DG.TAGS.UNITS), 'separator:SEQ.MSA:NT');
166
192
  }
193
+
194
/**
 * Loads the sample_FASTA.csv sample, runs semantic type detection and checks that the
 * 'sequence' column is tagged 'fasta:SEQ:PT' with no separator tag.
 */
export async function _testDetectorsSamplesFastaCsvPt() {
  const csvText = await grok.dapi.files.readAsText('System:AppData/Bio/samples/sample_FASTA.csv');
  const table = DG.DataFrame.fromCsv(csvText);
  await grok.data.detectSemanticTypes(table);

  const seqCol = table.col('sequence')!;
  expect(seqCol.semType, mmSemType);
  expect(seqCol.getTag(DG.TAGS.UNITS), 'fasta:SEQ:PT');
  expect(seqCol.getTag('separator'), null);
}
204
+
205
/**
 * Loads the sample_FASTA.fasta sample through importFasta and checks that the
 * 'sequence' column of the first returned table is tagged 'fasta:SEQ:PT' with no separator tag.
 */
export async function _testDetectorsSamplesFastaFastaPt() {
  const fastaText = await grok.dapi.files.readAsText('System:AppData/Bio/samples/sample_FASTA.fasta');
  const [table] = importFasta(fastaText);

  const seqCol = table.col('sequence')!;
  expect(seqCol.semType, mmSemType);
  expect(seqCol.getTag(DG.TAGS.UNITS), 'fasta:SEQ:PT');
  expect(seqCol.getTag('separator'), null);
}
@@ -0,0 +1,34 @@
1
+ import {category, test} from '@datagrok-libraries/utils/src/test';
2
+ import {
3
+ _testMSAIsCorrect,
4
+ _testTableIsNotEmpty,
5
+ } from './utils';
6
+
7
+ import * as DG from 'datagrok-api/dg';
8
+ //import * as grok from 'datagrok-api/grok';
9
+
10
+ export const _package = new DG.Package();
11
+
12
+
13
category('MSA', async () => {
  //table = await grok.data.files.openTable('Demo:Files/bio/peptides.csv');

  // Input table with the plain sequences.
  const table: DG.DataFrame = DG.DataFrame.fromCsv(`seq
FWRWYVKHP
YNRWYVKHP
MWRSWYCKHP`);

  // The same sequences in aligned form; 'is_correct' re-aligns this column and compares the rows.
  const alignedSequencesColumn = DG.DataFrame.fromCsv(`seq
-F-W-R--W-Y-V-K-H-P
-Y-N-R--W-Y-V-K-H-P
-M-W-R-S-W-Y-C-K-H-P`).getCol('seq');

  test('test_table.is_not_empty', async () => {
    await _testTableIsNotEmpty(table);
  });

  test('is_correct', async () => {
    await _testMSAIsCorrect(alignedSequencesColumn);
  });
});
@@ -0,0 +1,26 @@
1
+ import * as DG from 'datagrok-api/dg';
2
+
3
+ import {expect} from '@datagrok-libraries/utils/src/test';
4
+ import {runKalign} from '../utils/multiple-sequence-alignment';
5
+
6
/**
 * Asserts that a table has at least one column and at least one row.
 *
 * @param {DG.DataFrame} table Table to check.
 */
export function _testTableIsNotEmpty(table: DG.DataFrame): void {
  const hasColumns = table.columns.length > 0;
  const hasRows = table.rowCount > 0;
  expect(hasColumns && hasRows, true);
}
14
+
15
+
16
/**
 * Re-aligns an already aligned column with kalign and asserts that every produced value
 * is either null or equal to the value in the same row of the input column.
 *
 * @export
 * @param {DG.Column} col Aligned sequences column.
 */
export async function _testMSAIsCorrect(col: DG.Column): Promise<void> {
  const realigned = await runKalign(col, true);
  const allRowsMatch = realigned.toList().every((value, row) => {
    return value == col.get(row) || value == null;
  });
  expect(allRowsMatch, true);
}
@@ -0,0 +1,62 @@
1
/** Column name constants; the names prefixed with '~' are presumably hidden columns — TODO confirm. */
export enum COLUMNS_NAMES {
  SPLIT_COL = '~split',
  ACTIVITY = '~activity',
  ACTIVITY_SCALED = 'activity_scaled',
  ALIGNED_SEQUENCE = '~aligned_sequence',
  AMINO_ACID_RESIDUE = 'AAR',
  POSITION = 'Pos',
  P_VALUE = 'pValue',
  MEAN_DIFFERENCE = 'Mean difference',
}

/** Category label constants. */
export enum CATEGORIES {
  OTHER = 'Other',
  ALL = 'All',
}

/** Tag name constants. */
export enum TAGS {
  AAR = 'AAR',
  POSITION = 'Pos',
  SEPARATOR = 'monomer-separator',
  SELECTION = 'selection',
}

/**
 * Semantic type name constants.
 * NOTE(review): `Macro_Molecule` breaks the UPPER_CASE naming of the other members; the
 * spelling is kept unchanged because other modules may reference it.
 */
export enum SEM_TYPES {
  AMINO_ACIDS = 'aminoAcids',
  ALIGNED_SEQUENCE = 'alignedSequence',
  ALIGNED_SEQUENCE_DIFFERENCE = 'alignedSequenceDifference',
  ACTIVITY = 'activity',
  ACTIVITY_SCALED = 'activityScaled',
  Macro_Molecule = 'Macromolecule',
}

/** String key 'stats'. */
export const STATS = 'stats';

/** String key 'embeddingStatus'. */
export const EMBEDDING_STATUS = 'embeddingStatus';

/** String key 'isPeptidesAnalysis'. */
export const PEPTIDES_ANALYSIS = 'isPeptidesAnalysis';

/** Flag name constants. */
export enum FLAGS {
  CELL_CHANGING = 'isCellChanging',
}
42
+
43
/** Maps a one-letter residue code (or '-') to the key of its group in `groupDescription`. */
export const aarGroups = {
  // PC
  R: 'PC', H: 'PC', K: 'PC',
  // NC
  D: 'NC', E: 'NC',
  // U
  S: 'U', T: 'U', N: 'U', Q: 'U',
  // SC
  C: 'SC', U: 'SC', G: 'SC', P: 'SC',
  // H
  A: 'H', V: 'H', I: 'H', L: 'H', M: 'H', F: 'H', Y: 'H', W: 'H',
  // gap / unknown
  '-': '-',
};

/** For every group key: a human-readable description and the residue codes that belong to it. */
export const groupDescription: {[key: string]: {'description': string, aminoAcids: string[]}} = {
  PC: {
    description: 'Positive Amino Acids, with Electrically Charged Side Chains',
    aminoAcids: ['R', 'H', 'K'],
  },
  NC: {
    description: 'Negative Amino Acids, with Electrically Charged Side Chains',
    aminoAcids: ['D', 'E'],
  },
  U: {
    description: 'Amino Acids with Polar Uncharged Side Chains',
    aminoAcids: ['S', 'T', 'N', 'Q'],
  },
  SC: {
    description: 'Special Cases',
    aminoAcids: ['C', 'U', 'G', 'P'],
  },
  H: {
    description: 'Amino Acids with Hydrophobic Side Chain',
    aminoAcids: ['A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W'],
  },
  '-': {
    description: 'Unknown Amino Acid',
    aminoAcids: ['-'],
  },
};
@@ -0,0 +1,107 @@
1
+ import * as DG from 'datagrok-api/dg';
2
+
3
+ //@ts-ignore
4
+ import Aioli from '@biowasm/aioli';
5
+
6
+ import {AlignedSequenceEncoder} from '@datagrok-libraries/bio/src/sequence-encoder';
7
+ import * as C from './constants';
8
+
9
/**
 * Serializes sequences as FASTA text: each sequence becomes a `>sampleN` header line
 * (N is 1-based) followed by the sequence on its own line.
 *
 * @param {string[]} sequences Input list of sequences.
 * @return {string} Fasta-formatted string.
 */
function _stringsToFasta(sequences: string[]): string {
  const records = sequences.map((seq, idx) => `>sample${idx + 1}\n${seq}\n`);
  return records.join('');
}
18
+
19
/**
 * Extracts array of sequences from simple fasta string.
 *
 * Each header line (a line starting with '>') opens a new record; all following lines up to
 * the next header are concatenated, so a sequence wrapped over several lines comes back as
 * one string. LF, CRLF and CR line endings are accepted. Text without any header line is
 * split into lines, as before.
 *
 * @param {string} fasta Fasta-formatted string.
 * @return {string[]} Output list of sequences, one per record.
 */
function _fastaToStrings(fasta: string): string[] {
  const lines = fasta.split(/\r\n|\r|\n/);
  // Header-less input: keep the legacy "one line = one sequence" behavior.
  if (!lines.some((line) => line.startsWith('>')))
    return fasta.split('\n');

  const records: string[] = [];
  let current: string[] | null = null; // lines of the record being read; null before the first header
  for (const line of lines) {
    if (line.startsWith('>')) {
      if (current !== null)
        records.push(current.join(''));
      current = [];
    } else if (current !== null) {
      current.push(line);
    }
  }
  if (current !== null)
    records.push(current.join(''));
  return records;
}
28
+
29
/**
 * Rewrites an aligned sequence so that every non-gap character is prefixed with '-',
 * while gap characters ('-') are kept as they are.
 *
 * @param {string} seq Source sequence.
 * @return {string} Formatted sequence.
 */
function _castAligned(seq: string): string {
  return Array.from(seq, (symbol) => (symbol == '-' ? symbol : `-${symbol}`)).join('');
}
43
+
44
/**
 * Applies `_castAligned` to every sequence of an alignment, preserving the order.
 *
 * @param {string[]} alignment List of aligned sequences.
 * @return {string[]} Formatted sequences.
 */
function _stringsToAligned(alignment: string[]): string[] {
  return Array.from(alignment, (seq) => _castAligned(seq));
}
59
+
60
/**
 * Aligns the sequences of a column with the kalign tool (version 3.3.1) run through Aioli.
 *
 * The sequences are written to `input.fa`, kalign writes `result.fasta`, and the result is
 * converted into a new string column named `msa(<source column name>)` with the
 * Macromolecule semantic type and an empty units tag.
 *
 * @param {DG.Column} col Column with sequences.
 * @param {boolean} isAligned Whether the column is aligned; if so, the values are cleaned
 *   and all '-' characters are removed before alignment.
 * @return {Promise<DG.Column>} Aligned sequences.
 */
export async function runKalign(col: DG.Column, isAligned = false) : Promise<DG.Column> {
  const rawValues = col.toList();
  const sequences = isAligned ?
    rawValues.map((seq: string) => AlignedSequenceEncoder.clean(seq).replace(/\-/g, '')) :
    rawValues;

  const fasta = _stringsToFasta(sequences);
  const CLI = await new Aioli({tool: 'kalign', version: '3.3.1', reinit: true});

  console.log(['fasta.length =', fasta.length]);

  await CLI.fs.writeFile('input.fa', fasta);
  const output = await CLI.exec('kalign input.fa -f fasta -o result.fasta');
  const buf = await CLI.cat('result.fasta');

  console.warn(output);

  // Keep at most as many records as were submitted.
  const alignedStrings = _fastaToStrings(buf).slice(0, sequences.length);
  const alignedCol = DG.Column.fromStrings(`msa(${col.name})`, _stringsToAligned(alignedStrings));
  alignedCol.setTag(DG.TAGS.UNITS, '');
  alignedCol.semType = C.SEM_TYPES.Macro_Molecule;
  return alignedCol;
}
94
+
95
/**
 * Probes how many sequences kalign can handle: runs `runKalign` on growing prefixes of the
 * column (about 100 steps) and logs for each prefix size whether the run succeeded or failed.
 * Failures are logged, not rethrown, so the probe always runs to the end.
 *
 * @param {DG.Column} col Column with sequences.
 */
export async function testMSAEnoughMemory(col: DG.Column): Promise<void> {
  const sequencesCount = col.length;
  // Use a whole-number step of at least 1: a fractional step (count / 100) produced repeated and
  // zero-sized prefixes for columns with fewer than 100 rows and fractional sizes in the log.
  const step = Math.max(1, Math.ceil(sequencesCount / 100));
  // The list does not change between iterations, so read it once.
  const sequences = col.toList();

  for (let size = step; size < sequencesCount; size += step) {
    try {
      await runKalign(DG.Column.fromStrings(col.name, sequences.slice(0, size)));
      console.log(`runKalign succeeded on ${size}`);
    } catch (error) {
      console.log(`runKalign failed on ${size} with '${error}'`);
    }
  }
}