@datagrok/bio 2.4.15 → 2.4.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "name": "Leonid Stolbov",
6
6
  "email": "lstolbov@datagrok.ai"
7
7
  },
8
- "version": "2.4.15",
8
+ "version": "2.4.17",
9
9
  "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
10
10
  "repository": {
11
11
  "type": "git",
@@ -14,9 +14,9 @@
14
14
  },
15
15
  "dependencies": {
16
16
  "@biowasm/aioli": "^3.1.0",
17
- "@datagrok-libraries/bio": "^5.28.4",
17
+ "@datagrok-libraries/bio": "^5.29.3",
18
18
  "@datagrok-libraries/chem-meta": "^1.0.1",
19
- "@datagrok-libraries/ml": "^6.3.16",
19
+ "@datagrok-libraries/ml": "^6.3.22",
20
20
  "@datagrok-libraries/tutorials": "^1.2.1",
21
21
  "@datagrok-libraries/utils": "^2.1.3",
22
22
  "cash-dom": "^8.0.0",
@@ -0,0 +1,289 @@
1
+ #!/usr/bin/env python3
2
+ # name: Sequence generator
3
+ # description: Create the model peptides/DNA sequences with peptides data
4
+ # language: python
5
+ # tags: template, demo
6
+ # input: int clusters = 1 [Number of superclusters]
7
+ # input: int num_sequences = 500 [Number of sequences in each supercluster]
8
+ # input: int motif_length = 12 [Average length of motif]
9
+ # input: int max_variants_position = 3 [Maximum number of different letters in conservative position in motif]
10
+ # input: int random_length = 3 [Average length of random sequence parts before and after motif]
11
+ # input: int dispersion = 2 [Variation of total sequence length]
12
+ # input: string alphabet_key = 'PT' [Sequence alphabet: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma]
13
+ # input: bool disable_cliffs = False [Disable generation of cliffs]
14
+ # input: double cliff_probability = 0.01 [Probability to make activity cliff of a sequence]
15
+ # input: double cliff_strength = 4.0 [Strength of cliff]
16
+ # output: dataframe sequences
17
+
18
+ import random
19
+ import argparse
20
+ import sys
21
+
22
+ from typing import List, Tuple, Dict, Iterator, Any
23
+
24
# A sequence alphabet: the list of letters sequences are built from.
alphabet_type = List[str]

# Letters allowed at one motif position; the single entry "?" stands for any symbol.
letter_choice_type = List[str]
# A motif template: one letter choice per motif position.
motif_template_type = List[letter_choice_type]

# Record yielded by generate_cluster: (sequence number, sequence, activity, is_cliff).
sequence_record_type = Tuple[int, str, float, bool]
# Record collected by generate_sequences:
# (cluster number, sequence id, sequence, activity, is_cliff).
sequence_record_cluster_type = Tuple[int, str, str, float, bool]

# Built-in alphabets keyed by name; each value is a comma-separated list of letters.
alphabets: Dict[str, str] = {
    "PT": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
    "DNA": "A,T,G,C",
    "RNA": "A,U,G,C",
}
37
+
38
+
39
def mean_range(mean: int, disp: int) -> int:
    """Return a random integer at most ``disp`` away from ``mean``.

    The value is drawn uniformly from ``[max(mean - disp, 0), mean + disp]``,
    both ends inclusive, so the lower end is never negative.
    """
    lower_bound = max(mean - disp, 0)
    upper_bound = mean + disp
    return random.randint(lower_bound, upper_bound)
41
+
42
+
43
def generate_motif_template(
    motif_length: int,
    alphabet: alphabet_type,
    max_variants_cluster: int,
    prob_any: float = 0.2,
) -> motif_template_type:
    """Build a random motif template: one list of allowed letters per position.

    Args:
        motif_length: Number of positions in the motif.
        alphabet: Letters to draw conserved letters from.
        max_variants_cluster: Maximum number of different letters allowed at a
            conserved position (must be at least 1).
        prob_any: Probability that an inner position accepts any symbol; such a
            position is stored as ``["?"]``. The first and the last positions
            are always conserved.

    Returns:
        A list with ``motif_length`` entries, each a list of distinct letters
        (or ``["?"]``).

    Raises:
        ValueError: If ``max_variants_cluster`` is smaller than 1 while at
            least one position has to be generated.
    """
    if motif_length > 0 and max_variants_cluster < 1:
        raise ValueError("max_variants_cluster must be at least 1")

    motif_template: motif_template_type = []
    last_position = motif_length - 1
    for position in range(motif_length):
        is_inner = 0 < position < last_position
        # random.random() is consulted for inner positions only, so motif ends
        # can never become "any symbol" positions.
        if is_inner and random.random() <= prob_any:
            letters = ["?"]  # this stands for any symbol
        else:
            n_variants = random.randrange(max_variants_cluster) + 1
            drawn = (random.choice(alphabet) for _ in range(n_variants))
            # Letters are drawn with replacement; collapse repeats (keeping the
            # draw order) so a position never reads like "[AA]" and every
            # allowed letter is equally likely when a motif is generated.
            letters = list(dict.fromkeys(drawn))
        motif_template.append(letters)
    return motif_template
59
+
60
+
61
def generate_motif(template: motif_template_type, alphabet: alphabet_type) -> str:
    """Draw one concrete motif string from ``template``.

    For every position one letter is picked at random from the position's
    letter list; a position containing ``"?"`` picks from the whole
    ``alphabet`` instead.
    """
    chosen = []
    for letters in template:
        candidates = alphabet if "?" in letters else letters
        chosen.append(random.choice(candidates))
    return "".join(chosen)
64
+
65
+
66
def motif_notation(motif_template: motif_template_type) -> str:
    """Render a motif template as a compact string.

    A position with exactly one entry is written as that entry; any other
    position is written as its entries concatenated inside square brackets,
    e.g. ``[["A"], ["C", "D"]]`` becomes ``"A[CD]"``.
    """
    codes = []
    for letter_choice in motif_template:
        if len(letter_choice) == 1:
            codes.append(letter_choice[0])
        else:
            codes.append(f"[{''.join(letter_choice)}]")
    return "".join(codes)
74
+
75
+
76
def generate_random(n: int, alphabet: alphabet_type) -> str:
    """Return ``n`` letters drawn independently at random from ``alphabet``.

    A non-positive ``n`` yields an empty string.
    """
    picked = (random.choice(alphabet) for _ in range(n))
    return "".join(picked)
78
+
79
+
80
def make_cliff(motif_template: motif_template_type, alphabet: alphabet_type, motif: str) -> str:
    """Mutate one conserved letter of ``motif`` to a letter the template forbids there.

    A start position is chosen at random and positions are scanned circularly
    from it. The first position that is conserved (not ``"?"``) and still has
    at least one alphabet letter outside its allowed set is mutated.

    Args:
        motif_template: Template the motif was generated from.
        alphabet: Letters the replacement is chosen from.
        motif: Motif string to mutate; one character per template position.

    Returns:
        A copy of ``motif`` with exactly one character replaced.

    Raises:
        ValueError: If the template is empty or no position can be mutated
            (every position is ``"?"`` or already allows the whole alphabet).
    """
    n_positions = len(motif_template)
    if n_positions == 0:
        raise ValueError("Cannot make a cliff: motif template is empty")

    start = random.randrange(n_positions)
    for offset in range(n_positions):
        pos = (start + offset) % n_positions
        conserved = motif_template[pos]
        if "?" in conserved:
            continue
        # Keep the alphabet order (instead of going through a set) so results
        # are reproducible for a fixed random seed.
        outlier_letters = list(
            dict.fromkeys(letter for letter in alphabet if letter not in conserved)
        )
        if outlier_letters:
            return motif[:pos] + random.choice(outlier_letters) + motif[pos + 1:]

    raise ValueError("Cannot make a cliff: no conserved position can be mutated")
87
+
88
+
89
def generate_cluster(
    n_sequences: int,
    motif_length: int,
    prefix_length: int,
    suffix_length: int,
    max_variants_position: int,
    make_cliffs: bool,
    alphabet: alphabet_type,
    cliff_probability: float,
    cliff_strength: float,
) -> Iterator[sequence_record_type]:
    """Generate the sequences of one cluster, all sharing a single motif template.

    Every sequence is ``random prefix + motif + random suffix``. The cluster
    gets a random average activity and dispersion; a regular sequence receives
    a Gaussian activity around that average.

    When ``make_cliffs`` is set, each generated sequence becomes an activity
    cliff with probability ``cliff_probability``. A cliff is emitted as a pair
    of records: the sequence itself and a copy with one conserved motif letter
    mutated. Their activities are placed symmetrically below and above the
    cluster average, at a distance scaled by ``cliff_strength``.

    Args:
        n_sequences: Number of base sequences to generate (each cliff adds one
            extra record).
        motif_length: Number of positions in the motif.
        prefix_length: Length of the random part before the motif.
        suffix_length: Length of the random part after the motif.
        max_variants_position: Maximum number of different letters at a
            conserved motif position.
        make_cliffs: Whether activity cliffs are generated at all.
        alphabet: Letters the sequences are built from.
        cliff_probability: Probability that a sequence becomes a cliff.
        cliff_strength: Multiplier for the activity gap of a cliff pair.

    Yields:
        ``(sequence number, sequence, activity, is_cliff)`` records with
        consecutive, unique sequence numbers starting at 0.
    """
    motif_template = generate_motif_template(motif_length, alphabet, max_variants_position)

    activity_average = random.random() * 10
    activity_dispersion = random.random()
    sys.stderr.write(f"Motif template: {motif_notation(motif_template)}\n")

    # Running record number. It is kept separately from the loop variable so
    # the extra record of a cliff pair never shares its number with the next
    # sequence.
    seq_id = 0
    for _ in range(n_sequences):
        activity = random.gauss(activity_average, activity_dispersion)

        motif = generate_motif(motif_template, alphabet)
        prefix = generate_random(prefix_length, alphabet)
        suffix = generate_random(suffix_length, alphabet)
        seq = prefix + motif + suffix

        is_cliff = bool(make_cliffs and (random.random() <= cliff_probability))
        if not is_cliff:
            yield (seq_id, seq, activity, False)
            seq_id += 1
            continue

        # Making activity cliff: same prefix/suffix, one conserved letter mutated
        cliff_motif = make_cliff(motif_template, alphabet, motif)
        cliff_seq = prefix + cliff_motif + suffix

        # Both members of the pair get recalculated activities; they must be
        # known before the first member is yielded.
        cliff_disp = activity_dispersion * cliff_strength * (0.5 + random.random())
        yield (seq_id, seq, activity_average - cliff_disp, True)
        seq_id += 1
        yield (seq_id, cliff_seq, activity_average + cliff_disp, True)
        seq_id += 1
133
+
134
+
135
def generate_sequences(
    n_clusters: int,
    n_sequences: int,
    average_motif_length: int,
    max_variants_position: int,
    average_random_length: int,
    dispersion: int,
    alphabet: alphabet_type,
    make_cliffs: bool,
    cliff_probability: float,
    cliff_strength: float,
) -> Tuple[List[str], List[sequence_record_cluster_type]]:
    """Generate sequences for several clusters and collect them as table rows.

    For each cluster the motif length and the lengths of the random prefix and
    suffix are drawn around the requested averages (varied by ``dispersion``),
    then the cluster's sequences are produced by ``generate_cluster``.

    Args:
        n_clusters: Number of clusters to generate.
        n_sequences: Number of sequences requested per cluster.
        average_motif_length: Average motif length.
        max_variants_position: Maximum number of different letters at a
            conserved motif position.
        average_random_length: Average length of each random part (before and
            after the motif).
        dispersion: Variation of the motif length and of the total length of
            the random parts.
        alphabet: Letters the sequences are built from.
        make_cliffs: Whether activity cliffs are generated.
        cliff_probability: Probability that a sequence becomes a cliff.
        cliff_strength: Multiplier for the activity gap of a cliff pair.

    Returns:
        ``(headers, rows)`` where every row is
        ``(cluster number, sequence id, sequence, activity, is_cliff)``.
    """
    headers: List[str] = ["cluster", "sequence_id", "sequence", "activity", "is_cliff"]
    sequences: List[sequence_record_cluster_type] = []

    for n_cluster in range(n_clusters):
        motif_length = mean_range(average_motif_length, dispersion)

        # Use the `dispersion` parameter here: the command-line namespace
        # `args` does not exist when the script is started by Datagrok.
        random_length = mean_range(average_random_length * 2, dispersion)
        # The prefix cannot take more than the whole random part, otherwise
        # the suffix length would become negative.
        prefix_length = min(mean_range(average_random_length, dispersion // 2), random_length)
        suffix_length = random_length - prefix_length

        sys.stderr.write(f"Generating sequences for cluster {n_cluster}\n")
        for n_seq, seq, activity, is_cliff in generate_cluster(
            n_sequences,
            motif_length,
            prefix_length,
            suffix_length,
            max_variants_position,
            make_cliffs,
            alphabet,
            cliff_probability,
            cliff_strength,
        ):
            sequences.append((n_cluster, f"c{n_cluster}_s{n_seq}", seq, activity, is_cliff))
    return headers, sequences
171
+
172
+
173
def parse_command_line_args() -> Any:
    """Parse the generator's command-line options from ``sys.argv``.

    Returns:
        The ``argparse`` namespace with the attributes ``clusters``,
        ``sequences``, ``motif_length``, ``random_length``, ``dispersion``,
        ``alphabet``, ``max_variants_position``, ``cliff_probability``,
        ``cliff_strength`` and ``disable_cliffs``.
    """

    def str_to_bool(value: str) -> bool:
        # argparse's `type=bool` treats every non-empty string (including
        # "False") as True, so the text is interpreted explicitly.
        normalized = value.strip().lower()
        if normalized in ("1", "true", "t", "yes", "y", "on"):
            return True
        if normalized in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(
        prog="MotifSequencesGenerator",
        description="The program generates set of sequences containing sequence motifs "
        "for SAR fucntionality testing",
        epilog="Utility support: Gennadii Zakharov",
    )

    parser.add_argument("-c", "--clusters", type=int, default=1, help="Number of superclusters")
    parser.add_argument(
        "-s",
        "--sequences",
        type=int,
        default=500,
        help="Number of sequences in each supercluster",
    )
    # Short options are plain "-m", "-r", "-d" (no trailing comma in the flag).
    parser.add_argument("-m", "--motif-length", type=int, default=12, help="Average length of motif")

    parser.add_argument(
        "-r",
        "--random-length",
        type=int,
        default=3,
        help="Average length of random sequence parts before and after motif",
    )
    parser.add_argument(
        "-d",
        "--dispersion",
        type=int,
        default=2,
        help="Variation of total sequence length",
    )

    available_alphabets = ",".join(list(alphabets.keys()) + ["custom"])
    parser.add_argument(
        "--alphabet",
        type=str,
        default=list(alphabets.keys())[0],
        help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated by comma",
    )
    parser.add_argument(
        "--max-variants-position",
        type=int,
        default=3,
        help="Maximum number of different letters in conservative position in motif",
    )
    parser.add_argument(
        "--cliff-probability",
        type=float,
        default=0.01,
        help="Probability to make activity cliff of a sequence",
    )
    parser.add_argument(
        "--cliff-strength",
        type=float,
        default=4.0,
        help="Strength of cliff",
    )
    # Accepts both the bare flag (`--disable-cliffs`) and an explicit value
    # (`--disable-cliffs true|false`).
    parser.add_argument(
        "--disable-cliffs",
        type=str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Disable generation of cliffs",
    )

    command_line_args = parser.parse_args()

    return command_line_args
241
+
242
+
243
+ # ====================================================================================
244
+
245
# Script driver: obtains the generator parameters, runs the generator and
# exports the result either as a pandas dataframe or as tab-separated text.

# True when a variable named `clusters` already exists at module level.
# NOTE(review): presumably Datagrok defines the `# input:` variables from the
# script header before running this code - confirm against the platform docs.
grok = "clusters" in globals()

if not grok:
    # We are not in Datagrok - need to parse command line arguments
    args = parse_command_line_args()
    clusters = args.clusters
    num_sequences = args.sequences
    motif_length = args.motif_length
    max_variants_position = args.max_variants_position
    random_length = args.random_length
    dispersion = args.dispersion
    alphabet_key = args.alphabet
    disable_cliffs = args.disable_cliffs
    cliff_probability = args.cliff_probability
    cliff_strength = args.cliff_strength

# A known alphabet name selects a built-in alphabet; any other value is itself
# treated as a comma-separated list of letters (custom alphabet).
alphabet: alphabet_type = alphabets[alphabet_key].split(",") if alphabet_key in alphabets else alphabet_key.split(",")

# Running sequence generator
header, data = generate_sequences(
    clusters,
    num_sequences,
    motif_length,
    max_variants_position,
    random_length,
    dispersion,
    alphabet,
    not disable_cliffs,  # the generator takes "make cliffs", the input is "disable cliffs"
    cliff_probability,
    cliff_strength,
)

if grok:
    # Exporting data to Datagrok as a pandas dataframe
    import pandas as pd

    # `sequences` is the output variable declared in the script header.
    sequences = pd.DataFrame.from_records(data, columns=header)
else:
    # Writing results to stdout - no need to work with big and heavy Pandas
    import csv

    csv_writer = csv.writer(sys.stdout, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(header)
    for line in data:
        csv_writer.writerow(line)
@@ -43,8 +43,8 @@ export async function getSimilaritiesMatrix(
43
43
  }
44
44
 
45
45
  export async function getChemSimilaritiesMatrix(dim: number, seqCol: DG.Column,
46
- df: DG.DataFrame, colName: string, simArr: DG.Column[])
47
- : Promise<DG.Column[]> {
46
+ df: DG.DataFrame, colName: string, simArr: (DG.Column | null)[])
47
+ : Promise<(DG.Column | null)[]> {
48
48
  if (seqCol.version !== seqCol.temp[MONOMERIC_COL_TAGS.LAST_INVALIDATED_VERSION])
49
49
  await invalidateMols(seqCol, false);
50
50
  const fpDf = DG.DataFrame.create(seqCol.length);
@@ -12,15 +12,17 @@ import {updateDivInnerHTML} from '../utils/ui-utils';
12
12
  import {Subject} from 'rxjs';
13
13
 
14
14
  export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
15
+ diverseColumnLabel: string | null; // Use postfix Label to prevent activating table column selection editor
16
+
15
17
  renderMolIds: number[] | null = null;
16
18
  columnNames = [];
17
19
  computeCompleted = new Subject<boolean>();
18
20
 
19
21
  constructor() {
20
22
  super('diversity');
23
+ this.diverseColumnLabel = this.string('diverseColumnLabel', null);
21
24
  }
22
25
 
23
-
24
26
  async render(computeData = true): Promise<void> {
25
27
  if (!this.beforeRender())
26
28
  return;
@@ -29,14 +31,15 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
29
31
  const monomericMols = await getMonomericMols(this.moleculeColumn);
30
32
  //need to create df to calculate fingerprints
31
33
  const monomericMolsDf = DG.DataFrame.fromColumns([monomericMols]);
32
- this.renderMolIds =
33
- await grok.functions.call('Chem:callChemDiversitySearch', {
34
+ this.renderMolIds = await grok.functions.call('Chem:callChemDiversitySearch', {
34
35
  col: monomericMols,
35
36
  metricName: this.distanceMetric,
36
37
  limit: this.limit,
37
38
  fingerprint: this.fingerprint
38
39
  });
39
- const resCol = DG.Column.string('sequence', this.renderMolIds!.length)
40
+ const diverseColumnName: string = this.diverseColumnLabel != null ? this.diverseColumnLabel :
41
+ `diverse (${this.moleculeColumnName})`;
42
+ const resCol = DG.Column.string(diverseColumnName, this.renderMolIds!.length)
40
43
  .init((i) => this.moleculeColumn?.get(this.renderMolIds![i]));
41
44
  resCol.semType = DG.SEMTYPE.MACROMOLECULE;
42
45
  this.tags.forEach((tag) => resCol.setTag(tag, this.moleculeColumn!.getTag(tag)));
@@ -11,13 +11,15 @@ import {Subject} from 'rxjs';
11
11
  import {TAGS as bioTAGS, getSplitter} from '@datagrok-libraries/bio/src/utils/macromolecule';
12
12
 
13
13
  export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
14
+ cutoff: number;
14
15
  hotSearch: boolean;
16
+ similarColumnLabel: string | null; // Use postfix Label to prevent activating table column selection editor
17
+
15
18
  sketchedMolecule: string = '';
16
19
  curIdx: number = 0;
17
20
  molCol: DG.Column | null = null;
18
21
  idxs: DG.Column | null = null;
19
22
  scores: DG.Column | null = null;
20
- cutoff: number;
21
23
  gridSelect: boolean = false;
22
24
  targetMoleculeIdx: number = 0;
23
25
  computeCompleted = new Subject<boolean>();
@@ -26,6 +28,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
26
28
  super('similarity');
27
29
  this.cutoff = this.float('cutoff', 0.01, {min: 0, max: 1});
28
30
  this.hotSearch = this.bool('hotSearch', true);
31
+ this.similarColumnLabel = this.string('similarColumnLabel', null);
29
32
  }
30
33
 
31
34
  init(): void {
@@ -54,7 +57,9 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
54
57
  });
55
58
  this.idxs = df.getCol('indexes');
56
59
  this.scores = df.getCol('score');
57
- this.molCol = DG.Column.string('sequence',
60
+ const similarColumnName: string = this.similarColumnLabel != null ? this.similarColumnLabel :
61
+ `similar (${this.moleculeColumnName})`;
62
+ this.molCol = DG.Column.string(similarColumnName,
58
63
  this.idxs!.length).init((i) => this.moleculeColumn?.get(this.idxs?.get(i)));
59
64
  this.molCol.semType = DG.SEMTYPE.MACROMOLECULE;
60
65
  this.tags.forEach((tag) => this.molCol!.setTag(tag, this.moleculeColumn!.getTag(tag)));
@@ -6,6 +6,7 @@ import {Matrix} from '@datagrok-libraries/utils/src/type-declarations';
6
6
  import BitArray from '@datagrok-libraries/utils/src/bit-array';
7
7
  import {ISequenceSpaceParams} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
8
8
  import {invalidateMols, MONOMERIC_COL_TAGS} from '../substructure-search/substructure-search';
9
+ import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
9
10
  import * as grok from 'datagrok-api/grok';
10
11
 
11
12
  export interface ISequenceSpaceResult {
@@ -53,6 +54,23 @@ export async function sequenceSpaceByFingerprints(spaceParams: ISequenceSpacePar
53
54
  return result;
54
55
  }
55
56
 
57
/**
 * Builds the sequence-space embedding for the column given in `spaceParams`.
 *
 * When the units handler reports the column as FASTA, dimensionality reduction runs
 * directly on the sequence strings, using the distance function the handler names.
 * Any other column goes through the fingerprint-based implementation.
 *
 * @param {ISequenceSpaceParams} spaceParams Column, method name, axes names and options.
 * @return {Promise<ISequenceSpaceResult>} Distance data and one coordinate column per axis name.
 */
export async function getSequenceSpace(spaceParams: ISequenceSpaceParams): Promise<ISequenceSpaceResult> {
  const uh = new UnitsHandler(spaceParams.seqCol);
  if (!uh.isFasta())
    return await sequenceSpaceByFingerprints(spaceParams);

  const distanceFName = uh.getDistanceFunctionName();
  const sequenceSpaceResult = await reduceDimensinalityWithNormalization(
    spaceParams.seqCol.toList(),
    spaceParams.methodName,
    distanceFName,
    spaceParams.options);
  // The i-th embedding array becomes the column named by the i-th axis name.
  const cols: DG.Column[] = spaceParams.embedAxesNames.map(
    (name: string, index: number) => DG.Column.fromFloat32Array(name, sequenceSpaceResult.embedding[index]));
  return {distance: sequenceSpaceResult.distance, coordinates: new DG.ColumnList(cols)};
}
56
74
 
57
75
  export function getEmbeddingColsNames(df: DG.DataFrame) {
58
76
  const axes = ['Embed_X', 'Embed_Y'];
@@ -6,8 +6,10 @@ import {_package} from '../package';
6
6
  import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
7
7
  import {delay} from '@datagrok-libraries/utils/src/test';
8
8
  import {handleError} from './utils';
9
+ import {SequenceDiversityViewer} from '../analysis/sequence-diversity-viewer';
10
+ import {SequenceSimilarityViewer} from '../analysis/sequence-similarity-viewer';
9
11
 
10
- const dataFn = 'data/sample_FASTA_DNA.csv';
12
+ const dataFn: string = 'data/sample_FASTA_DNA.csv';
11
13
 
12
14
  export async function demoBio01UI() {
13
15
  let view: DG.TableView;
@@ -17,21 +19,34 @@ export async function demoBio01UI() {
17
19
  const demoScript = new DemoScript('Demo', 'Sequence similarity / diversity search');
18
20
  await demoScript
19
21
  .step(`Loading DNA notation 'fasta'`, async () => {
22
+ grok.shell.windows.showContextPanel = false;
23
+ grok.shell.windows.showProperties = false;
24
+
20
25
  df = await _package.files.readCsv(dataFn);
21
26
  view = grok.shell.addTableView(df);
27
+
28
+ view.grid.columns.byName('id')!.width = 0;
29
+ view.grid.columns.byName('sequence')!.width = 500;
30
+ // TODO: Fix column width
22
31
  }, {
23
32
  description: `Load dataset with macromolecules of 'fasta' notation, 'DNA' alphabet.`,
24
- delay: 1600
33
+ delay: 1200
25
34
  })
26
35
  .step('Sequence similarity search', async () => {
27
- const simViewer = await df.plot.fromType('Sequence Similarity Search') as DG.Viewer;
36
+ const simViewer = await df.plot.fromType('Sequence Similarity Search', {
37
+ moleculeColumnName: 'sequence',
38
+ similarColumnLabel: 'Similar to current',
39
+ }) as SequenceSimilarityViewer;
28
40
  view.dockManager.dock(simViewer, DG.DOCK_TYPE.RIGHT, null, 'Similarity search', 0.35);
29
41
  }, {
30
42
  description: `Add 'Sequence Similarity Search' viewer.`,
31
43
  delay: 1600
32
44
  })
33
45
  .step('Sequence diversity search', async () => {
34
- const divViewer = await df.plot.fromType('Sequence Diversity Search') as DG.Viewer;
46
+ const divViewer = await df.plot.fromType('Sequence Diversity Search', {
47
+ moleculeColumnName: 'sequence',
48
+ diverseColumnLabel: 'Top diverse sequences of all data'
49
+ }) as SequenceDiversityViewer;
35
50
  view.dockManager.dock(divViewer, DG.DOCK_TYPE.DOWN, null, 'Diversity search', 0.27);
36
51
  }, {
37
52
  description: `Add 'Sequence Deversity Search' viewer.`,
@@ -36,6 +36,9 @@ export async function demoBio01aUI() {
36
36
  ]);
37
37
  view = grok.shell.addTableView(df);
38
38
  view.grid.props.rowHeight = 22;
39
+
40
+ grok.shell.windows.showContextPanel = false;
41
+ grok.shell.windows.showProperties = false;
39
42
  }, {
40
43
  description: `Load dataset with macromolecules of 'fasta' notation, 'DNA' alphabet.`,
41
44
  delay: 1600,
@@ -29,6 +29,9 @@ export async function demoBio01bUI() {
29
29
  const demoScript = new DemoScript('Demo', '');
30
30
  await demoScript
31
31
  .step(`Loading DNA notation \'fasta\'`, async () => {
32
+ grok.shell.windows.showContextPanel = false;
33
+ grok.shell.windows.showProperties = false;
34
+
32
35
  [df, treeHelper, dendrogramSvc] = await Promise.all([
33
36
  _package.files.readCsv(dataFn),
34
37
  getTreeHelper(),
@@ -28,6 +28,9 @@ export async function demoBio05UI(): Promise<void> {
28
28
  await demoScript
29
29
  .step(`Loading peptides notation 'HELM'`, async () => {
30
30
  view = grok.shell.addTableView(df = await _package.files.readCsv(helmFn));
31
+
32
+ grok.shell.windows.showContextPanel = false;
33
+ grok.shell.windows.showProperties = false;
31
34
  }, {
32
35
  description: 'Load dataset with macromolecules of \'Helm\' notation.',
33
36
  delay: 1600,
@@ -44,15 +47,6 @@ export async function demoBio05UI(): Promise<void> {
44
47
  description: 'Multiple sequence alignment (MSA) performed with PepSeA tool operating on non-natural aminoacids as well.',
45
48
  delay: 1600,
46
49
  })
47
- .step('Composition analysis on MSA results', async () => {
48
- wlViewer = await df.plot.fromType('WebLogo', {
49
- sequenceColumnName: msaHelmColName
50
- }) as DG.Viewer & IWebLogoViewer;
51
- view.dockManager.dock(wlViewer, DG.DOCK_TYPE.DOWN, null, 'Composition analysis', 0.2);
52
- }, {
53
- description: 'Composition analysis allows to reveal functional features of sequences like motifs, or variable loops.',
54
- delay: 1600,
55
- })
56
50
  .step('Building sequence space', async () => {
57
51
  const method: string = 'UMAP';
58
52
  ssViewer = (await sequenceSpaceTopMenu(df, msaHelmCol,
@@ -62,6 +56,16 @@ export async function demoBio05UI(): Promise<void> {
62
56
  description: 'Reduce sequence space dimensionality to display on 2D representation.',
63
57
  delay: 1600
64
58
  })
59
+ .step('Composition analysis on MSA results', async () => {
60
+ wlViewer = await df.plot.fromType('WebLogo', {
61
+ sequenceColumnName: msaHelmColName,
62
+ maxHeight: 50,
63
+ }) as DG.Viewer & IWebLogoViewer;
64
+ view.dockManager.dock(wlViewer, DG.DOCK_TYPE.DOWN, null, 'Composition analysis', 0.2);
65
+ }, {
66
+ description: 'Composition analysis allows to reveal functional features of sequences like motifs, or variable loops.',
67
+ delay: 1600,
68
+ })
65
69
  .start();
66
70
  } catch (err: any) {
67
71
  handleError(err);
package/src/package.ts CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  } from './utils/cell-renderer';
10
10
  import {VdRegionsViewer} from './viewers/vd-regions-viewer';
11
11
  import {SequenceAlignment} from './seq_align';
12
- import {getEmbeddingColsNames, sequenceSpaceByFingerprints} from './analysis/sequence-space';
12
+ import {getEmbeddingColsNames, sequenceSpaceByFingerprints, getSequenceSpace} from './analysis/sequence-space';
13
13
  import {getActivityCliffs} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
14
14
  import {
15
15
  createLinesGrid,
@@ -49,6 +49,7 @@ import {demoBio01bUI} from './demo/bio01b-hierarchical-clustering-and-activity-c
49
49
  import {demoBio05UI} from './demo/bio05-helm-msa-sequence-space';
50
50
  import {checkInputColumnUI} from './utils/check-input-column';
51
51
  import {multipleSequenceAlignmentUI} from './utils/multiple-sequence-alignment-ui';
52
+ import { runKalign } from './utils/multiple-sequence-alignment';
52
53
 
53
54
  export const _package = new DG.Package();
54
55
 
@@ -286,19 +287,23 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column,
286
287
  'separator': macroMolecule.getTag(bioTAGS.separator),
287
288
  'alphabet': macroMolecule.getTag(bioTAGS.alphabet),
288
289
  };
290
+ const uh = new UnitsHandler(macroMolecule);
291
+ let columnDistanceMetric = 'Tanimoto';
292
+ if (uh.isFasta())
293
+ columnDistanceMetric = uh.getDistanceFunctionName();
289
294
  const sp = await getActivityCliffs(
290
295
  df,
291
296
  macroMolecule,
292
297
  null,
293
298
  axesNames,
294
- 'Activity cliffs',
299
+ columnDistanceMetric,
295
300
  activities,
296
301
  similarity,
297
302
  'Tanimoto',
298
303
  methodName,
299
304
  DG.SEMTYPE.MACROMOLECULE,
300
305
  tags,
301
- sequenceSpaceByFingerprints,
306
+ getSequenceSpace,
302
307
  getChemSimilaritiesMatrix,
303
308
  createTooltipElement,
304
309
  createPropPanelElement,
@@ -349,7 +354,7 @@ export async function sequenceSpaceTopMenu(table: DG.DataFrame, macroMolecule: D
349
354
  embedAxesNames: embedColsNames,
350
355
  options: options
351
356
  };
352
- const sequenceSpaceRes = await sequenceSpaceByFingerprints(chemSpaceParams);
357
+ const sequenceSpaceRes = await getSequenceSpace(chemSpaceParams);
353
358
  const embeddings = sequenceSpaceRes.coordinates;
354
359
  for (const col of embeddings) {
355
360
  const listValues = col.toList();
@@ -411,10 +416,21 @@ export async function toAtomicLevel(df: DG.DataFrame, macroMolecule: DG.Column):
411
416
  //top-menu: Bio | Alignment | MSA...
412
417
  //name: MSA...
413
418
  //tags: bio, panel
414
- export function multipleSequenceAlignmentAny(): void {
419
+ export function multipleSequenceAlignmentDialog(): void {
415
420
  multipleSequenceAlignmentUI();
416
421
  }
417
422
 
423
+ //name: Multiple Sequence Alignment
424
+ //description: Multiple sequence alignment
425
+ //tags: bio
426
+ //input: column sequenceCol {semType: Macromolecule}
427
+ //input: column clustersCol
428
+ //output: column result
429
export async function alignSequences(sequenceCol: DG.Column<string> | null = null,
  clustersCol: DG.Column | null = null): Promise<DG.Column<string>> {
  // Thin wrapper: forwards both columns to multipleSequenceAlignmentUI as one options
  // object, with sequenceCol passed under the `col` key.
  return multipleSequenceAlignmentUI({col: sequenceCol, clustersCol});
}
433
+
418
434
  //top-menu: Bio | Structure | Composition Analysis
419
435
  //name: Composition Analysis
420
436
  //meta.icon: files/icons/composition-analysis.svg
@@ -15,7 +15,7 @@ seq3,
15
15
  seq4`;
16
16
 
17
17
  test('testMsaPos', async () => {
18
- const func: DG.Func = DG.Func.find({package: 'Bio', name: 'multipleSequenceAlignmentAny'})[0];
18
+ const func: DG.Func = DG.Func.find({package: 'Bio', name: 'multipleSequenceAlignmentDialog'})[0];
19
19
  const funcInputColumnProperty: DG.Property = func.inputs.find((i) => i.name == 'sequence')!;
20
20
 
21
21
  const k = 11;
@@ -67,7 +67,7 @@ seq4`;
67
67
  });
68
68
 
69
69
  test('testGetActionFunctionMeta', async () => {
70
- const func: DG.Func = DG.Func.find({package: 'Bio', name: 'multipleSequenceAlignmentAny'})[0];
70
+ const func: DG.Func = DG.Func.find({package: 'Bio', name: 'multipleSequenceAlignmentDialog'})[0];
71
71
  const sequenceInput: DG.Property = func.inputs.find((i) => i.name == 'sequence')!;
72
72
  const k = 11;
73
73
  });
@@ -138,7 +138,7 @@ async function _testMSAOnColumn(
138
138
  if (alphabet)
139
139
  expect(srcSeqCol.getTag(bioTAGS.alphabet), alphabet);
140
140
 
141
- const msaSeqCol = await multipleSequenceAlignmentUI(srcSeqCol, pepseaMethod);
141
+ const msaSeqCol = await multipleSequenceAlignmentUI({col: srcSeqCol, pepsea: {method: pepseaMethod}});
142
142
  expect(msaSeqCol.semType, DG.SEMTYPE.MACROMOLECULE);
143
143
  expect(msaSeqCol.getTag(DG.TAGS.UNITS), tgtNotation);
144
144
  expect(msaSeqCol.getTag(bioTAGS.aligned), ALIGNMENT.SEQ_MSA);
@@ -3,7 +3,7 @@ import * as DG from 'datagrok-api/dg';
3
3
 
4
4
  import {after, before, category, delay, expect, test} from '@datagrok-libraries/utils/src/test';
5
5
 
6
- import {importFasta, multipleSequenceAlignmentAny} from '../package';
6
+ import {importFasta} from '../package';
7
7
  import {convertDo} from '../utils/convert';
8
8
  import * as C from '../utils/constants';
9
9
  import {generateLongSequence, generateManySequences, performanceTest} from './utils/sequences-generators';
@@ -146,7 +146,7 @@ category('renderers', () => {
146
146
  expect(srcSeqCol.getTag(bioTAGS.alphabet), ALPHABET.PT);
147
147
  expect(srcSeqCol.getTag(DG.TAGS.CELL_RENDERER), 'sequence');
148
148
 
149
- const msaSeqCol = await multipleSequenceAlignmentUI(srcSeqCol);
149
+ const msaSeqCol = await multipleSequenceAlignmentUI({col: srcSeqCol});
150
150
  tv.grid.invalidate();
151
151
 
152
152
  expect(msaSeqCol.semType, DG.SEMTYPE.MACROMOLECULE);