@datagrok/bio 2.4.15 → 2.4.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/105.js +2 -0
- package/dist/105.js.map +1 -0
- package/dist/367.js +2 -0
- package/dist/367.js.map +1 -0
- package/dist/864.js +2 -0
- package/dist/864.js.map +1 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +3 -3
- package/scripts/sequence_generator.py +289 -0
- package/src/analysis/sequence-activity-cliffs.ts +2 -2
- package/src/analysis/sequence-diversity-viewer.ts +7 -4
- package/src/analysis/sequence-similarity-viewer.ts +7 -2
- package/src/analysis/sequence-space.ts +18 -0
- package/src/demo/bio01-similarity-diversity.ts +19 -4
- package/src/demo/bio01a-hierarchical-clustering-and-sequence-space.ts +3 -0
- package/src/demo/bio01b-hierarchical-clustering-and-activity-cliffs.ts +3 -0
- package/src/demo/bio05-helm-msa-sequence-space.ts +13 -9
- package/src/package.ts +21 -5
- package/src/tests/checkInputColumn-tests.ts +2 -2
- package/src/tests/msa-tests.ts +1 -1
- package/src/tests/renderers-test.ts +2 -2
- package/src/utils/cell-renderer.ts +4 -4
- package/src/utils/multiple-sequence-alignment-ui.ts +19 -18
- package/dist/153.js +0 -2
- package/dist/153.js.map +0 -1
- package/scripts/motif_generator.py +0 -119
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"name": "Leonid Stolbov",
|
|
6
6
|
"email": "lstolbov@datagrok.ai"
|
|
7
7
|
},
|
|
8
|
-
"version": "2.4.15",
|
|
8
|
+
"version": "2.4.17",
|
|
9
9
|
"description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -14,9 +14,9 @@
|
|
|
14
14
|
},
|
|
15
15
|
"dependencies": {
|
|
16
16
|
"@biowasm/aioli": "^3.1.0",
|
|
17
|
-
"@datagrok-libraries/bio": "^5.
|
|
17
|
+
"@datagrok-libraries/bio": "^5.29.3",
|
|
18
18
|
"@datagrok-libraries/chem-meta": "^1.0.1",
|
|
19
|
-
"@datagrok-libraries/ml": "^6.3.
|
|
19
|
+
"@datagrok-libraries/ml": "^6.3.22",
|
|
20
20
|
"@datagrok-libraries/tutorials": "^1.2.1",
|
|
21
21
|
"@datagrok-libraries/utils": "^2.1.3",
|
|
22
22
|
"cash-dom": "^8.0.0",
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# name: Sequence generator
|
|
3
|
+
# description: Create the model peptides/DNA sequences with peptides data
|
|
4
|
+
# language: python
|
|
5
|
+
# tags: template, demo
|
|
6
|
+
# input: int clusters = 1 [Number of superclusters]
|
|
7
|
+
# input: int num_sequences = 500 [Number of sequences in each supercluster]
|
|
8
|
+
# input: int motif_length = 12 [Average length of motif]
|
|
9
|
+
# input: int max_variants_position = 3 [Maximum number of different letters in conservative position in motif]
|
|
10
|
+
# input: int random_length = 3 [Average length of random sequence parts before and after motif]
|
|
11
|
+
# input: int dispersion = 2 [Variation of total sequence length]
|
|
12
|
+
# input: string alphabet_key = 'PT' [Sequence alphabet: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma]
|
|
13
|
+
# input: bool disable_cliffs = False [Disable generation of cliffs]
|
|
14
|
+
# input: double cliff_probability = 0.01 [Probability to make activity cliff of a sequence]
|
|
15
|
+
# input: double cliff_strength = 4.0 [Strength of cliff]
|
|
16
|
+
# output: dataframe sequences
|
|
17
|
+
|
|
18
|
+
import random
|
|
19
|
+
import argparse
|
|
20
|
+
import sys
|
|
21
|
+
|
|
22
|
+
from typing import List, Tuple, Dict, Iterator, Any
|
|
23
|
+
|
|
24
|
+
# --- Type aliases -------------------------------------------------------------

# An alphabet is a list of letters (monomer symbols).
alphabet_type = List[str]

# Letters allowed at one motif position; the single entry "?" stands for any symbol.
letter_choice_type = List[str]
# A motif template is one letter choice per motif position.
motif_template_type = List[letter_choice_type]

# Record produced per sequence inside a cluster: (n_seq, seq, activity, is_cliff).
sequence_record_type = Tuple[int, str, float, bool]
# Record stored in the final table: (cluster, sequence_id, sequence, activity, is_cliff).
sequence_record_cluster_type = Tuple[int, str, str, float, bool]

# --- Built-in alphabets -------------------------------------------------------

# Each value is a comma-separated list of letters; the first key is the command line default.
alphabets: Dict[str, str] = dict(
    PT="A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
    DNA="A,T,G,C",
    RNA="A,U,G,C",
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def mean_range(mean: int, disp: int) -> int:
    """Draw a random integer around ``mean``.

    The value is uniform on ``[mean - disp, mean + disp]`` (both ends inclusive),
    with the lower bound clamped so that it never goes below zero.
    """
    lower_bound = mean - disp
    if lower_bound < 0:
        lower_bound = 0
    upper_bound = mean + disp
    return random.randint(lower_bound, upper_bound)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def generate_motif_template(
    motif_length: int,
    alphabet: alphabet_type,
    max_variants_cluster: int,
    prob_any: float = 0.2,
) -> motif_template_type:
    """Build a random motif template.

    Args:
        motif_length: Number of positions in the motif.
        alphabet: Letters to draw conservative positions from.
        max_variants_cluster: Upper bound on the number of letters drawn for a
            conservative position (between 1 and this value are drawn).
        prob_any: Probability that an inner position becomes a wildcard.

    Returns:
        One list of allowed letters per position. A wildcard position is
        represented as ``["?"]``. The first and the last positions are never
        wildcards.
    """
    motif_template = []
    for position in range(motif_length):
        # Only inner positions may become "any symbol"; the ends stay conservative.
        is_inner_position = 0 < position < motif_length - 1
        if is_inner_position and random.random() <= prob_any:
            letters = ["?"]  # this stands for any symbol
        else:
            n_variants = random.randrange(max_variants_cluster) + 1
            drawn = [random.choice(alphabet) for _ in range(n_variants)]
            # Independent draws may repeat a letter; keep each letter once (first
            # occurrence order) so the position lists genuinely different letters
            # and the motif notation does not contain entries like "[AA]".
            letters = list(dict.fromkeys(drawn))
        motif_template.append(letters)
    return motif_template
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def generate_motif(template: motif_template_type, alphabet: alphabet_type) -> str:
    """Instantiate a concrete motif string from a motif template.

    For every template position one letter is picked at random: from the whole
    alphabet when the position is a wildcard ("?"), otherwise from the letters
    listed for that position.
    """
    motif_letters = []
    for letters in template:
        candidates = alphabet if "?" in letters else letters
        motif_letters.append(random.choice(candidates))
    return "".join(motif_letters)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def motif_notation(motif_template: motif_template_type) -> str:
    """Render a motif template as a compact string.

    A position with exactly one entry is written as that entry; any other
    position is written as its entries joined and wrapped in square brackets.
    """
    codes = []
    for letter_choice in motif_template:
        if len(letter_choice) == 1:
            codes.append(letter_choice[0])
        else:
            codes.append("[" + "".join(letter_choice) + "]")
    return "".join(codes)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def generate_random(n: int, alphabet: alphabet_type) -> str:
    """Return ``n`` letters drawn independently at random from ``alphabet``, concatenated.

    A non-positive ``n`` yields an empty string.
    """
    picked = []
    for _ in range(n):
        picked.append(random.choice(alphabet))
    return "".join(picked)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def make_cliff(motif_template: motif_template_type, alphabet: alphabet_type, motif: str) -> str:
    """Mutate one conservative position of ``motif`` to a letter the template forbids.

    A starting position is chosen at random and the template is scanned
    cyclically from there for the first position that is not a wildcard ("?")
    and for which the alphabet still has a letter outside the allowed set.

    Args:
        motif_template: Template the motif was generated from.
        alphabet: Letters the replacement is drawn from.
        motif: Concrete motif to mutate.

    Returns:
        The mutated motif, or ``motif`` unchanged when the template is empty or
        no position can be mutated (every position is a wildcard or already
        allows the whole alphabet).

    Note:
        The motif is sliced by template position, so each letter is expected to
        occupy exactly one character of ``motif``.
    """
    n_positions = len(motif_template)
    if n_positions == 0:
        return motif

    start = random.randrange(n_positions)
    for offset in range(n_positions):
        pos = (start + offset) % n_positions
        conserved = motif_template[pos]
        if "?" in conserved:
            continue
        # Keep alphabet order (instead of a set difference) so the result is
        # reproducible for a given random seed.
        outlier_letters = list(dict.fromkeys(letter for letter in alphabet if letter not in conserved))
        if outlier_letters:
            return motif[:pos] + random.choice(outlier_letters) + motif[pos + 1 :]
    return motif
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def generate_cluster(
    n_sequences: int,
    motif_length: int,
    prefix_length: int,
    suffix_length: int,
    max_variants_position: int,
    make_cliffs: bool,
    alphabet: alphabet_type,
    cliff_probability: float,
    cliff_strength: float,
) -> Iterator[sequence_record_type]:
    """Generate the sequences of one cluster that share a single motif template.

    Every sequence is ``random prefix + motif + random suffix``. Its activity is
    drawn from a normal distribution whose mean and deviation are chosen once
    per cluster. With probability ``cliff_probability`` (when ``make_cliffs`` is
    true) a sequence becomes an activity cliff: a second sequence differing in
    one conservative motif letter is emitted right after it, and the activities
    of the pair are set symmetrically below and above the cluster average.

    Args:
        n_sequences: Number of base sequences; every cliff adds one extra record.
        motif_length: Number of positions in the motif.
        prefix_length: Length of the random part before the motif.
        suffix_length: Length of the random part after the motif.
        max_variants_position: Maximum number of letters per conservative position.
        make_cliffs: Whether activity cliffs may be generated.
        alphabet: Letters to build sequences from.
        cliff_probability: Probability for a sequence to become a cliff.
        cliff_strength: Multiplier of the cluster activity deviation for cliffs.

    Yields:
        ``(n_seq, seq, activity, is_cliff)`` records, where ``n_seq`` is a
        running number that is unique within the cluster.
    """
    motif_template = generate_motif_template(motif_length, alphabet, max_variants_position)

    activity_average = random.random() * 10
    activity_dispersion = random.random()
    sys.stderr.write(f"Motif template: {motif_notation(motif_template)}\n")

    # Running record number. The loop index cannot be used because each cliff
    # emits an extra record, which previously produced duplicated ids.
    record_id = 0
    for _ in range(n_sequences):
        activity = random.gauss(activity_average, activity_dispersion)

        motif = generate_motif(motif_template, alphabet)
        prefix = generate_random(prefix_length, alphabet)
        suffix = generate_random(suffix_length, alphabet)
        seq = prefix + motif + suffix

        is_cliff = make_cliffs and (random.random() <= cliff_probability)
        if not is_cliff:
            regular_record: sequence_record_type = (record_id, seq, activity, is_cliff)
            yield regular_record
            record_id += 1
            continue

        # Making activity cliff: same prefix/suffix, one conservative letter mutated.
        cliff_motif = make_cliff(motif_template, alphabet, motif)
        cliff_seq = prefix + cliff_motif + suffix
        # Recalculating activity: the pair is placed symmetrically around the
        # cluster average. The base record is yielded only now so that it
        # actually carries the recalculated value.
        cliff_disp = activity_dispersion * cliff_strength * (0.5 + random.random())
        base_record: sequence_record_type = (record_id, seq, activity_average - cliff_disp, is_cliff)
        yield base_record
        cliff_record: sequence_record_type = (record_id + 1, cliff_seq, activity_average + cliff_disp, is_cliff)
        yield cliff_record
        record_id += 2
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def generate_sequences(
    n_clusters: int,
    n_sequences: int,
    average_motif_length: int,
    max_variants_position: int,
    average_random_length: int,
    dispersion: int,
    alphabet: alphabet_type,
    make_cliffs: bool,
    cliff_probability: float,
    cliff_strength: float,
) -> Tuple[List[str], List[sequence_record_cluster_type]]:
    """Generate all clusters and collect their sequences into one table.

    For each cluster the motif length and the lengths of the random prefix and
    suffix are drawn once (varying by ``dispersion``), then the sequences are
    produced by ``generate_cluster``.

    Args:
        n_clusters: Number of clusters to generate.
        n_sequences: Number of base sequences per cluster.
        average_motif_length: Average motif length.
        max_variants_position: Maximum number of letters per conservative position.
        average_random_length: Average length of the random part on each side of the motif.
        dispersion: Variation applied to the motif and total lengths.
        alphabet: Letters to build sequences from.
        make_cliffs: Whether activity cliffs may be generated.
        cliff_probability: Probability for a sequence to become a cliff.
        cliff_strength: Strength of the cliffs.

    Returns:
        ``(headers, rows)`` where every row is
        ``(cluster, sequence_id, sequence, activity, is_cliff)``.
    """
    headers: List[str] = ["cluster", "sequence_id", "sequence", "activity", "is_cliff"]
    sequences: List[sequence_record_cluster_type] = []

    for n_cluster in range(n_clusters):
        motif_length = mean_range(average_motif_length, dispersion)

        # Use the `dispersion` parameter, not the global command line namespace:
        # that global does not exist when the script runs inside Datagrok.
        total_length = mean_range(average_random_length * 2, dispersion) + motif_length
        prefix_length = mean_range(average_random_length, dispersion // 2)
        # The prefix may use up the whole random budget; the suffix is then empty.
        suffix_length = max(total_length - motif_length - prefix_length, 0)
        sys.stderr.write(f"Generating sequences for cluster {n_cluster}\n")
        for n_seq, seq, activity, is_cliff in generate_cluster(
            n_sequences,
            motif_length,
            prefix_length,
            suffix_length,
            max_variants_position,
            make_cliffs,
            alphabet,
            cliff_probability,
            cliff_strength,
        ):
            sequences.append((n_cluster, f"c{n_cluster}_s{n_seq}", seq, activity, is_cliff))
    return headers, sequences
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def parse_command_line_args() -> Any:
    """Parse the command line of the standalone (non-Datagrok) run.

    Returns:
        The ``argparse`` namespace with attributes ``clusters``, ``sequences``,
        ``motif_length``, ``random_length``, ``dispersion``, ``alphabet``,
        ``max_variants_position``, ``cliff_probability``, ``cliff_strength``
        and ``disable_cliffs``.
    """

    def parse_bool(value: str) -> bool:
        # `type=bool` would turn any non-empty string, including "False", into True.
        normalized = value.strip().lower()
        if normalized in ("1", "true", "t", "yes", "y", "on"):
            return True
        if normalized in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(
        prog="MotifSequencesGenerator",
        description="The program generates set of sequences containing sequence motifs "
        "for SAR functionality testing",
        epilog="Utility support: Gennadii Zakharov",
    )

    parser.add_argument("-c", "--clusters", type=int, default=1, help="Number of superclusters")
    parser.add_argument(
        "-s",
        "--sequences",
        type=int,
        default=500,
        help="Number of sequences in each supercluster",
    )
    # The short options below used to carry a stray trailing comma ("-m,", "-r,", "-d,").
    parser.add_argument("-m", "--motif-length", type=int, default=12, help="Average length of motif")

    parser.add_argument(
        "-r",
        "--random-length",
        type=int,
        default=3,
        help="Average length of random sequence parts before and after motif",
    )
    parser.add_argument(
        "-d",
        "--dispersion",
        type=int,
        default=2,
        help="Variation of total sequence length",
    )

    available_alphabets = ",".join(list(alphabets.keys()) + ["custom"])
    parser.add_argument(
        "--alphabet",
        type=str,
        default=list(alphabets.keys())[0],
        help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated by comma",
    )
    parser.add_argument(
        "--max-variants-position",
        type=int,
        default=3,
        help="Maximum number of different letters in conservative position in motif",
    )
    parser.add_argument(
        "--cliff-probability",
        type=float,
        default=0.01,
        help="Probability to make activity cliff of a sequence",
    )
    parser.add_argument(
        "--cliff-strength",
        type=float,
        default=4.0,
        help="Strength of cliff",
    )
    # Works both as a bare flag (`--disable-cliffs`) and with an explicit value
    # (`--disable-cliffs False`).
    parser.add_argument(
        "--disable-cliffs",
        type=parse_bool,
        nargs="?",
        const=True,
        default=False,
        help="Disable generation of cliffs",
    )

    command_line_args = parser.parse_args()

    return command_line_args
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# ====================================================================================
|
|
244
|
+
|
|
245
|
+
# The script input variables are taken to be already defined as globals when the
# script is run by Datagrok; the presence of `clusters` is used to detect that.
grok = "clusters" in globals()

if not grok:
    # We are not in Datagrok - need to parse command line arguments
    args = parse_command_line_args()
    clusters = args.clusters
    num_sequences = args.sequences
    motif_length = args.motif_length
    max_variants_position = args.max_variants_position
    random_length = args.random_length
    dispersion = args.dispersion
    alphabet_key = args.alphabet
    disable_cliffs = args.disable_cliffs
    cliff_probability = args.cliff_probability
    cliff_strength = args.cliff_strength

alphabet: alphabet_type
if alphabet_key in alphabets:
    alphabet = alphabets[alphabet_key].split(",")
else:
    # Custom alphabet: a comma-separated list of letters. Surrounding whitespace
    # and empty entries (e.g. from "A, C," ) are dropped so they do not end up
    # in the generated sequences.
    alphabet = [letter.strip() for letter in alphabet_key.split(",") if letter.strip()]
if not alphabet:
    raise ValueError("Sequence alphabet is empty")

# Running sequence generator
header, data = generate_sequences(
    clusters,
    num_sequences,
    motif_length,
    max_variants_position,
    random_length,
    dispersion,
    alphabet,
    not disable_cliffs,
    cliff_probability,
    cliff_strength,
)

if grok:
    # Exporting data to Datagrok as a pandas dataframe
    import pandas as pd

    sequences = pd.DataFrame.from_records(data, columns=header)
else:
    # Writing results to stdout - no need to work with big and heavy Pandas
    import csv

    csv_writer = csv.writer(sys.stdout, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(header)
    for line in data:
        csv_writer.writerow(line)
|
|
@@ -43,8 +43,8 @@ export async function getSimilaritiesMatrix(
|
|
|
43
43
|
}
|
|
44
44
|
|
|
45
45
|
export async function getChemSimilaritiesMatrix(dim: number, seqCol: DG.Column,
|
|
46
|
-
df: DG.DataFrame, colName: string, simArr: DG.Column[])
|
|
47
|
-
: Promise<DG.Column[]> {
|
|
46
|
+
df: DG.DataFrame, colName: string, simArr: (DG.Column | null)[])
|
|
47
|
+
: Promise<(DG.Column | null)[]> {
|
|
48
48
|
if (seqCol.version !== seqCol.temp[MONOMERIC_COL_TAGS.LAST_INVALIDATED_VERSION])
|
|
49
49
|
await invalidateMols(seqCol, false);
|
|
50
50
|
const fpDf = DG.DataFrame.create(seqCol.length);
|
|
@@ -12,15 +12,17 @@ import {updateDivInnerHTML} from '../utils/ui-utils';
|
|
|
12
12
|
import {Subject} from 'rxjs';
|
|
13
13
|
|
|
14
14
|
export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
15
|
+
diverseColumnLabel: string | null; // Use postfix Label to prevent activating table column selection editor
|
|
16
|
+
|
|
15
17
|
renderMolIds: number[] | null = null;
|
|
16
18
|
columnNames = [];
|
|
17
19
|
computeCompleted = new Subject<boolean>();
|
|
18
20
|
|
|
19
21
|
constructor() {
|
|
20
22
|
super('diversity');
|
|
23
|
+
this.diverseColumnLabel = this.string('diverseColumnLabel', null);
|
|
21
24
|
}
|
|
22
25
|
|
|
23
|
-
|
|
24
26
|
async render(computeData = true): Promise<void> {
|
|
25
27
|
if (!this.beforeRender())
|
|
26
28
|
return;
|
|
@@ -29,14 +31,15 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
29
31
|
const monomericMols = await getMonomericMols(this.moleculeColumn);
|
|
30
32
|
//need to create df to calculate fingerprints
|
|
31
33
|
const monomericMolsDf = DG.DataFrame.fromColumns([monomericMols]);
|
|
32
|
-
this.renderMolIds =
|
|
33
|
-
await grok.functions.call('Chem:callChemDiversitySearch', {
|
|
34
|
+
this.renderMolIds = await grok.functions.call('Chem:callChemDiversitySearch', {
|
|
34
35
|
col: monomericMols,
|
|
35
36
|
metricName: this.distanceMetric,
|
|
36
37
|
limit: this.limit,
|
|
37
38
|
fingerprint: this.fingerprint
|
|
38
39
|
});
|
|
39
|
-
const
|
|
40
|
+
const diverseColumnName: string = this.diverseColumnLabel != null ? this.diverseColumnLabel :
|
|
41
|
+
`diverse (${this.moleculeColumnName})`;
|
|
42
|
+
const resCol = DG.Column.string(diverseColumnName, this.renderMolIds!.length)
|
|
40
43
|
.init((i) => this.moleculeColumn?.get(this.renderMolIds![i]));
|
|
41
44
|
resCol.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
42
45
|
this.tags.forEach((tag) => resCol.setTag(tag, this.moleculeColumn!.getTag(tag)));
|
|
@@ -11,13 +11,15 @@ import {Subject} from 'rxjs';
|
|
|
11
11
|
import {TAGS as bioTAGS, getSplitter} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
12
12
|
|
|
13
13
|
export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
14
|
+
cutoff: number;
|
|
14
15
|
hotSearch: boolean;
|
|
16
|
+
similarColumnLabel: string | null; // Use postfix Label to prevent activating table column selection editor
|
|
17
|
+
|
|
15
18
|
sketchedMolecule: string = '';
|
|
16
19
|
curIdx: number = 0;
|
|
17
20
|
molCol: DG.Column | null = null;
|
|
18
21
|
idxs: DG.Column | null = null;
|
|
19
22
|
scores: DG.Column | null = null;
|
|
20
|
-
cutoff: number;
|
|
21
23
|
gridSelect: boolean = false;
|
|
22
24
|
targetMoleculeIdx: number = 0;
|
|
23
25
|
computeCompleted = new Subject<boolean>();
|
|
@@ -26,6 +28,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
26
28
|
super('similarity');
|
|
27
29
|
this.cutoff = this.float('cutoff', 0.01, {min: 0, max: 1});
|
|
28
30
|
this.hotSearch = this.bool('hotSearch', true);
|
|
31
|
+
this.similarColumnLabel = this.string('similarColumnLabel', null);
|
|
29
32
|
}
|
|
30
33
|
|
|
31
34
|
init(): void {
|
|
@@ -54,7 +57,9 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
54
57
|
});
|
|
55
58
|
this.idxs = df.getCol('indexes');
|
|
56
59
|
this.scores = df.getCol('score');
|
|
57
|
-
this.
|
|
60
|
+
const similarColumnName: string = this.similarColumnLabel != null ? this.similarColumnLabel :
|
|
61
|
+
`similar (${this.moleculeColumnName})`;
|
|
62
|
+
this.molCol = DG.Column.string(similarColumnName,
|
|
58
63
|
this.idxs!.length).init((i) => this.moleculeColumn?.get(this.idxs?.get(i)));
|
|
59
64
|
this.molCol.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
60
65
|
this.tags.forEach((tag) => this.molCol!.setTag(tag, this.moleculeColumn!.getTag(tag)));
|
|
@@ -6,6 +6,7 @@ import {Matrix} from '@datagrok-libraries/utils/src/type-declarations';
|
|
|
6
6
|
import BitArray from '@datagrok-libraries/utils/src/bit-array';
|
|
7
7
|
import {ISequenceSpaceParams} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
|
|
8
8
|
import {invalidateMols, MONOMERIC_COL_TAGS} from '../substructure-search/substructure-search';
|
|
9
|
+
import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
|
|
9
10
|
import * as grok from 'datagrok-api/grok';
|
|
10
11
|
|
|
11
12
|
export interface ISequenceSpaceResult {
|
|
@@ -53,6 +54,23 @@ export async function sequenceSpaceByFingerprints(spaceParams: ISequenceSpacePar
|
|
|
53
54
|
return result;
|
|
54
55
|
}
|
|
55
56
|
|
|
57
|
+
/**
 * Builds the sequence space embedding for a macromolecule column.
 *
 * For columns that `UnitsHandler` reports as FASTA, the sequences are passed to
 * `reduceDimensinalityWithNormalization` together with the distance function name
 * provided by the units handler; one embedding column is created per name in
 * `spaceParams.embedAxesNames`. Any other column is delegated to
 * {@link sequenceSpaceByFingerprints}.
 *
 * @param spaceParams Column, method name, axes names and method options.
 * @returns The distance data and the embedding coordinate columns.
 */
export async function getSequenceSpace(spaceParams: ISequenceSpaceParams): Promise<ISequenceSpaceResult> {
  const uh = new UnitsHandler(spaceParams.seqCol);
  if (!uh.isFasta())
    return await sequenceSpaceByFingerprints(spaceParams);

  const distanceFName = uh.getDistanceFunctionName();
  const sequenceSpaceResult = await reduceDimensinalityWithNormalization(
    spaceParams.seqCol.toList(),
    spaceParams.methodName,
    distanceFName,
    spaceParams.options);
  // No debug dump of the result here: it contains the whole distance and embedding data.
  const cols: DG.Column[] = spaceParams.embedAxesNames.map(
    (name: string, index: number) => DG.Column.fromFloat32Array(name, sequenceSpaceResult.embedding[index]));
  return {distance: sequenceSpaceResult.distance, coordinates: new DG.ColumnList(cols)};
}
|
|
56
74
|
|
|
57
75
|
export function getEmbeddingColsNames(df: DG.DataFrame) {
|
|
58
76
|
const axes = ['Embed_X', 'Embed_Y'];
|
|
@@ -6,8 +6,10 @@ import {_package} from '../package';
|
|
|
6
6
|
import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
7
7
|
import {delay} from '@datagrok-libraries/utils/src/test';
|
|
8
8
|
import {handleError} from './utils';
|
|
9
|
+
import {SequenceDiversityViewer} from '../analysis/sequence-diversity-viewer';
|
|
10
|
+
import {SequenceSimilarityViewer} from '../analysis/sequence-similarity-viewer';
|
|
9
11
|
|
|
10
|
-
const dataFn = 'data/sample_FASTA_DNA.csv';
|
|
12
|
+
const dataFn: string = 'data/sample_FASTA_DNA.csv';
|
|
11
13
|
|
|
12
14
|
export async function demoBio01UI() {
|
|
13
15
|
let view: DG.TableView;
|
|
@@ -17,21 +19,34 @@ export async function demoBio01UI() {
|
|
|
17
19
|
const demoScript = new DemoScript('Demo', 'Sequence similarity / diversity search');
|
|
18
20
|
await demoScript
|
|
19
21
|
.step(`Loading DNA notation 'fasta'`, async () => {
|
|
22
|
+
grok.shell.windows.showContextPanel = false;
|
|
23
|
+
grok.shell.windows.showProperties = false;
|
|
24
|
+
|
|
20
25
|
df = await _package.files.readCsv(dataFn);
|
|
21
26
|
view = grok.shell.addTableView(df);
|
|
27
|
+
|
|
28
|
+
view.grid.columns.byName('id')!.width = 0;
|
|
29
|
+
view.grid.columns.byName('sequence')!.width = 500;
|
|
30
|
+
// TODO: Fix column width
|
|
22
31
|
}, {
|
|
23
32
|
description: `Load dataset with macromolecules of 'fasta' notation, 'DNA' alphabet.`,
|
|
24
|
-
delay:
|
|
33
|
+
delay: 1200
|
|
25
34
|
})
|
|
26
35
|
.step('Sequence similarity search', async () => {
|
|
27
|
-
const simViewer = await df.plot.fromType('Sequence Similarity Search'
|
|
36
|
+
const simViewer = await df.plot.fromType('Sequence Similarity Search', {
|
|
37
|
+
moleculeColumnName: 'sequence',
|
|
38
|
+
similarColumnLabel: 'Similar to current',
|
|
39
|
+
}) as SequenceSimilarityViewer;
|
|
28
40
|
view.dockManager.dock(simViewer, DG.DOCK_TYPE.RIGHT, null, 'Similarity search', 0.35);
|
|
29
41
|
}, {
|
|
30
42
|
description: `Add 'Sequence Similarity Search' viewer.`,
|
|
31
43
|
delay: 1600
|
|
32
44
|
})
|
|
33
45
|
.step('Sequence diversity search', async () => {
|
|
34
|
-
const divViewer = await df.plot.fromType('Sequence Diversity Search'
|
|
46
|
+
const divViewer = await df.plot.fromType('Sequence Diversity Search', {
|
|
47
|
+
moleculeColumnName: 'sequence',
|
|
48
|
+
diverseColumnLabel: 'Top diverse sequences of all data'
|
|
49
|
+
}) as SequenceDiversityViewer;
|
|
35
50
|
view.dockManager.dock(divViewer, DG.DOCK_TYPE.DOWN, null, 'Diversity search', 0.27);
|
|
36
51
|
}, {
|
|
37
52
|
description: `Add 'Sequence Deversity Search' viewer.`,
|
|
@@ -36,6 +36,9 @@ export async function demoBio01aUI() {
|
|
|
36
36
|
]);
|
|
37
37
|
view = grok.shell.addTableView(df);
|
|
38
38
|
view.grid.props.rowHeight = 22;
|
|
39
|
+
|
|
40
|
+
grok.shell.windows.showContextPanel = false;
|
|
41
|
+
grok.shell.windows.showProperties = false;
|
|
39
42
|
}, {
|
|
40
43
|
description: `Load dataset with macromolecules of 'fasta' notation, 'DNA' alphabet.`,
|
|
41
44
|
delay: 1600,
|
|
@@ -29,6 +29,9 @@ export async function demoBio01bUI() {
|
|
|
29
29
|
const demoScript = new DemoScript('Demo', '');
|
|
30
30
|
await demoScript
|
|
31
31
|
.step(`Loading DNA notation \'fasta\'`, async () => {
|
|
32
|
+
grok.shell.windows.showContextPanel = false;
|
|
33
|
+
grok.shell.windows.showProperties = false;
|
|
34
|
+
|
|
32
35
|
[df, treeHelper, dendrogramSvc] = await Promise.all([
|
|
33
36
|
_package.files.readCsv(dataFn),
|
|
34
37
|
getTreeHelper(),
|
|
@@ -28,6 +28,9 @@ export async function demoBio05UI(): Promise<void> {
|
|
|
28
28
|
await demoScript
|
|
29
29
|
.step(`Loading peptides notation 'HELM'`, async () => {
|
|
30
30
|
view = grok.shell.addTableView(df = await _package.files.readCsv(helmFn));
|
|
31
|
+
|
|
32
|
+
grok.shell.windows.showContextPanel = false;
|
|
33
|
+
grok.shell.windows.showProperties = false;
|
|
31
34
|
}, {
|
|
32
35
|
description: 'Load dataset with macromolecules of \'Helm\' notation.',
|
|
33
36
|
delay: 1600,
|
|
@@ -44,15 +47,6 @@ export async function demoBio05UI(): Promise<void> {
|
|
|
44
47
|
description: 'Multiple sequence alignment (MSA) performed with PepSeA tool operating on non-natural aminoacids as well.',
|
|
45
48
|
delay: 1600,
|
|
46
49
|
})
|
|
47
|
-
.step('Composition analysis on MSA results', async () => {
|
|
48
|
-
wlViewer = await df.plot.fromType('WebLogo', {
|
|
49
|
-
sequenceColumnName: msaHelmColName
|
|
50
|
-
}) as DG.Viewer & IWebLogoViewer;
|
|
51
|
-
view.dockManager.dock(wlViewer, DG.DOCK_TYPE.DOWN, null, 'Composition analysis', 0.2);
|
|
52
|
-
}, {
|
|
53
|
-
description: 'Composition analysis allows to reveal functional features of sequences like motifs, or variable loops.',
|
|
54
|
-
delay: 1600,
|
|
55
|
-
})
|
|
56
50
|
.step('Building sequence space', async () => {
|
|
57
51
|
const method: string = 'UMAP';
|
|
58
52
|
ssViewer = (await sequenceSpaceTopMenu(df, msaHelmCol,
|
|
@@ -62,6 +56,16 @@ export async function demoBio05UI(): Promise<void> {
|
|
|
62
56
|
description: 'Reduce sequence space dimensionality to display on 2D representation.',
|
|
63
57
|
delay: 1600
|
|
64
58
|
})
|
|
59
|
+
.step('Composition analysis on MSA results', async () => {
|
|
60
|
+
wlViewer = await df.plot.fromType('WebLogo', {
|
|
61
|
+
sequenceColumnName: msaHelmColName,
|
|
62
|
+
maxHeight: 50,
|
|
63
|
+
}) as DG.Viewer & IWebLogoViewer;
|
|
64
|
+
view.dockManager.dock(wlViewer, DG.DOCK_TYPE.DOWN, null, 'Composition analysis', 0.2);
|
|
65
|
+
}, {
|
|
66
|
+
description: 'Composition analysis allows to reveal functional features of sequences like motifs, or variable loops.',
|
|
67
|
+
delay: 1600,
|
|
68
|
+
})
|
|
65
69
|
.start();
|
|
66
70
|
} catch (err: any) {
|
|
67
71
|
handleError(err);
|
package/src/package.ts
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
} from './utils/cell-renderer';
|
|
10
10
|
import {VdRegionsViewer} from './viewers/vd-regions-viewer';
|
|
11
11
|
import {SequenceAlignment} from './seq_align';
|
|
12
|
-
import {getEmbeddingColsNames, sequenceSpaceByFingerprints} from './analysis/sequence-space';
|
|
12
|
+
import {getEmbeddingColsNames, sequenceSpaceByFingerprints, getSequenceSpace} from './analysis/sequence-space';
|
|
13
13
|
import {getActivityCliffs} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
|
|
14
14
|
import {
|
|
15
15
|
createLinesGrid,
|
|
@@ -49,6 +49,7 @@ import {demoBio01bUI} from './demo/bio01b-hierarchical-clustering-and-activity-c
|
|
|
49
49
|
import {demoBio05UI} from './demo/bio05-helm-msa-sequence-space';
|
|
50
50
|
import {checkInputColumnUI} from './utils/check-input-column';
|
|
51
51
|
import {multipleSequenceAlignmentUI} from './utils/multiple-sequence-alignment-ui';
|
|
52
|
+
import { runKalign } from './utils/multiple-sequence-alignment';
|
|
52
53
|
|
|
53
54
|
export const _package = new DG.Package();
|
|
54
55
|
|
|
@@ -286,19 +287,23 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column,
|
|
|
286
287
|
'separator': macroMolecule.getTag(bioTAGS.separator),
|
|
287
288
|
'alphabet': macroMolecule.getTag(bioTAGS.alphabet),
|
|
288
289
|
};
|
|
290
|
+
const uh = new UnitsHandler(macroMolecule);
|
|
291
|
+
let columnDistanceMetric = 'Tanimoto';
|
|
292
|
+
if (uh.isFasta())
|
|
293
|
+
columnDistanceMetric = uh.getDistanceFunctionName();
|
|
289
294
|
const sp = await getActivityCliffs(
|
|
290
295
|
df,
|
|
291
296
|
macroMolecule,
|
|
292
297
|
null,
|
|
293
298
|
axesNames,
|
|
294
|
-
|
|
299
|
+
columnDistanceMetric,
|
|
295
300
|
activities,
|
|
296
301
|
similarity,
|
|
297
302
|
'Tanimoto',
|
|
298
303
|
methodName,
|
|
299
304
|
DG.SEMTYPE.MACROMOLECULE,
|
|
300
305
|
tags,
|
|
301
|
-
|
|
306
|
+
getSequenceSpace,
|
|
302
307
|
getChemSimilaritiesMatrix,
|
|
303
308
|
createTooltipElement,
|
|
304
309
|
createPropPanelElement,
|
|
@@ -349,7 +354,7 @@ export async function sequenceSpaceTopMenu(table: DG.DataFrame, macroMolecule: D
|
|
|
349
354
|
embedAxesNames: embedColsNames,
|
|
350
355
|
options: options
|
|
351
356
|
};
|
|
352
|
-
const sequenceSpaceRes = await
|
|
357
|
+
const sequenceSpaceRes = await getSequenceSpace(chemSpaceParams);
|
|
353
358
|
const embeddings = sequenceSpaceRes.coordinates;
|
|
354
359
|
for (const col of embeddings) {
|
|
355
360
|
const listValues = col.toList();
|
|
@@ -411,10 +416,21 @@ export async function toAtomicLevel(df: DG.DataFrame, macroMolecule: DG.Column):
|
|
|
411
416
|
//top-menu: Bio | Alignment | MSA...
|
|
412
417
|
//name: MSA...
|
|
413
418
|
//tags: bio, panel
|
|
414
|
-
export function
|
|
419
|
+
export function multipleSequenceAlignmentDialog(): void {
  // Invoked without arguments; whatever multipleSequenceAlignmentUI() returns is deliberately discarded.
  void multipleSequenceAlignmentUI();
}
|
|
417
422
|
|
|
423
|
+
//name: Multiple Sequence Alignment
|
|
424
|
+
//description: Multiple sequence alignment
|
|
425
|
+
//tags: bio
|
|
426
|
+
//input: column sequenceCol {semType: Macromolecule}
|
|
427
|
+
//input: column clustersCol
|
|
428
|
+
//output: column result
|
|
429
|
+
export async function alignSequences(sequenceCol: DG.Column<string> | null = null,
  clustersCol: DG.Column | null = null): Promise<DG.Column<string>> {
  // Both columns are forwarded as-is (including null) in the options object of multipleSequenceAlignmentUI.
  const alignedCol = await multipleSequenceAlignmentUI({col: sequenceCol, clustersCol: clustersCol});
  return alignedCol;
}
|
|
433
|
+
|
|
418
434
|
//top-menu: Bio | Structure | Composition Analysis
|
|
419
435
|
//name: Composition Analysis
|
|
420
436
|
//meta.icon: files/icons/composition-analysis.svg
|
|
@@ -15,7 +15,7 @@ seq3,
|
|
|
15
15
|
seq4`;
|
|
16
16
|
|
|
17
17
|
test('testMsaPos', async () => {
|
|
18
|
-
const func: DG.Func = DG.Func.find({package: 'Bio', name: '
|
|
18
|
+
const func: DG.Func = DG.Func.find({package: 'Bio', name: 'multipleSequenceAlignmentDialog'})[0];
|
|
19
19
|
const funcInputColumnProperty: DG.Property = func.inputs.find((i) => i.name == 'sequence')!;
|
|
20
20
|
|
|
21
21
|
const k = 11;
|
|
@@ -67,7 +67,7 @@ seq4`;
|
|
|
67
67
|
});
|
|
68
68
|
|
|
69
69
|
test('testGetActionFunctionMeta', async () => {
|
|
70
|
-
const func: DG.Func = DG.Func.find({package: 'Bio', name: '
|
|
70
|
+
const func: DG.Func = DG.Func.find({package: 'Bio', name: 'multipleSequenceAlignmentDialog'})[0];
|
|
71
71
|
const sequenceInput: DG.Property = func.inputs.find((i) => i.name == 'sequence')!;
|
|
72
72
|
const k = 11;
|
|
73
73
|
});
|
package/src/tests/msa-tests.ts
CHANGED
|
@@ -138,7 +138,7 @@ async function _testMSAOnColumn(
|
|
|
138
138
|
if (alphabet)
|
|
139
139
|
expect(srcSeqCol.getTag(bioTAGS.alphabet), alphabet);
|
|
140
140
|
|
|
141
|
-
const msaSeqCol = await multipleSequenceAlignmentUI(srcSeqCol, pepseaMethod);
|
|
141
|
+
const msaSeqCol = await multipleSequenceAlignmentUI({col: srcSeqCol, pepsea: {method: pepseaMethod}});
|
|
142
142
|
expect(msaSeqCol.semType, DG.SEMTYPE.MACROMOLECULE);
|
|
143
143
|
expect(msaSeqCol.getTag(DG.TAGS.UNITS), tgtNotation);
|
|
144
144
|
expect(msaSeqCol.getTag(bioTAGS.aligned), ALIGNMENT.SEQ_MSA);
|
|
@@ -3,7 +3,7 @@ import * as DG from 'datagrok-api/dg';
|
|
|
3
3
|
|
|
4
4
|
import {after, before, category, delay, expect, test} from '@datagrok-libraries/utils/src/test';
|
|
5
5
|
|
|
6
|
-
import {importFasta
|
|
6
|
+
import {importFasta} from '../package';
|
|
7
7
|
import {convertDo} from '../utils/convert';
|
|
8
8
|
import * as C from '../utils/constants';
|
|
9
9
|
import {generateLongSequence, generateManySequences, performanceTest} from './utils/sequences-generators';
|
|
@@ -146,7 +146,7 @@ category('renderers', () => {
|
|
|
146
146
|
expect(srcSeqCol.getTag(bioTAGS.alphabet), ALPHABET.PT);
|
|
147
147
|
expect(srcSeqCol.getTag(DG.TAGS.CELL_RENDERER), 'sequence');
|
|
148
148
|
|
|
149
|
-
const msaSeqCol = await multipleSequenceAlignmentUI(srcSeqCol);
|
|
149
|
+
const msaSeqCol = await multipleSequenceAlignmentUI({col: srcSeqCol});
|
|
150
150
|
tv.grid.invalidate();
|
|
151
151
|
|
|
152
152
|
expect(msaSeqCol.semType, DG.SEMTYPE.MACROMOLECULE);
|