@datagrok/bio 2.4.31 → 2.4.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +6 -8
- package/README.md +22 -7
- package/css/msa.css +3 -0
- package/detectors.js +21 -12
- package/dist/1.js +2 -0
- package/dist/1.js.map +1 -0
- package/dist/18.js +2 -0
- package/dist/18.js.map +1 -0
- package/dist/190.js +2 -0
- package/dist/190.js.map +1 -0
- package/dist/452.js +2 -0
- package/dist/452.js.map +1 -0
- package/dist/729.js +2 -0
- package/dist/729.js.map +1 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/files/libraries/broken-lib.sdf +136 -0
- package/files/libraries/group1/mock-lib-3.json +74 -0
- package/files/libraries/mock-lib-2.json +48 -0
- package/files/tests/100_3_clustests.csv +100 -0
- package/files/tests/100_3_clustests_empty_vals.csv +100 -0
- package/files/tests/peptides_motif-with-random_10000.csv +9998 -0
- package/package.json +4 -4
- package/scripts/sequence_generator.py +164 -48
- package/src/analysis/sequence-activity-cliffs.ts +7 -9
- package/src/analysis/sequence-diversity-viewer.ts +8 -3
- package/src/analysis/sequence-search-base-viewer.ts +4 -3
- package/src/analysis/sequence-similarity-viewer.ts +13 -7
- package/src/analysis/sequence-space.ts +15 -12
- package/src/analysis/workers/mm-distance-array-service.ts +48 -0
- package/src/analysis/workers/mm-distance-array-worker.ts +29 -0
- package/src/analysis/workers/mm-distance-worker-creator.ts +6 -9
- package/src/apps/web-logo-app.ts +34 -0
- package/src/calculations/monomerLevelMols.ts +10 -12
- package/src/demo/bio01-similarity-diversity.ts +4 -5
- package/src/demo/bio01a-hierarchical-clustering-and-sequence-space.ts +6 -7
- package/src/demo/bio01b-hierarchical-clustering-and-activity-cliffs.ts +7 -8
- package/src/demo/bio03-atomic-level.ts +1 -4
- package/src/demo/bio05-helm-msa-sequence-space.ts +6 -4
- package/src/demo/utils.ts +3 -4
- package/src/package-test.ts +1 -2
- package/src/package.ts +135 -82
- package/src/seq_align.ts +482 -483
- package/src/substructure-search/substructure-search.ts +3 -3
- package/src/tests/Palettes-test.ts +1 -1
- package/src/tests/WebLogo-positions-test.ts +12 -35
- package/src/tests/_first-tests.ts +1 -1
- package/src/tests/activity-cliffs-tests.ts +10 -7
- package/src/tests/activity-cliffs-utils.ts +6 -5
- package/src/tests/bio-tests.ts +20 -25
- package/src/tests/checkInputColumn-tests.ts +5 -11
- package/src/tests/converters-test.ts +19 -37
- package/src/tests/detectors-benchmark-tests.ts +35 -37
- package/src/tests/detectors-tests.ts +29 -34
- package/src/tests/detectors-weak-and-likely-tests.ts +11 -21
- package/src/tests/fasta-export-tests.ts +3 -3
- package/src/tests/fasta-handler-test.ts +2 -3
- package/src/tests/lib-tests.ts +2 -4
- package/src/tests/mm-distance-tests.ts +25 -17
- package/src/tests/monomer-libraries-tests.ts +1 -1
- package/src/tests/msa-tests.ts +12 -9
- package/src/tests/pepsea-tests.ts +6 -3
- package/src/tests/renderers-test.ts +13 -11
- package/src/tests/sequence-space-test.ts +10 -8
- package/src/tests/sequence-space-utils.ts +6 -4
- package/src/tests/similarity-diversity-tests.ts +47 -61
- package/src/tests/splitters-test.ts +14 -20
- package/src/tests/to-atomic-level-tests.ts +9 -17
- package/src/tests/units-handler-splitted-tests.ts +106 -0
- package/src/tests/units-handler-tests.ts +22 -26
- package/src/tests/utils/sequences-generators.ts +6 -2
- package/src/tests/utils.ts +10 -4
- package/src/tests/viewers.ts +1 -1
- package/src/utils/atomic-works.ts +49 -57
- package/src/utils/cell-renderer.ts +25 -8
- package/src/utils/check-input-column.ts +19 -4
- package/src/utils/constants.ts +3 -3
- package/src/utils/convert.ts +56 -23
- package/src/utils/monomer-lib.ts +83 -64
- package/src/utils/multiple-sequence-alignment-ui.ts +35 -21
- package/src/utils/multiple-sequence-alignment.ts +2 -2
- package/src/utils/pepsea.ts +17 -7
- package/src/utils/save-as-fasta.ts +11 -4
- package/src/utils/ui-utils.ts +1 -1
- package/src/viewers/vd-regions-viewer.ts +21 -22
- package/src/viewers/web-logo-viewer.ts +189 -154
- package/src/widgets/bio-substructure-filter.ts +9 -6
- package/src/widgets/representations.ts +11 -12
- package/tsconfig.json +1 -1
- package/dist/258.js +0 -2
- package/dist/258.js.map +0 -1
- package/dist/457.js +0 -2
- package/dist/457.js.map +0 -1
- package/dist/562.js +0 -2
- package/dist/562.js.map +0 -1
- package/dist/925.js +0 -2
- package/dist/925.js.map +0 -1
- package/src/analysis/workers/mm-distance-worker.ts +0 -16
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"name": "Leonid Stolbov",
|
|
6
6
|
"email": "lstolbov@datagrok.ai"
|
|
7
7
|
},
|
|
8
|
-
"version": "2.4.
|
|
8
|
+
"version": "2.4.40",
|
|
9
9
|
"description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -14,11 +14,11 @@
|
|
|
14
14
|
},
|
|
15
15
|
"dependencies": {
|
|
16
16
|
"@biowasm/aioli": "^3.1.0",
|
|
17
|
-
"@datagrok-libraries/bio": "^5.
|
|
17
|
+
"@datagrok-libraries/bio": "^5.32.1",
|
|
18
18
|
"@datagrok-libraries/chem-meta": "^1.0.1",
|
|
19
|
-
"@datagrok-libraries/ml": "^6.3.
|
|
19
|
+
"@datagrok-libraries/ml": "^6.3.37",
|
|
20
20
|
"@datagrok-libraries/tutorials": "^1.3.2",
|
|
21
|
-
"@datagrok-libraries/utils": "^4.0.
|
|
21
|
+
"@datagrok-libraries/utils": "^4.0.11",
|
|
22
22
|
"cash-dom": "^8.0.0",
|
|
23
23
|
"css-loader": "^6.7.3",
|
|
24
24
|
"datagrok-api": "^1.13.3",
|
|
@@ -13,21 +13,34 @@
|
|
|
13
13
|
# input: bool disable_cliffs = False [Disable generation of cliffs]
|
|
14
14
|
# input: double cliff_probability = 0.01 [Probability to make activity cliff of a sequence]
|
|
15
15
|
# input: double cliff_strength = 4.0 [Strength of cliff]
|
|
16
|
+
# input: double fasta_separator = '' [Separator for a FASTA notation]
|
|
16
17
|
# output: dataframe sequences
|
|
17
18
|
|
|
18
19
|
import random
|
|
19
20
|
import argparse
|
|
20
21
|
import sys
|
|
22
|
+
from enum import Enum
|
|
21
23
|
|
|
22
24
|
from typing import List, Tuple, Dict, Iterator, Any
|
|
23
25
|
|
|
24
|
-
alphabet_type = List[str]
|
|
25
26
|
|
|
26
|
-
|
|
27
|
-
motif_template_type = List[letter_choice_type]
|
|
27
|
+
# --- Type definitions ---
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
29
|
+
Letter = str
|
|
30
|
+
Alphabet = List[str]
|
|
31
|
+
|
|
32
|
+
LetterChoice = List[Letter]
|
|
33
|
+
MotifTemplate = List[LetterChoice]
|
|
34
|
+
|
|
35
|
+
Sequence = List[Letter] # The sequence in a form of list
|
|
36
|
+
SequenceSquashed = str # Sequence, joined together in string form
|
|
37
|
+
|
|
38
|
+
SequenceRecord = Tuple[int, Sequence, float, bool]
|
|
39
|
+
ClusterSequenceRecord = Tuple[int, str, Sequence, float, bool]
|
|
40
|
+
|
|
41
|
+
# --- constants ---
|
|
42
|
+
|
|
43
|
+
HelmConnectionMode = Enum("HelmConnectionMode", ["linear", "cyclic", "mixed"])
|
|
31
44
|
|
|
32
45
|
alphabets: Dict[str, str] = {
|
|
33
46
|
"PT": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
|
|
@@ -42,10 +55,10 @@ def mean_range(mean: int, disp: int) -> int:
|
|
|
42
55
|
|
|
43
56
|
def generate_motif_template(
|
|
44
57
|
motif_length: int,
|
|
45
|
-
alphabet:
|
|
58
|
+
alphabet: Alphabet,
|
|
46
59
|
max_variants_cluster: int,
|
|
47
60
|
prob_any: float = 0.2,
|
|
48
|
-
) ->
|
|
61
|
+
) -> MotifTemplate:
|
|
49
62
|
motif_template = []
|
|
50
63
|
for position in range(motif_length):
|
|
51
64
|
# Selecting letters for position i
|
|
@@ -53,20 +66,20 @@ def generate_motif_template(
|
|
|
53
66
|
letters = ["?"] # this stands for any symbol
|
|
54
67
|
else:
|
|
55
68
|
n_variants = random.randrange(max_variants_cluster) + 1
|
|
56
|
-
letters =
|
|
69
|
+
letters = list(set((random.choice(alphabet) for i in range(n_variants))))
|
|
57
70
|
motif_template.append(letters)
|
|
58
71
|
return motif_template
|
|
59
72
|
|
|
60
73
|
|
|
61
|
-
def generate_motif(template:
|
|
74
|
+
def generate_motif(template: MotifTemplate, alphabet: Alphabet) -> Sequence:
|
|
62
75
|
template_with_any = [
|
|
63
76
|
(letters if not "?" in letters else alphabet) for letters in template
|
|
64
77
|
]
|
|
65
|
-
return
|
|
78
|
+
return [random.choice(letters) for letters in template_with_any]
|
|
66
79
|
|
|
67
80
|
|
|
68
|
-
def motif_notation(motif_template:
|
|
69
|
-
def motif_notation_code(letter_choice:
|
|
81
|
+
def motif_notation(motif_template: MotifTemplate) -> str:
|
|
82
|
+
def motif_notation_code(letter_choice: LetterChoice) -> str:
|
|
70
83
|
if len(letter_choice) == 1:
|
|
71
84
|
return letter_choice[0]
|
|
72
85
|
else:
|
|
@@ -77,21 +90,51 @@ def motif_notation(motif_template: motif_template_type) -> str:
|
|
|
77
90
|
)
|
|
78
91
|
|
|
79
92
|
|
|
80
|
-
def generate_random(n: int, alphabet:
|
|
81
|
-
return
|
|
93
|
+
def generate_random(n: int, alphabet: Alphabet) -> Sequence:
|
|
94
|
+
return [random.choice(alphabet) for i in range(n)]
|
|
82
95
|
|
|
83
96
|
|
|
84
97
|
def make_cliff(
|
|
85
|
-
motif_template:
|
|
86
|
-
) ->
|
|
98
|
+
motif_template: MotifTemplate, alphabet: Alphabet, motif: Sequence
|
|
99
|
+
) -> Sequence:
|
|
87
100
|
# Mutate conservative letter in motif
|
|
88
|
-
|
|
101
|
+
motif_len = len(motif_template)
|
|
102
|
+
pos = random.randrange(motif_len)
|
|
89
103
|
while "?" in motif_template[pos]:
|
|
90
|
-
pos = (
|
|
91
|
-
|
|
92
|
-
) # always will find letters since ends of motif can't be any symbol
|
|
104
|
+
pos = (
|
|
105
|
+
pos + 1
|
|
106
|
+
) % motif_len # always will find letters since ends of motif can't be any symbol
|
|
93
107
|
outlier_letters = list(set(alphabet) - set(motif_template[pos]))
|
|
94
|
-
|
|
108
|
+
new_letter = random.choice(outlier_letters)
|
|
109
|
+
return (
|
|
110
|
+
motif[:pos]
|
|
111
|
+
+ [
|
|
112
|
+
new_letter,
|
|
113
|
+
]
|
|
114
|
+
+ motif[pos + 1 :]
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def sequence_to_fasta(sequence: Sequence, separator: str) -> SequenceSquashed:
|
|
119
|
+
return separator.join(sequence)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def sequence_to_helm(
|
|
123
|
+
sequence: Sequence, helm_connection_mode: str = HelmConnectionMode.linear.name
|
|
124
|
+
) -> SequenceSquashed:
|
|
125
|
+
def is_cyclic(helm_connection_mode: str) -> bool:
|
|
126
|
+
return helm_connection_mode == HelmConnectionMode.cyclic.name or (
|
|
127
|
+
helm_connection_mode == HelmConnectionMode.mixed.name
|
|
128
|
+
and random.random() < 0.5
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
sequence_escaped: Sequence = [
|
|
132
|
+
f"[{letter}]" if len(letter) > 1 else letter for letter in sequence
|
|
133
|
+
]
|
|
134
|
+
connection_format = ""
|
|
135
|
+
if is_cyclic(helm_connection_mode):
|
|
136
|
+
connection_format = f"PEPTIDE1,PEPTIDE1,{len(sequence_escaped)}:R2-1:R1"
|
|
137
|
+
return f"PEPTIDE1{{{sequence_to_fasta(sequence_escaped,'.')}}}${connection_format}$$$V2.0"
|
|
95
138
|
|
|
96
139
|
|
|
97
140
|
def generate_cluster(
|
|
@@ -99,16 +142,17 @@ def generate_cluster(
|
|
|
99
142
|
motif_length: int,
|
|
100
143
|
prefix_length: int,
|
|
101
144
|
suffix_length: int,
|
|
102
|
-
|
|
145
|
+
max_variants_per_position: int,
|
|
103
146
|
make_cliffs: bool,
|
|
104
|
-
alphabet:
|
|
147
|
+
alphabet: Alphabet,
|
|
105
148
|
cliff_probability: float,
|
|
106
149
|
cliff_strength: float,
|
|
107
|
-
) -> Iterator[
|
|
150
|
+
) -> Iterator[SequenceRecord]:
|
|
151
|
+
# Making a motif template
|
|
108
152
|
motif_template = generate_motif_template(
|
|
109
|
-
motif_length, alphabet,
|
|
153
|
+
motif_length, alphabet, max_variants_per_position
|
|
110
154
|
)
|
|
111
|
-
|
|
155
|
+
# Setting average and dispersion for activity
|
|
112
156
|
activity_average = random.random() * 10
|
|
113
157
|
activity_dispersion = random.random()
|
|
114
158
|
sys.stderr.write(f"Motif template: {motif_notation(motif_template)}\n")
|
|
@@ -120,11 +164,10 @@ def generate_cluster(
|
|
|
120
164
|
prefix = generate_random(prefix_length, alphabet)
|
|
121
165
|
suffix = generate_random(suffix_length, alphabet)
|
|
122
166
|
seq = prefix + motif + suffix
|
|
123
|
-
|
|
124
|
-
is_cliff = make_cliffs and (random.random() <= cliff_probability)
|
|
125
|
-
sequence_record: sequence_record_type = (n_seq, seq, activity, is_cliff)
|
|
167
|
+
sequence_record: SequenceRecord = (n_seq, seq, activity, False)
|
|
126
168
|
yield sequence_record
|
|
127
169
|
|
|
170
|
+
is_cliff = make_cliffs and (random.random() <= cliff_probability)
|
|
128
171
|
if is_cliff:
|
|
129
172
|
# Making activity cliff
|
|
130
173
|
cliff_motif = make_cliff(motif_template, alphabet, motif)
|
|
@@ -146,16 +189,16 @@ def generate_sequences(
|
|
|
146
189
|
n_clusters: int,
|
|
147
190
|
n_sequences: int,
|
|
148
191
|
average_motif_length: int,
|
|
149
|
-
|
|
192
|
+
max_variants_per_position: int,
|
|
150
193
|
average_random_length: int,
|
|
151
194
|
dispersion: int,
|
|
152
|
-
alphabet:
|
|
195
|
+
alphabet: Alphabet,
|
|
153
196
|
make_cliffs: bool,
|
|
154
197
|
cliff_probability: float,
|
|
155
198
|
cliff_strength: float,
|
|
156
|
-
) -> Tuple[List[str], List[
|
|
199
|
+
) -> Tuple[List[str], List[ClusterSequenceRecord]]:
|
|
157
200
|
headers: List[str] = ["cluster", "sequence_id", "sequence", "activity", "is_cliff"]
|
|
158
|
-
sequences: List[
|
|
201
|
+
sequences: List[ClusterSequenceRecord] = []
|
|
159
202
|
|
|
160
203
|
for n_cluster in range(n_clusters):
|
|
161
204
|
motif_length = mean_range(average_motif_length, dispersion)
|
|
@@ -170,28 +213,71 @@ def generate_sequences(
|
|
|
170
213
|
motif_length,
|
|
171
214
|
prefix_length,
|
|
172
215
|
suffix_length,
|
|
173
|
-
|
|
216
|
+
max_variants_per_position,
|
|
174
217
|
make_cliffs,
|
|
175
218
|
alphabet,
|
|
176
219
|
cliff_probability,
|
|
177
220
|
cliff_strength,
|
|
178
221
|
):
|
|
179
222
|
sequences.append(
|
|
180
|
-
(n_cluster, f"c{n_cluster}_s{n_seq}", seq, activity, is_cliff)
|
|
223
|
+
(n_cluster, f"c{n_cluster}_s{n_seq:03d}", seq, activity, is_cliff)
|
|
181
224
|
)
|
|
182
225
|
return headers, sequences
|
|
183
226
|
|
|
184
227
|
|
|
228
|
+
def convert_to_fasta(
|
|
229
|
+
cluster_sequence_records: List[ClusterSequenceRecord], separator: str
|
|
230
|
+
) -> List[Tuple[int, str, str, float, bool]]:
|
|
231
|
+
return [
|
|
232
|
+
(n_cluster, name_cluster, sequence_to_fasta(seq, separator), activity, is_cliff)
|
|
233
|
+
for n_cluster, name_cluster, seq, activity, is_cliff in cluster_sequence_records
|
|
234
|
+
]
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def convert_to_helm(
|
|
238
|
+
cluster_sequence_records: List[ClusterSequenceRecord], helm_connection_mode: str
|
|
239
|
+
) -> List[Tuple[int, str, str, float, bool]]:
|
|
240
|
+
return [
|
|
241
|
+
(
|
|
242
|
+
n_cluster,
|
|
243
|
+
name_cluster,
|
|
244
|
+
sequence_to_helm(seq, helm_connection_mode),
|
|
245
|
+
activity,
|
|
246
|
+
is_cliff,
|
|
247
|
+
)
|
|
248
|
+
for n_cluster, name_cluster, seq, activity, is_cliff in cluster_sequence_records
|
|
249
|
+
]
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def is_monomer_suitable(monomer: Any) -> bool:
|
|
253
|
+
return (
|
|
254
|
+
monomer["polymerType"] == "PEPTIDE"
|
|
255
|
+
and monomer["monomerType"] == "Backbone"
|
|
256
|
+
and len(monomer["rgroups"]) == 2
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def alphabet_from_helm(helm_library_file: str) -> Alphabet:
|
|
261
|
+
import json
|
|
262
|
+
|
|
263
|
+
alphabet: Alphabet = []
|
|
264
|
+
with open(helm_library_file) as helm_library:
|
|
265
|
+
for monomer in json.load(helm_library):
|
|
266
|
+
if is_monomer_suitable(monomer):
|
|
267
|
+
alphabet.append(monomer["symbol"])
|
|
268
|
+
return alphabet
|
|
269
|
+
|
|
270
|
+
|
|
185
271
|
def parse_command_line_args() -> Any:
|
|
186
272
|
parser = argparse.ArgumentParser(
|
|
187
273
|
prog="MotifSequencesGenerator",
|
|
188
274
|
description="The program generates set of sequences containing sequence motifs "
|
|
189
|
-
"for SAR
|
|
190
|
-
epilog="Utility support: Gennadii Zakharov",
|
|
275
|
+
"for SAR functionality testing",
|
|
276
|
+
epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
|
|
191
277
|
)
|
|
192
278
|
|
|
193
279
|
parser.add_argument(
|
|
194
|
-
"-c", "--clusters", type=int, default=5, help="Number of
|
|
280
|
+
"-c", "--clusters", type=int, default=5, help="Number of clusters"
|
|
195
281
|
)
|
|
196
282
|
parser.add_argument(
|
|
197
283
|
"-s",
|
|
@@ -219,6 +305,21 @@ def parse_command_line_args() -> Any:
|
|
|
219
305
|
help="Variation of total sequence length",
|
|
220
306
|
)
|
|
221
307
|
|
|
308
|
+
parser.add_argument(
|
|
309
|
+
"-h,",
|
|
310
|
+
"--helm-library-file",
|
|
311
|
+
type=str,
|
|
312
|
+
help="JSON file containing the HELM monomer library in the same format as used for Datagrok. "
|
|
313
|
+
+ "The alphabet property is ignored when helm library is specified.",
|
|
314
|
+
)
|
|
315
|
+
|
|
316
|
+
parser.add_argument(
|
|
317
|
+
"--helm-connection-mode",
|
|
318
|
+
type=str,
|
|
319
|
+
default=HelmConnectionMode.linear.value,
|
|
320
|
+
help=f"HELM peptide generation mode: {'/'.join([mode.name for mode in HelmConnectionMode])}",
|
|
321
|
+
)
|
|
322
|
+
|
|
222
323
|
available_alphabets = ",".join(list(alphabets.keys()) + ["custom"])
|
|
223
324
|
parser.add_argument(
|
|
224
325
|
"--alphabet",
|
|
@@ -251,7 +352,12 @@ def parse_command_line_args() -> Any:
|
|
|
251
352
|
default=False,
|
|
252
353
|
help="Disable generation of cliffs",
|
|
253
354
|
)
|
|
254
|
-
|
|
355
|
+
parser.add_argument(
|
|
356
|
+
"--fasta-separator",
|
|
357
|
+
type=str,
|
|
358
|
+
default="",
|
|
359
|
+
help="Separator symbol for FASTA sequence",
|
|
360
|
+
)
|
|
255
361
|
command_line_args = parser.parse_args()
|
|
256
362
|
|
|
257
363
|
return command_line_args
|
|
@@ -274,12 +380,18 @@ if not grok:
|
|
|
274
380
|
disable_cliffs = args.disable_cliffs
|
|
275
381
|
cliff_probability = args.cliff_probability
|
|
276
382
|
cliff_strength = args.cliff_strength
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
383
|
+
fasta_separator = args.fasta_separator
|
|
384
|
+
helm_library_file = args.helm_library_file
|
|
385
|
+
helm_connection_mode = args.helm_connection_mode
|
|
386
|
+
|
|
387
|
+
if helm_library_file is None:
|
|
388
|
+
alphabet: Alphabet = (
|
|
389
|
+
alphabets[alphabet_key].split(",")
|
|
390
|
+
if alphabet_key in alphabets
|
|
391
|
+
else alphabet_key.split(",")
|
|
392
|
+
)
|
|
393
|
+
else:
|
|
394
|
+
alphabet = alphabet_from_helm(helm_library_file)
|
|
283
395
|
|
|
284
396
|
# Running sequence generator
|
|
285
397
|
header, data = generate_sequences(
|
|
@@ -294,17 +406,21 @@ header, data = generate_sequences(
|
|
|
294
406
|
cliff_probability,
|
|
295
407
|
cliff_strength,
|
|
296
408
|
)
|
|
409
|
+
if helm_library_file is None:
|
|
410
|
+
data_formatted = convert_to_fasta(data, fasta_separator)
|
|
411
|
+
else:
|
|
412
|
+
data_formatted = convert_to_helm(data, helm_connection_mode)
|
|
297
413
|
|
|
298
414
|
if grok:
|
|
299
|
-
# Exporting data to Datagrok as a
|
|
415
|
+
# Exporting data to Datagrok as a Pandas dataframe
|
|
300
416
|
import pandas as pd
|
|
301
417
|
|
|
302
|
-
sequences = pd.DataFrame.from_records(
|
|
418
|
+
sequences = pd.DataFrame.from_records(data_formatted, columns=header)
|
|
303
419
|
else:
|
|
304
420
|
# Writing results to stdout - no need to work with big and heavy Pandas
|
|
305
421
|
import csv
|
|
306
422
|
|
|
307
423
|
csv_writer = csv.writer(sys.stdout, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
|
|
308
424
|
csv_writer.writerow(header)
|
|
309
|
-
for line in
|
|
425
|
+
for line in data_formatted:
|
|
310
426
|
csv_writer.writerow(line)
|
|
@@ -6,8 +6,6 @@ import {ITooltipAndPanelParams} from '@datagrok-libraries/ml/src/viewers/activit
|
|
|
6
6
|
import {getSimilarityFromDistance} from '@datagrok-libraries/ml/src/distance-metrics-methods';
|
|
7
7
|
import {AvailableMetrics, DistanceMetricsSubjects, StringMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
8
8
|
import {drawMoleculeDifferenceOnCanvas} from '../utils/cell-renderer';
|
|
9
|
-
import * as C from '../utils/constants';
|
|
10
|
-
import {GridColumn} from 'datagrok-api/dg';
|
|
11
9
|
import {invalidateMols, MONOMERIC_COL_TAGS} from '../substructure-search/substructure-search';
|
|
12
10
|
import {getSplitter, TAGS as bioTAGS} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
13
11
|
|
|
@@ -24,7 +22,7 @@ export async function getDistances(col: DG.Column, seq: string): Promise<Array<n
|
|
|
24
22
|
}
|
|
25
23
|
|
|
26
24
|
export async function getSimilaritiesMatrix(
|
|
27
|
-
dim: number, seqCol: DG.Column, df: DG.DataFrame, colName: string, simArr: DG.Column[]
|
|
25
|
+
dim: number, seqCol: DG.Column, df: DG.DataFrame, colName: string, simArr: DG.Column[],
|
|
28
26
|
): Promise<DG.Column[]> {
|
|
29
27
|
const distances = new Array(simArr.length).fill(null);
|
|
30
28
|
for (let i = 0; i != dim - 1; ++i) {
|
|
@@ -54,7 +52,7 @@ export async function getChemSimilaritiesMatrix(dim: number, seqCol: DG.Column,
|
|
|
54
52
|
col: seqCol.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS],
|
|
55
53
|
df: fpDf,
|
|
56
54
|
colName: colName,
|
|
57
|
-
simArr: simArr
|
|
55
|
+
simArr: simArr,
|
|
58
56
|
});
|
|
59
57
|
return res;
|
|
60
58
|
}
|
|
@@ -69,7 +67,7 @@ export function createTooltipElement(params: ITooltipAndPanelParams): HTMLDivEle
|
|
|
69
67
|
columnNames.style.display = 'flex';
|
|
70
68
|
columnNames.style.justifyContent = 'space-between';
|
|
71
69
|
tooltipElement.append(columnNames);
|
|
72
|
-
params.line.mols.forEach((molIdx: number,
|
|
70
|
+
params.line.mols.forEach((molIdx: number, _idx: number) => {
|
|
73
71
|
const activity = ui.divText(params.activityCol.get(molIdx).toFixed(2));
|
|
74
72
|
activity.style.display = 'flex';
|
|
75
73
|
activity.style.justifyContent = 'left';
|
|
@@ -82,7 +80,7 @@ export function createTooltipElement(params: ITooltipAndPanelParams): HTMLDivEle
|
|
|
82
80
|
return tooltipElement;
|
|
83
81
|
}
|
|
84
82
|
|
|
85
|
-
function
|
|
83
|
+
function _moleculeInfo(df: DG.DataFrame, idx: number, seqColName: string): HTMLElement {
|
|
86
84
|
const dict: { [key: string]: string } = {};
|
|
87
85
|
for (const col of df.columns) {
|
|
88
86
|
if (col.name !== seqColName)
|
|
@@ -124,7 +122,7 @@ export function createPropPanelElement(params: ITooltipAndPanelParams): HTMLDivE
|
|
|
124
122
|
function createPropPanelField(name: string, value: number): HTMLDivElement {
|
|
125
123
|
return ui.divH([
|
|
126
124
|
ui.divText(`${name}: `, {style: {fontWeight: 'bold', paddingRight: '5px'}}),
|
|
127
|
-
ui.divText(value.toFixed(2))
|
|
125
|
+
ui.divText(value.toFixed(2)),
|
|
128
126
|
], {style: {paddingTop: '10px'}});
|
|
129
127
|
}
|
|
130
128
|
|
|
@@ -147,13 +145,13 @@ export function createDifferencesWithPositions(
|
|
|
147
145
|
const diffsPanel = ui.divV([]);
|
|
148
146
|
diffsPanel.append(ui.divH([
|
|
149
147
|
ui.divText('Pos', {style: {fontWeight: 'bold', width: '30px', borderBottom: '1px solid'}}),
|
|
150
|
-
ui.divText('Difference', {style: {fontWeight: 'bold', borderBottom: '1px solid'}})
|
|
148
|
+
ui.divText('Difference', {style: {fontWeight: 'bold', borderBottom: '1px solid'}}),
|
|
151
149
|
]));
|
|
152
150
|
for (const key of Object.keys(molDifferences)) {
|
|
153
151
|
molDifferences[key as any].style.borderBottom = '1px solid lightgray';
|
|
154
152
|
diffsPanel.append(ui.divH([
|
|
155
153
|
ui.divText((parseInt(key) + 1).toString(), {style: {width: '30px', borderBottom: '1px solid lightgray'}}),
|
|
156
|
-
molDifferences[key as any]
|
|
154
|
+
molDifferences[key as any],
|
|
157
155
|
]));
|
|
158
156
|
}
|
|
159
157
|
div.append(diffsPanel);
|
|
@@ -27,7 +27,7 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
27
27
|
return;
|
|
28
28
|
if (this.dataFrame) {
|
|
29
29
|
if (computeData && this.moleculeColumn) {
|
|
30
|
-
const uh =
|
|
30
|
+
const uh = UnitsHandler.getOrCreate(this.moleculeColumn);
|
|
31
31
|
await (uh.isFasta() ? this.computeByMM() : this.computeByChem());
|
|
32
32
|
|
|
33
33
|
const diverseColumnName: string = this.diverseColumnLabel != null ? this.diverseColumnLabel :
|
|
@@ -37,6 +37,8 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
37
37
|
resCol.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
38
38
|
this.tags.forEach((tag) => resCol.setTag(tag, this.moleculeColumn!.getTag(tag)));
|
|
39
39
|
const resDf = DG.DataFrame.fromColumns([resCol]);
|
|
40
|
+
resDf.onCurrentRowChanged.subscribe(
|
|
41
|
+
(_) => { this.dataFrame.currentRowIdx = this.renderMolIds![resDf.currentRowIdx]; });
|
|
40
42
|
updateDivInnerHTML(this.root, resDf.plot.grid().root);
|
|
41
43
|
this.computeCompleted.next(true);
|
|
42
44
|
}
|
|
@@ -51,7 +53,7 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
51
53
|
col: monomericMols,
|
|
52
54
|
metricName: this.distanceMetric,
|
|
53
55
|
limit: this.limit,
|
|
54
|
-
fingerprint: this.fingerprint
|
|
56
|
+
fingerprint: this.fingerprint,
|
|
55
57
|
});
|
|
56
58
|
}
|
|
57
59
|
|
|
@@ -60,6 +62,9 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
60
62
|
const len = this.moleculeColumn!.length;
|
|
61
63
|
const linearizeFunc = dmLinearIndex(len);
|
|
62
64
|
this.renderMolIds = getDiverseSubset(len, Math.min(len, this.limit),
|
|
63
|
-
(i1: number, i2: number) =>
|
|
65
|
+
(i1: number, i2: number) => {
|
|
66
|
+
return this.moleculeColumn!.isNone(i1) || this.moleculeColumn!.isNone(i2) ? 0 :
|
|
67
|
+
distanceMatrixData[linearizeFunc(i1, i2)];
|
|
68
|
+
});
|
|
64
69
|
}
|
|
65
70
|
}
|
|
@@ -4,8 +4,8 @@ import * as grok from 'datagrok-api/grok';
|
|
|
4
4
|
|
|
5
5
|
import {CHEM_SIMILARITY_METRICS} from '@datagrok-libraries/ml/src/distance-metrics-methods';
|
|
6
6
|
import {TAGS as bioTAGS} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
7
|
-
import * as C from '../utils/constants';
|
|
8
7
|
|
|
8
|
+
const MAX_ROWS_FOR_DISTANCE_MATRIX = 22000;
|
|
9
9
|
export class SequenceSearchBaseViewer extends DG.JsViewer {
|
|
10
10
|
name: string = '';
|
|
11
11
|
distanceMetric: string;
|
|
@@ -17,7 +17,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
|
|
|
17
17
|
moleculeColumnName: string;
|
|
18
18
|
initialized: boolean = false;
|
|
19
19
|
tags = [DG.TAGS.UNITS, bioTAGS.aligned, bioTAGS.separator, bioTAGS.alphabet];
|
|
20
|
-
|
|
20
|
+
preComputeDistanceMatrix: boolean = false;
|
|
21
21
|
constructor(name: string) {
|
|
22
22
|
super();
|
|
23
23
|
this.fingerprint = this.string('fingerprint', this.fingerprintChoices[0], {choices: this.fingerprintChoices});
|
|
@@ -39,6 +39,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
|
|
|
39
39
|
this.init();
|
|
40
40
|
|
|
41
41
|
if (this.dataFrame) {
|
|
42
|
+
this.preComputeDistanceMatrix = this.dataFrame.rowCount <= MAX_ROWS_FOR_DISTANCE_MATRIX;
|
|
42
43
|
this.subs.push(DG.debounce(this.dataFrame.onRowsRemoved, 50).subscribe(async (_: any) => await this.render()));
|
|
43
44
|
const compute = this.name !== 'diversity';
|
|
44
45
|
this.subs.push(DG.debounce(this.dataFrame.onCurrentRowChanged, 50)
|
|
@@ -66,7 +67,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
|
|
|
66
67
|
this.render();
|
|
67
68
|
}
|
|
68
69
|
|
|
69
|
-
async render(
|
|
70
|
+
async render(_computeData = true) {
|
|
70
71
|
|
|
71
72
|
}
|
|
72
73
|
|
|
@@ -4,13 +4,13 @@ import * as DG from 'datagrok-api/dg';
|
|
|
4
4
|
|
|
5
5
|
import {SequenceSearchBaseViewer} from './sequence-search-base-viewer';
|
|
6
6
|
import {getMonomericMols} from '../calculations/monomerLevelMols';
|
|
7
|
-
import * as C from '../utils/constants';
|
|
8
7
|
import {createDifferenceCanvas, createDifferencesWithPositions} from './sequence-activity-cliffs';
|
|
9
8
|
import {updateDivInnerHTML} from '../utils/ui-utils';
|
|
10
9
|
import {Subject} from 'rxjs';
|
|
11
10
|
import {TAGS as bioTAGS, getSplitter} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
12
11
|
import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
|
|
13
12
|
import {calcMmDistanceMatrix, dmLinearIndex} from './workers/mm-distance-worker-creator';
|
|
13
|
+
import {calculateMMDistancesArray} from './workers/mm-distance-array-service';
|
|
14
14
|
|
|
15
15
|
export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
16
16
|
cutoff: number;
|
|
@@ -47,7 +47,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
47
47
|
this.curIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
|
|
48
48
|
if (computeData && !this.gridSelect) {
|
|
49
49
|
this.targetMoleculeIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
|
|
50
|
-
const uh =
|
|
50
|
+
const uh = UnitsHandler.getOrCreate(this.moleculeColumn!);
|
|
51
51
|
|
|
52
52
|
await (uh.isFasta() ? this.computeByMM() : this.computeByChem());
|
|
53
53
|
const similarColumnName: string = this.similarColumnLabel != null ? this.similarColumnLabel :
|
|
@@ -67,7 +67,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
67
67
|
const targetMolRow = this.idxs?.getRawData().findIndex((it) => it == this.targetMoleculeIdx);
|
|
68
68
|
const targetScoreCell = grid.cell('score', targetMolRow!);
|
|
69
69
|
targetScoreCell.cell.value = null;
|
|
70
|
-
(grok.shell.v as DG.TableView).grid.root.addEventListener('click', (
|
|
70
|
+
(grok.shell.v as DG.TableView).grid.root.addEventListener('click', (_event: MouseEvent) => {
|
|
71
71
|
this.gridSelect = false;
|
|
72
72
|
});
|
|
73
73
|
updateDivInnerHTML(this.root, grid.root);
|
|
@@ -87,23 +87,29 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
87
87
|
metricName: this.distanceMetric,
|
|
88
88
|
limit: this.limit,
|
|
89
89
|
minScore: this.cutoff,
|
|
90
|
-
fingerprint: this.fingerprint
|
|
90
|
+
fingerprint: this.fingerprint,
|
|
91
91
|
});
|
|
92
92
|
this.idxs = df.getCol('indexes');
|
|
93
93
|
this.scores = df.getCol('score');
|
|
94
94
|
}
|
|
95
95
|
|
|
96
96
|
private async computeByMM() {
|
|
97
|
-
|
|
97
|
+
let distanceArray = new Float32Array();
|
|
98
|
+
if (!this.distanceMatrixComputed && this.preComputeDistanceMatrix) {
|
|
98
99
|
this.mmDistanceMatrix = await calcMmDistanceMatrix(this.moleculeColumn!);
|
|
99
100
|
this.distanceMatrixComputed = true;
|
|
101
|
+
} else if (!this.preComputeDistanceMatrix) {
|
|
102
|
+
// use fast distance array calculation if matrix will take too much space
|
|
103
|
+
distanceArray = await calculateMMDistancesArray(this.moleculeColumn!, this.targetMoleculeIdx);
|
|
100
104
|
}
|
|
101
105
|
const len = this.moleculeColumn!.length;
|
|
102
106
|
const linearizeFunc = dmLinearIndex(len);
|
|
103
107
|
// array that keeps track of the indexes and scores together
|
|
104
108
|
const indexWScore = Array(len).fill(0)
|
|
105
109
|
.map((_, i) => ({idx: i, score: i === this.targetMoleculeIdx ? 1 :
|
|
106
|
-
1 - this.mmDistanceMatrix[linearizeFunc(this.targetMoleculeIdx, i)]
|
|
110
|
+
this.preComputeDistanceMatrix ? 1 - this.mmDistanceMatrix[linearizeFunc(this.targetMoleculeIdx, i)] :
|
|
111
|
+
1 - distanceArray[i]
|
|
112
|
+
}));
|
|
107
113
|
indexWScore.sort((a, b) => b.score - a.score);
|
|
108
114
|
// get the most similar molecules
|
|
109
115
|
const actualLimit = Math.min(this.limit, len);
|
|
@@ -127,7 +133,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
127
133
|
propPanel.append(ui.divV([
|
|
128
134
|
ui.divText(`Different sequence length:`, {style: {fontWeight: 'bold'}}),
|
|
129
135
|
ui.divText(`target: ${subParts1.length} monomers`),
|
|
130
|
-
ui.divText(`selected: ${subParts2.length} monomers`)
|
|
136
|
+
ui.divText(`selected: ${subParts2.length} monomers`),
|
|
131
137
|
], {style: {paddingBottom: '10px'}}));
|
|
132
138
|
}
|
|
133
139
|
propPanel.append(createDifferencesWithPositions(molDifferences));
|
|
@@ -1,19 +1,17 @@
|
|
|
1
1
|
import * as DG from 'datagrok-api/dg';
|
|
2
|
-
import {AvailableMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
3
2
|
import {reduceDimensinalityWithNormalization} from '@datagrok-libraries/ml/src/sequence-space';
|
|
4
3
|
import {BitArrayMetrics, StringMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
5
4
|
import {Matrix} from '@datagrok-libraries/utils/src/type-declarations';
|
|
6
|
-
import BitArray from '@datagrok-libraries/utils/src/bit-array';
|
|
7
5
|
import {ISequenceSpaceParams} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
|
|
8
6
|
import {invalidateMols, MONOMERIC_COL_TAGS} from '../substructure-search/substructure-search';
|
|
9
7
|
import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
|
|
10
8
|
import * as grok from 'datagrok-api/grok';
|
|
11
|
-
import {
|
|
12
|
-
import {
|
|
13
|
-
import {
|
|
9
|
+
import {NotationConverter} from '@datagrok-libraries/bio/src/utils/notation-converter';
|
|
10
|
+
import {ALPHABET, NOTATION} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
11
|
+
import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
14
12
|
|
|
15
13
|
export interface ISequenceSpaceResult {
|
|
16
|
-
distance
|
|
14
|
+
distance?: Float32Array;
|
|
17
15
|
coordinates: DG.ColumnList;
|
|
18
16
|
}
|
|
19
17
|
|
|
@@ -44,7 +42,8 @@ export async function sequenceSpace(spaceParams: ISequenceSpaceParams): Promise<
|
|
|
44
42
|
|
|
45
43
|
export async function sequenceSpaceByFingerprints(spaceParams: ISequenceSpaceParams): Promise<ISequenceSpaceResult> {
|
|
46
44
|
if (spaceParams.seqCol.version !== spaceParams.seqCol.temp[MONOMERIC_COL_TAGS.LAST_INVALIDATED_VERSION])
|
|
47
|
-
|
|
45
|
+
//we expect only string columns here
|
|
46
|
+
await invalidateMols(spaceParams.seqCol as unknown as DG.Column<string>, false);
|
|
48
47
|
|
|
49
48
|
const result = await grok.functions.call('Chem:getChemSpaceEmbeddings', {
|
|
50
49
|
col: spaceParams.seqCol.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS],
|
|
@@ -52,7 +51,7 @@ export async function sequenceSpaceByFingerprints(spaceParams: ISequenceSpacePar
|
|
|
52
51
|
similarityMetric: spaceParams.similarityMetric,
|
|
53
52
|
xAxis: spaceParams.embedAxesNames[0],
|
|
54
53
|
yAxis: spaceParams.embedAxesNames[1],
|
|
55
|
-
options: spaceParams.options
|
|
54
|
+
options: spaceParams.options,
|
|
56
55
|
});
|
|
57
56
|
return result;
|
|
58
57
|
}
|
|
@@ -65,17 +64,21 @@ export async function getSequenceSpace(spaceParams: ISequenceSpaceParams): Promi
|
|
|
65
64
|
if (nc.isSeparator()) {
|
|
66
65
|
const fastaCol = nc.convert(NOTATION.FASTA);
|
|
67
66
|
seqList = fastaCol.toList();
|
|
68
|
-
const uh =
|
|
67
|
+
const uh = UnitsHandler.getOrCreate(fastaCol);
|
|
69
68
|
distanceFName = uh.getDistanceFunctionName();
|
|
70
|
-
}
|
|
71
|
-
else {
|
|
69
|
+
} else {
|
|
72
70
|
distanceFName = nc.getDistanceFunctionName();
|
|
73
71
|
}
|
|
72
|
+
for (let i = 0; i < seqList.length; i++) {
|
|
73
|
+
// toList puts empty values in array and it causes downstream errors. replace with null
|
|
74
|
+
seqList[i] = spaceParams.seqCol.isNone(i) ? null : seqList[i];
|
|
75
|
+
}
|
|
74
76
|
const sequenceSpaceResult = await reduceDimensinalityWithNormalization(
|
|
75
77
|
seqList,
|
|
76
78
|
spaceParams.methodName,
|
|
77
79
|
distanceFName,
|
|
78
|
-
spaceParams.options
|
|
80
|
+
spaceParams.options,
|
|
81
|
+
true);
|
|
79
82
|
const cols: DG.Column[] = spaceParams.embedAxesNames.map(
|
|
80
83
|
(name: string, index: number) => DG.Column.fromFloat32Array(name, sequenceSpaceResult.embedding[index]));
|
|
81
84
|
return {distance: sequenceSpaceResult.distance, coordinates: new DG.ColumnList(cols)};
|