@datagrok/bio 2.4.30 → 2.4.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +6 -8
- package/README.md +22 -7
- package/detectors.js +21 -12
- package/dist/1.js +2 -0
- package/dist/1.js.map +1 -0
- package/dist/18.js +2 -0
- package/dist/18.js.map +1 -0
- package/dist/190.js +2 -0
- package/dist/190.js.map +1 -0
- package/dist/452.js +2 -0
- package/dist/452.js.map +1 -0
- package/dist/729.js +2 -0
- package/dist/729.js.map +1 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/files/libraries/broken-lib.sdf +136 -0
- package/files/libraries/group1/mock-lib-3.json +74 -0
- package/files/libraries/mock-lib-2.json +48 -0
- package/files/tests/100_3_clustests.csv +100 -0
- package/files/tests/100_3_clustests_empty_vals.csv +100 -0
- package/files/tests/peptides_motif-with-random_10000.csv +9998 -0
- package/package.json +4 -4
- package/scripts/sequence_generator.py +185 -48
- package/src/analysis/sequence-activity-cliffs.ts +9 -11
- package/src/analysis/sequence-diversity-viewer.ts +8 -3
- package/src/analysis/sequence-search-base-viewer.ts +4 -3
- package/src/analysis/sequence-similarity-viewer.ts +13 -7
- package/src/analysis/sequence-space.ts +15 -12
- package/src/analysis/workers/mm-distance-array-service.ts +48 -0
- package/src/analysis/workers/mm-distance-array-worker.ts +29 -0
- package/src/analysis/workers/mm-distance-worker-creator.ts +6 -9
- package/src/apps/web-logo-app.ts +34 -0
- package/src/calculations/monomerLevelMols.ts +10 -12
- package/src/demo/bio01-similarity-diversity.ts +4 -5
- package/src/demo/bio01a-hierarchical-clustering-and-sequence-space.ts +6 -7
- package/src/demo/bio01b-hierarchical-clustering-and-activity-cliffs.ts +8 -8
- package/src/demo/bio03-atomic-level.ts +1 -4
- package/src/demo/bio05-helm-msa-sequence-space.ts +8 -5
- package/src/demo/utils.ts +4 -3
- package/src/package-test.ts +1 -2
- package/src/package.ts +138 -83
- package/src/seq_align.ts +482 -483
- package/src/substructure-search/substructure-search.ts +3 -3
- package/src/tests/Palettes-test.ts +1 -1
- package/src/tests/WebLogo-positions-test.ts +12 -35
- package/src/tests/_first-tests.ts +1 -1
- package/src/tests/activity-cliffs-tests.ts +10 -6
- package/src/tests/activity-cliffs-utils.ts +6 -4
- package/src/tests/bio-tests.ts +20 -25
- package/src/tests/checkInputColumn-tests.ts +5 -11
- package/src/tests/converters-test.ts +19 -37
- package/src/tests/detectors-benchmark-tests.ts +35 -37
- package/src/tests/detectors-tests.ts +29 -34
- package/src/tests/detectors-weak-and-likely-tests.ts +11 -21
- package/src/tests/fasta-export-tests.ts +3 -3
- package/src/tests/fasta-handler-test.ts +2 -3
- package/src/tests/lib-tests.ts +2 -4
- package/src/tests/mm-distance-tests.ts +25 -17
- package/src/tests/monomer-libraries-tests.ts +1 -1
- package/src/tests/msa-tests.ts +12 -9
- package/src/tests/pepsea-tests.ts +6 -3
- package/src/tests/renderers-test.ts +13 -11
- package/src/tests/sequence-space-test.ts +10 -7
- package/src/tests/sequence-space-utils.ts +7 -3
- package/src/tests/similarity-diversity-tests.ts +47 -61
- package/src/tests/splitters-test.ts +14 -20
- package/src/tests/to-atomic-level-tests.ts +9 -17
- package/src/tests/units-handler-splitted-tests.ts +106 -0
- package/src/tests/units-handler-tests.ts +22 -26
- package/src/tests/utils/sequences-generators.ts +6 -2
- package/src/tests/utils.ts +10 -4
- package/src/tests/viewers.ts +1 -1
- package/src/utils/atomic-works.ts +49 -57
- package/src/utils/cell-renderer.ts +25 -8
- package/src/utils/check-input-column.ts +19 -4
- package/src/utils/constants.ts +3 -3
- package/src/utils/convert.ts +56 -23
- package/src/utils/monomer-lib.ts +83 -64
- package/src/utils/multiple-sequence-alignment-ui.ts +24 -21
- package/src/utils/multiple-sequence-alignment.ts +2 -2
- package/src/utils/pepsea.ts +17 -7
- package/src/utils/save-as-fasta.ts +11 -4
- package/src/utils/ui-utils.ts +1 -1
- package/src/viewers/vd-regions-viewer.ts +21 -22
- package/src/viewers/web-logo-viewer.ts +189 -154
- package/src/widgets/bio-substructure-filter.ts +9 -6
- package/src/widgets/representations.ts +11 -12
- package/tsconfig.json +1 -1
- package/dist/258.js +0 -2
- package/dist/258.js.map +0 -1
- package/dist/562.js +0 -2
- package/dist/562.js.map +0 -1
- package/dist/705.js +0 -2
- package/dist/705.js.map +0 -1
- package/dist/925.js +0 -2
- package/dist/925.js.map +0 -1
- package/src/analysis/workers/mm-distance-worker.ts +0 -16
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"name": "Leonid Stolbov",
|
|
6
6
|
"email": "lstolbov@datagrok.ai"
|
|
7
7
|
},
|
|
8
|
-
"version": "2.4.30",
|
|
8
|
+
"version": "2.4.39",
|
|
9
9
|
"description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -14,11 +14,11 @@
|
|
|
14
14
|
},
|
|
15
15
|
"dependencies": {
|
|
16
16
|
"@biowasm/aioli": "^3.1.0",
|
|
17
|
-
"@datagrok-libraries/bio": "^5.
|
|
17
|
+
"@datagrok-libraries/bio": "^5.32.1",
|
|
18
18
|
"@datagrok-libraries/chem-meta": "^1.0.1",
|
|
19
|
-
"@datagrok-libraries/ml": "^6.3.
|
|
19
|
+
"@datagrok-libraries/ml": "^6.3.37",
|
|
20
20
|
"@datagrok-libraries/tutorials": "^1.3.2",
|
|
21
|
-
"@datagrok-libraries/utils": "^4.0.
|
|
21
|
+
"@datagrok-libraries/utils": "^4.0.11",
|
|
22
22
|
"cash-dom": "^8.0.0",
|
|
23
23
|
"css-loader": "^6.7.3",
|
|
24
24
|
"datagrok-api": "^1.13.3",
|
|
package/scripts/sequence_generator.py
CHANGED
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
# description: Create the model peptides/DNA sequences with peptides data
|
|
4
4
|
# language: python
|
|
5
5
|
# tags: template, demo
|
|
6
|
-
# input: int clusters =
|
|
7
|
-
# input: int num_sequences =
|
|
6
|
+
# input: int clusters = 5 [Number of superclusters]
|
|
7
|
+
# input: int num_sequences = 50 [Number of sequences in each supercluster]
|
|
8
8
|
# input: int motif_length = 12 [Average length of motif]
|
|
9
9
|
# input: int max_variants_position = 3 [Maximum number of different letters in conservative position in motif]
|
|
10
10
|
# input: int random_length = 3 [Average length of random sequence parts before and after motif]
|
|
@@ -13,21 +13,34 @@
|
|
|
13
13
|
# input: bool disable_cliffs = False [Disable generation of cliffs]
|
|
14
14
|
# input: double cliff_probability = 0.01 [Probability to make activity cliff of a sequence]
|
|
15
15
|
# input: double cliff_strength = 4.0 [Strength of cliff]
|
|
16
|
+
# input: double fasta_separator = '' [Separator for a FASTA notation]
|
|
16
17
|
# output: dataframe sequences
|
|
17
18
|
|
|
18
19
|
import random
|
|
19
20
|
import argparse
|
|
20
21
|
import sys
|
|
22
|
+
from enum import Enum
|
|
21
23
|
|
|
22
24
|
from typing import List, Tuple, Dict, Iterator, Any
|
|
23
25
|
|
|
24
|
-
alphabet_type = List[str]
|
|
25
26
|
|
|
26
|
-
|
|
27
|
-
motif_template_type = List[letter_choice_type]
|
|
27
|
+
# --- Type definitions ---
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
29
|
+
Letter = str
|
|
30
|
+
Alphabet = List[str]
|
|
31
|
+
|
|
32
|
+
LetterChoice = List[Letter]
|
|
33
|
+
MotifTemplate = List[LetterChoice]
|
|
34
|
+
|
|
35
|
+
Sequence = List[Letter] # The sequence in a form of list
|
|
36
|
+
SequenceSquashed = str # Sequence, joined together in string form
|
|
37
|
+
|
|
38
|
+
SequenceRecord = Tuple[int, Sequence, float, bool]
|
|
39
|
+
ClusterSequenceRecord = Tuple[int, str, Sequence, float, bool]
|
|
40
|
+
|
|
41
|
+
# --- constants ---
|
|
42
|
+
|
|
43
|
+
HelmConnectionMode = Enum("HelmConnectionMode", ["linear", "cyclic", "mixed"])
|
|
31
44
|
|
|
32
45
|
alphabets: Dict[str, str] = {
|
|
33
46
|
"PT": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
|
|
@@ -42,10 +55,10 @@ def mean_range(mean: int, disp: int) -> int:
|
|
|
42
55
|
|
|
43
56
|
def generate_motif_template(
|
|
44
57
|
motif_length: int,
|
|
45
|
-
alphabet:
|
|
58
|
+
alphabet: Alphabet,
|
|
46
59
|
max_variants_cluster: int,
|
|
47
60
|
prob_any: float = 0.2,
|
|
48
|
-
) ->
|
|
61
|
+
) -> MotifTemplate:
|
|
49
62
|
motif_template = []
|
|
50
63
|
for position in range(motif_length):
|
|
51
64
|
# Selecting letters for position i
|
|
@@ -53,37 +66,75 @@ def generate_motif_template(
|
|
|
53
66
|
letters = ["?"] # this stands for any symbol
|
|
54
67
|
else:
|
|
55
68
|
n_variants = random.randrange(max_variants_cluster) + 1
|
|
56
|
-
letters =
|
|
69
|
+
letters = list(set((random.choice(alphabet) for i in range(n_variants))))
|
|
57
70
|
motif_template.append(letters)
|
|
58
71
|
return motif_template
|
|
59
72
|
|
|
60
73
|
|
|
61
|
-
def generate_motif(template:
|
|
62
|
-
template_with_any = [
|
|
63
|
-
|
|
74
|
+
def generate_motif(template: MotifTemplate, alphabet: Alphabet) -> Sequence:
|
|
75
|
+
template_with_any = [
|
|
76
|
+
(letters if not "?" in letters else alphabet) for letters in template
|
|
77
|
+
]
|
|
78
|
+
return [random.choice(letters) for letters in template_with_any]
|
|
64
79
|
|
|
65
80
|
|
|
66
|
-
def motif_notation(motif_template:
|
|
67
|
-
def motif_notation_code(letter_choice:
|
|
81
|
+
def motif_notation(motif_template: MotifTemplate) -> str:
|
|
82
|
+
def motif_notation_code(letter_choice: LetterChoice) -> str:
|
|
68
83
|
if len(letter_choice) == 1:
|
|
69
84
|
return letter_choice[0]
|
|
70
85
|
else:
|
|
71
86
|
return f"[{''.join(letter_choice)}]"
|
|
72
87
|
|
|
73
|
-
return "".join(
|
|
88
|
+
return "".join(
|
|
89
|
+
[motif_notation_code(letter_choice) for letter_choice in motif_template]
|
|
90
|
+
)
|
|
74
91
|
|
|
75
92
|
|
|
76
|
-
def generate_random(n: int, alphabet:
|
|
77
|
-
return
|
|
93
|
+
def generate_random(n: int, alphabet: Alphabet) -> Sequence:
|
|
94
|
+
return [random.choice(alphabet) for i in range(n)]
|
|
78
95
|
|
|
79
96
|
|
|
80
|
-
def make_cliff(
|
|
97
|
+
def make_cliff(
|
|
98
|
+
motif_template: MotifTemplate, alphabet: Alphabet, motif: Sequence
|
|
99
|
+
) -> Sequence:
|
|
81
100
|
# Mutate conservative letter in motif
|
|
82
|
-
|
|
101
|
+
motif_len = len(motif_template)
|
|
102
|
+
pos = random.randrange(motif_len)
|
|
83
103
|
while "?" in motif_template[pos]:
|
|
84
|
-
pos = (
|
|
104
|
+
pos = (
|
|
105
|
+
pos + 1
|
|
106
|
+
) % motif_len # always will find letters since ends of motif can't be any symbol
|
|
85
107
|
outlier_letters = list(set(alphabet) - set(motif_template[pos]))
|
|
86
|
-
|
|
108
|
+
new_letter = random.choice(outlier_letters)
|
|
109
|
+
return (
|
|
110
|
+
motif[:pos]
|
|
111
|
+
+ [
|
|
112
|
+
new_letter,
|
|
113
|
+
]
|
|
114
|
+
+ motif[pos + 1 :]
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def sequence_to_fasta(sequence: Sequence, separator: str) -> SequenceSquashed:
|
|
119
|
+
return separator.join(sequence)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def sequence_to_helm(
|
|
123
|
+
sequence: Sequence, helm_connection_mode: str = HelmConnectionMode.linear.name
|
|
124
|
+
) -> SequenceSquashed:
|
|
125
|
+
def is_cyclic(helm_connection_mode: str) -> bool:
|
|
126
|
+
return helm_connection_mode == HelmConnectionMode.cyclic.name or (
|
|
127
|
+
helm_connection_mode == HelmConnectionMode.mixed.name
|
|
128
|
+
and random.random() < 0.5
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
sequence_escaped: Sequence = [
|
|
132
|
+
f"[{letter}]" if len(letter) > 1 else letter for letter in sequence
|
|
133
|
+
]
|
|
134
|
+
connection_format = ""
|
|
135
|
+
if is_cyclic(helm_connection_mode):
|
|
136
|
+
connection_format = f"PEPTIDE1,PEPTIDE1,{len(sequence_escaped)}:R2-1:R1"
|
|
137
|
+
return f"PEPTIDE1{{{sequence_to_fasta(sequence_escaped,'.')}}}${connection_format}$$$V2.0"
|
|
87
138
|
|
|
88
139
|
|
|
89
140
|
def generate_cluster(
|
|
@@ -91,14 +142,17 @@ def generate_cluster(
|
|
|
91
142
|
motif_length: int,
|
|
92
143
|
prefix_length: int,
|
|
93
144
|
suffix_length: int,
|
|
94
|
-
|
|
145
|
+
max_variants_per_position: int,
|
|
95
146
|
make_cliffs: bool,
|
|
96
|
-
alphabet:
|
|
147
|
+
alphabet: Alphabet,
|
|
97
148
|
cliff_probability: float,
|
|
98
149
|
cliff_strength: float,
|
|
99
|
-
) -> Iterator[
|
|
100
|
-
|
|
101
|
-
|
|
150
|
+
) -> Iterator[SequenceRecord]:
|
|
151
|
+
# Making a motif template
|
|
152
|
+
motif_template = generate_motif_template(
|
|
153
|
+
motif_length, alphabet, max_variants_per_position
|
|
154
|
+
)
|
|
155
|
+
# Setting average and dispersion for activity
|
|
102
156
|
activity_average = random.random() * 10
|
|
103
157
|
activity_dispersion = random.random()
|
|
104
158
|
sys.stderr.write(f"Motif template: {motif_notation(motif_template)}\n")
|
|
@@ -110,11 +164,10 @@ def generate_cluster(
|
|
|
110
164
|
prefix = generate_random(prefix_length, alphabet)
|
|
111
165
|
suffix = generate_random(suffix_length, alphabet)
|
|
112
166
|
seq = prefix + motif + suffix
|
|
113
|
-
|
|
114
|
-
is_cliff = make_cliffs and (random.random() <= cliff_probability)
|
|
115
|
-
sequence_record: sequence_record_type = (n_seq, seq, activity, is_cliff)
|
|
167
|
+
sequence_record: SequenceRecord = (n_seq, seq, activity, False)
|
|
116
168
|
yield sequence_record
|
|
117
169
|
|
|
170
|
+
is_cliff = make_cliffs and (random.random() <= cliff_probability)
|
|
118
171
|
if is_cliff:
|
|
119
172
|
# Making activity cliff
|
|
120
173
|
cliff_motif = make_cliff(motif_template, alphabet, motif)
|
|
@@ -136,16 +189,16 @@ def generate_sequences(
|
|
|
136
189
|
n_clusters: int,
|
|
137
190
|
n_sequences: int,
|
|
138
191
|
average_motif_length: int,
|
|
139
|
-
|
|
192
|
+
max_variants_per_position: int,
|
|
140
193
|
average_random_length: int,
|
|
141
194
|
dispersion: int,
|
|
142
|
-
alphabet:
|
|
195
|
+
alphabet: Alphabet,
|
|
143
196
|
make_cliffs: bool,
|
|
144
197
|
cliff_probability: float,
|
|
145
198
|
cliff_strength: float,
|
|
146
|
-
) -> Tuple[List[str], List[
|
|
199
|
+
) -> Tuple[List[str], List[ClusterSequenceRecord]]:
|
|
147
200
|
headers: List[str] = ["cluster", "sequence_id", "sequence", "activity", "is_cliff"]
|
|
148
|
-
sequences: List[
|
|
201
|
+
sequences: List[ClusterSequenceRecord] = []
|
|
149
202
|
|
|
150
203
|
for n_cluster in range(n_clusters):
|
|
151
204
|
motif_length = mean_range(average_motif_length, dispersion)
|
|
@@ -160,33 +213,82 @@ def generate_sequences(
|
|
|
160
213
|
motif_length,
|
|
161
214
|
prefix_length,
|
|
162
215
|
suffix_length,
|
|
163
|
-
|
|
216
|
+
max_variants_per_position,
|
|
164
217
|
make_cliffs,
|
|
165
218
|
alphabet,
|
|
166
219
|
cliff_probability,
|
|
167
220
|
cliff_strength,
|
|
168
221
|
):
|
|
169
|
-
sequences.append(
|
|
222
|
+
sequences.append(
|
|
223
|
+
(n_cluster, f"c{n_cluster}_s{n_seq:03d}", seq, activity, is_cliff)
|
|
224
|
+
)
|
|
170
225
|
return headers, sequences
|
|
171
226
|
|
|
172
227
|
|
|
228
|
+
def convert_to_fasta(
|
|
229
|
+
cluster_sequence_records: List[ClusterSequenceRecord], separator: str
|
|
230
|
+
) -> List[Tuple[int, str, str, float, bool]]:
|
|
231
|
+
return [
|
|
232
|
+
(n_cluster, name_cluster, sequence_to_fasta(seq, separator), activity, is_cliff)
|
|
233
|
+
for n_cluster, name_cluster, seq, activity, is_cliff in cluster_sequence_records
|
|
234
|
+
]
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def convert_to_helm(
|
|
238
|
+
cluster_sequence_records: List[ClusterSequenceRecord], helm_connection_mode: str
|
|
239
|
+
) -> List[Tuple[int, str, str, float, bool]]:
|
|
240
|
+
return [
|
|
241
|
+
(
|
|
242
|
+
n_cluster,
|
|
243
|
+
name_cluster,
|
|
244
|
+
sequence_to_helm(seq, helm_connection_mode),
|
|
245
|
+
activity,
|
|
246
|
+
is_cliff,
|
|
247
|
+
)
|
|
248
|
+
for n_cluster, name_cluster, seq, activity, is_cliff in cluster_sequence_records
|
|
249
|
+
]
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def is_monomer_suitable(monomer: Any) -> bool:
|
|
253
|
+
return (
|
|
254
|
+
monomer["polymerType"] == "PEPTIDE"
|
|
255
|
+
and monomer["monomerType"] == "Backbone"
|
|
256
|
+
and len(monomer["rgroups"]) == 2
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def alphabet_from_helm(helm_library_file: str) -> Alphabet:
|
|
261
|
+
import json
|
|
262
|
+
|
|
263
|
+
alphabet: Alphabet = []
|
|
264
|
+
with open(helm_library_file) as helm_library:
|
|
265
|
+
for monomer in json.load(helm_library):
|
|
266
|
+
if is_monomer_suitable(monomer):
|
|
267
|
+
alphabet.append(monomer["symbol"])
|
|
268
|
+
return alphabet
|
|
269
|
+
|
|
270
|
+
|
|
173
271
|
def parse_command_line_args() -> Any:
|
|
174
272
|
parser = argparse.ArgumentParser(
|
|
175
273
|
prog="MotifSequencesGenerator",
|
|
176
274
|
description="The program generates set of sequences containing sequence motifs "
|
|
177
|
-
"for SAR
|
|
178
|
-
epilog="Utility support: Gennadii Zakharov",
|
|
275
|
+
"for SAR functionality testing",
|
|
276
|
+
epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
|
|
179
277
|
)
|
|
180
278
|
|
|
181
|
-
parser.add_argument(
|
|
279
|
+
parser.add_argument(
|
|
280
|
+
"-c", "--clusters", type=int, default=5, help="Number of clusters"
|
|
281
|
+
)
|
|
182
282
|
parser.add_argument(
|
|
183
283
|
"-s",
|
|
184
284
|
"--sequences",
|
|
185
285
|
type=int,
|
|
186
|
-
default=
|
|
286
|
+
default=50,
|
|
187
287
|
help="Number of sequences in each supercluster",
|
|
188
288
|
)
|
|
189
|
-
parser.add_argument(
|
|
289
|
+
parser.add_argument(
|
|
290
|
+
"-m,", "--motif-length", type=int, default=12, help="Average length of motif"
|
|
291
|
+
)
|
|
190
292
|
|
|
191
293
|
parser.add_argument(
|
|
192
294
|
"-r,",
|
|
@@ -203,12 +305,28 @@ def parse_command_line_args() -> Any:
|
|
|
203
305
|
help="Variation of total sequence length",
|
|
204
306
|
)
|
|
205
307
|
|
|
308
|
+
parser.add_argument(
|
|
309
|
+
"-h,",
|
|
310
|
+
"--helm-library-file",
|
|
311
|
+
type=str,
|
|
312
|
+
help="JSON file containing the HELM monomer library in the same format as used for Datagrok. "
|
|
313
|
+
+ "The alphabet property is ignored when helm library is specified.",
|
|
314
|
+
)
|
|
315
|
+
|
|
316
|
+
parser.add_argument(
|
|
317
|
+
"--helm-connection-mode",
|
|
318
|
+
type=str,
|
|
319
|
+
default=HelmConnectionMode.linear.value,
|
|
320
|
+
help=f"HELM peptide generation mode: {'/'.join([mode.name for mode in HelmConnectionMode])}",
|
|
321
|
+
)
|
|
322
|
+
|
|
206
323
|
available_alphabets = ",".join(list(alphabets.keys()) + ["custom"])
|
|
207
324
|
parser.add_argument(
|
|
208
325
|
"--alphabet",
|
|
209
326
|
type=str,
|
|
210
327
|
default=list(alphabets.keys())[0],
|
|
211
|
-
help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated "
|
|
328
|
+
help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated "
|
|
329
|
+
f"by comma",
|
|
212
330
|
)
|
|
213
331
|
parser.add_argument(
|
|
214
332
|
"--max-variants-position",
|
|
@@ -234,7 +352,12 @@ def parse_command_line_args() -> Any:
|
|
|
234
352
|
default=False,
|
|
235
353
|
help="Disable generation of cliffs",
|
|
236
354
|
)
|
|
237
|
-
|
|
355
|
+
parser.add_argument(
|
|
356
|
+
"--fasta-separator",
|
|
357
|
+
type=str,
|
|
358
|
+
default="",
|
|
359
|
+
help="Separator symbol for FASTA sequence",
|
|
360
|
+
)
|
|
238
361
|
command_line_args = parser.parse_args()
|
|
239
362
|
|
|
240
363
|
return command_line_args
|
|
@@ -257,8 +380,18 @@ if not grok:
|
|
|
257
380
|
disable_cliffs = args.disable_cliffs
|
|
258
381
|
cliff_probability = args.cliff_probability
|
|
259
382
|
cliff_strength = args.cliff_strength
|
|
260
|
-
|
|
261
|
-
|
|
383
|
+
fasta_separator = args.fasta_separator
|
|
384
|
+
helm_library_file = args.helm_library_file
|
|
385
|
+
helm_connection_mode = args.helm_connection_mode
|
|
386
|
+
|
|
387
|
+
if helm_library_file is None:
|
|
388
|
+
alphabet: Alphabet = (
|
|
389
|
+
alphabets[alphabet_key].split(",")
|
|
390
|
+
if alphabet_key in alphabets
|
|
391
|
+
else alphabet_key.split(",")
|
|
392
|
+
)
|
|
393
|
+
else:
|
|
394
|
+
alphabet = alphabet_from_helm(helm_library_file)
|
|
262
395
|
|
|
263
396
|
# Running sequence generator
|
|
264
397
|
header, data = generate_sequences(
|
|
@@ -273,17 +406,21 @@ header, data = generate_sequences(
|
|
|
273
406
|
cliff_probability,
|
|
274
407
|
cliff_strength,
|
|
275
408
|
)
|
|
409
|
+
if helm_library_file is None:
|
|
410
|
+
data_formatted = convert_to_fasta(data, fasta_separator)
|
|
411
|
+
else:
|
|
412
|
+
data_formatted = convert_to_helm(data, helm_connection_mode)
|
|
276
413
|
|
|
277
414
|
if grok:
|
|
278
|
-
# Exporting data to Datagrok as a
|
|
415
|
+
# Exporting data to Datagrok as a Pandas dataframe
|
|
279
416
|
import pandas as pd
|
|
280
417
|
|
|
281
|
-
sequences = pd.DataFrame.from_records(
|
|
418
|
+
sequences = pd.DataFrame.from_records(data_formatted, columns=header)
|
|
282
419
|
else:
|
|
283
420
|
# Writing results to stdout - no need to work with big and heavy Pandas
|
|
284
421
|
import csv
|
|
285
422
|
|
|
286
423
|
csv_writer = csv.writer(sys.stdout, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
|
|
287
424
|
csv_writer.writerow(header)
|
|
288
|
-
for line in
|
|
425
|
+
for line in data_formatted:
|
|
289
426
|
csv_writer.writerow(line)
|
|
package/src/analysis/sequence-activity-cliffs.ts
CHANGED
|
@@ -4,10 +4,8 @@ import * as DG from 'datagrok-api/dg';
|
|
|
4
4
|
|
|
5
5
|
import {ITooltipAndPanelParams} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
|
|
6
6
|
import {getSimilarityFromDistance} from '@datagrok-libraries/ml/src/distance-metrics-methods';
|
|
7
|
-
import {AvailableMetrics,
|
|
7
|
+
import {AvailableMetrics, DistanceMetricsSubjects, StringMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
8
8
|
import {drawMoleculeDifferenceOnCanvas} from '../utils/cell-renderer';
|
|
9
|
-
import * as C from '../utils/constants';
|
|
10
|
-
import {GridColumn} from 'datagrok-api/dg';
|
|
11
9
|
import {invalidateMols, MONOMERIC_COL_TAGS} from '../substructure-search/substructure-search';
|
|
12
10
|
import {getSplitter, TAGS as bioTAGS} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
13
11
|
|
|
@@ -15,7 +13,7 @@ export async function getDistances(col: DG.Column, seq: string): Promise<Array<n
|
|
|
15
13
|
const stringArray = col.toList();
|
|
16
14
|
const distances = new Array(stringArray.length).fill(0);
|
|
17
15
|
const distanceMethod: (x: string, y: string) => number =
|
|
18
|
-
AvailableMetrics[
|
|
16
|
+
AvailableMetrics[DistanceMetricsSubjects.String][StringMetricsNames.Levenshtein];
|
|
19
17
|
for (let i = 0; i < stringArray.length; ++i) {
|
|
20
18
|
const distance = stringArray[i] ? distanceMethod(stringArray[i], seq) : null;
|
|
21
19
|
distances[i] = distance ? distance / Math.max((stringArray[i] as string).length, seq.length) : null;
|
|
@@ -24,7 +22,7 @@ export async function getDistances(col: DG.Column, seq: string): Promise<Array<n
|
|
|
24
22
|
}
|
|
25
23
|
|
|
26
24
|
export async function getSimilaritiesMatrix(
|
|
27
|
-
dim: number, seqCol: DG.Column, df: DG.DataFrame, colName: string, simArr: DG.Column[]
|
|
25
|
+
dim: number, seqCol: DG.Column, df: DG.DataFrame, colName: string, simArr: DG.Column[],
|
|
28
26
|
): Promise<DG.Column[]> {
|
|
29
27
|
const distances = new Array(simArr.length).fill(null);
|
|
30
28
|
for (let i = 0; i != dim - 1; ++i) {
|
|
@@ -54,7 +52,7 @@ export async function getChemSimilaritiesMatrix(dim: number, seqCol: DG.Column,
|
|
|
54
52
|
col: seqCol.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS],
|
|
55
53
|
df: fpDf,
|
|
56
54
|
colName: colName,
|
|
57
|
-
simArr: simArr
|
|
55
|
+
simArr: simArr,
|
|
58
56
|
});
|
|
59
57
|
return res;
|
|
60
58
|
}
|
|
@@ -69,7 +67,7 @@ export function createTooltipElement(params: ITooltipAndPanelParams): HTMLDivEle
|
|
|
69
67
|
columnNames.style.display = 'flex';
|
|
70
68
|
columnNames.style.justifyContent = 'space-between';
|
|
71
69
|
tooltipElement.append(columnNames);
|
|
72
|
-
params.line.mols.forEach((molIdx: number,
|
|
70
|
+
params.line.mols.forEach((molIdx: number, _idx: number) => {
|
|
73
71
|
const activity = ui.divText(params.activityCol.get(molIdx).toFixed(2));
|
|
74
72
|
activity.style.display = 'flex';
|
|
75
73
|
activity.style.justifyContent = 'left';
|
|
@@ -82,7 +80,7 @@ export function createTooltipElement(params: ITooltipAndPanelParams): HTMLDivEle
|
|
|
82
80
|
return tooltipElement;
|
|
83
81
|
}
|
|
84
82
|
|
|
85
|
-
function
|
|
83
|
+
function _moleculeInfo(df: DG.DataFrame, idx: number, seqColName: string): HTMLElement {
|
|
86
84
|
const dict: { [key: string]: string } = {};
|
|
87
85
|
for (const col of df.columns) {
|
|
88
86
|
if (col.name !== seqColName)
|
|
@@ -124,7 +122,7 @@ export function createPropPanelElement(params: ITooltipAndPanelParams): HTMLDivE
|
|
|
124
122
|
function createPropPanelField(name: string, value: number): HTMLDivElement {
|
|
125
123
|
return ui.divH([
|
|
126
124
|
ui.divText(`${name}: `, {style: {fontWeight: 'bold', paddingRight: '5px'}}),
|
|
127
|
-
ui.divText(value.toFixed(2))
|
|
125
|
+
ui.divText(value.toFixed(2)),
|
|
128
126
|
], {style: {paddingTop: '10px'}});
|
|
129
127
|
}
|
|
130
128
|
|
|
@@ -147,13 +145,13 @@ export function createDifferencesWithPositions(
|
|
|
147
145
|
const diffsPanel = ui.divV([]);
|
|
148
146
|
diffsPanel.append(ui.divH([
|
|
149
147
|
ui.divText('Pos', {style: {fontWeight: 'bold', width: '30px', borderBottom: '1px solid'}}),
|
|
150
|
-
ui.divText('Difference', {style: {fontWeight: 'bold', borderBottom: '1px solid'}})
|
|
148
|
+
ui.divText('Difference', {style: {fontWeight: 'bold', borderBottom: '1px solid'}}),
|
|
151
149
|
]));
|
|
152
150
|
for (const key of Object.keys(molDifferences)) {
|
|
153
151
|
molDifferences[key as any].style.borderBottom = '1px solid lightgray';
|
|
154
152
|
diffsPanel.append(ui.divH([
|
|
155
153
|
ui.divText((parseInt(key) + 1).toString(), {style: {width: '30px', borderBottom: '1px solid lightgray'}}),
|
|
156
|
-
molDifferences[key as any]
|
|
154
|
+
molDifferences[key as any],
|
|
157
155
|
]));
|
|
158
156
|
}
|
|
159
157
|
div.append(diffsPanel);
|
|
package/src/analysis/sequence-diversity-viewer.ts
CHANGED
|
@@ -27,7 +27,7 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
27
27
|
return;
|
|
28
28
|
if (this.dataFrame) {
|
|
29
29
|
if (computeData && this.moleculeColumn) {
|
|
30
|
-
const uh =
|
|
30
|
+
const uh = UnitsHandler.getOrCreate(this.moleculeColumn);
|
|
31
31
|
await (uh.isFasta() ? this.computeByMM() : this.computeByChem());
|
|
32
32
|
|
|
33
33
|
const diverseColumnName: string = this.diverseColumnLabel != null ? this.diverseColumnLabel :
|
|
@@ -37,6 +37,8 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
37
37
|
resCol.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
38
38
|
this.tags.forEach((tag) => resCol.setTag(tag, this.moleculeColumn!.getTag(tag)));
|
|
39
39
|
const resDf = DG.DataFrame.fromColumns([resCol]);
|
|
40
|
+
resDf.onCurrentRowChanged.subscribe(
|
|
41
|
+
(_) => { this.dataFrame.currentRowIdx = this.renderMolIds![resDf.currentRowIdx]; });
|
|
40
42
|
updateDivInnerHTML(this.root, resDf.plot.grid().root);
|
|
41
43
|
this.computeCompleted.next(true);
|
|
42
44
|
}
|
|
@@ -51,7 +53,7 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
51
53
|
col: monomericMols,
|
|
52
54
|
metricName: this.distanceMetric,
|
|
53
55
|
limit: this.limit,
|
|
54
|
-
fingerprint: this.fingerprint
|
|
56
|
+
fingerprint: this.fingerprint,
|
|
55
57
|
});
|
|
56
58
|
}
|
|
57
59
|
|
|
@@ -60,6 +62,9 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
60
62
|
const len = this.moleculeColumn!.length;
|
|
61
63
|
const linearizeFunc = dmLinearIndex(len);
|
|
62
64
|
this.renderMolIds = getDiverseSubset(len, Math.min(len, this.limit),
|
|
63
|
-
(i1: number, i2: number) =>
|
|
65
|
+
(i1: number, i2: number) => {
|
|
66
|
+
return this.moleculeColumn!.isNone(i1) || this.moleculeColumn!.isNone(i2) ? 0 :
|
|
67
|
+
distanceMatrixData[linearizeFunc(i1, i2)];
|
|
68
|
+
});
|
|
64
69
|
}
|
|
65
70
|
}
|
|
package/src/analysis/sequence-search-base-viewer.ts
CHANGED
|
@@ -4,8 +4,8 @@ import * as grok from 'datagrok-api/grok';
|
|
|
4
4
|
|
|
5
5
|
import {CHEM_SIMILARITY_METRICS} from '@datagrok-libraries/ml/src/distance-metrics-methods';
|
|
6
6
|
import {TAGS as bioTAGS} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
7
|
-
import * as C from '../utils/constants';
|
|
8
7
|
|
|
8
|
+
const MAX_ROWS_FOR_DISTANCE_MATRIX = 22000;
|
|
9
9
|
export class SequenceSearchBaseViewer extends DG.JsViewer {
|
|
10
10
|
name: string = '';
|
|
11
11
|
distanceMetric: string;
|
|
@@ -17,7 +17,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
|
|
|
17
17
|
moleculeColumnName: string;
|
|
18
18
|
initialized: boolean = false;
|
|
19
19
|
tags = [DG.TAGS.UNITS, bioTAGS.aligned, bioTAGS.separator, bioTAGS.alphabet];
|
|
20
|
-
|
|
20
|
+
preComputeDistanceMatrix: boolean = false;
|
|
21
21
|
constructor(name: string) {
|
|
22
22
|
super();
|
|
23
23
|
this.fingerprint = this.string('fingerprint', this.fingerprintChoices[0], {choices: this.fingerprintChoices});
|
|
@@ -39,6 +39,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
|
|
|
39
39
|
this.init();
|
|
40
40
|
|
|
41
41
|
if (this.dataFrame) {
|
|
42
|
+
this.preComputeDistanceMatrix = this.dataFrame.rowCount <= MAX_ROWS_FOR_DISTANCE_MATRIX;
|
|
42
43
|
this.subs.push(DG.debounce(this.dataFrame.onRowsRemoved, 50).subscribe(async (_: any) => await this.render()));
|
|
43
44
|
const compute = this.name !== 'diversity';
|
|
44
45
|
this.subs.push(DG.debounce(this.dataFrame.onCurrentRowChanged, 50)
|
|
@@ -66,7 +67,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
|
|
|
66
67
|
this.render();
|
|
67
68
|
}
|
|
68
69
|
|
|
69
|
-
async render(
|
|
70
|
+
async render(_computeData = true) {
|
|
70
71
|
|
|
71
72
|
}
|
|
72
73
|
|
|
package/src/analysis/sequence-similarity-viewer.ts
CHANGED
|
@@ -4,13 +4,13 @@ import * as DG from 'datagrok-api/dg';
|
|
|
4
4
|
|
|
5
5
|
import {SequenceSearchBaseViewer} from './sequence-search-base-viewer';
|
|
6
6
|
import {getMonomericMols} from '../calculations/monomerLevelMols';
|
|
7
|
-
import * as C from '../utils/constants';
|
|
8
7
|
import {createDifferenceCanvas, createDifferencesWithPositions} from './sequence-activity-cliffs';
|
|
9
8
|
import {updateDivInnerHTML} from '../utils/ui-utils';
|
|
10
9
|
import {Subject} from 'rxjs';
|
|
11
10
|
import {TAGS as bioTAGS, getSplitter} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
12
11
|
import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
|
|
13
12
|
import {calcMmDistanceMatrix, dmLinearIndex} from './workers/mm-distance-worker-creator';
|
|
13
|
+
import {calculateMMDistancesArray} from './workers/mm-distance-array-service';
|
|
14
14
|
|
|
15
15
|
export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
16
16
|
cutoff: number;
|
|
@@ -47,7 +47,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
47
47
|
this.curIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
|
|
48
48
|
if (computeData && !this.gridSelect) {
|
|
49
49
|
this.targetMoleculeIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
|
|
50
|
-
const uh =
|
|
50
|
+
const uh = UnitsHandler.getOrCreate(this.moleculeColumn!);
|
|
51
51
|
|
|
52
52
|
await (uh.isFasta() ? this.computeByMM() : this.computeByChem());
|
|
53
53
|
const similarColumnName: string = this.similarColumnLabel != null ? this.similarColumnLabel :
|
|
@@ -67,7 +67,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
67
67
|
const targetMolRow = this.idxs?.getRawData().findIndex((it) => it == this.targetMoleculeIdx);
|
|
68
68
|
const targetScoreCell = grid.cell('score', targetMolRow!);
|
|
69
69
|
targetScoreCell.cell.value = null;
|
|
70
|
-
(grok.shell.v as DG.TableView).grid.root.addEventListener('click', (
|
|
70
|
+
(grok.shell.v as DG.TableView).grid.root.addEventListener('click', (_event: MouseEvent) => {
|
|
71
71
|
this.gridSelect = false;
|
|
72
72
|
});
|
|
73
73
|
updateDivInnerHTML(this.root, grid.root);
|
|
@@ -87,23 +87,29 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
87
87
|
metricName: this.distanceMetric,
|
|
88
88
|
limit: this.limit,
|
|
89
89
|
minScore: this.cutoff,
|
|
90
|
-
fingerprint: this.fingerprint
|
|
90
|
+
fingerprint: this.fingerprint,
|
|
91
91
|
});
|
|
92
92
|
this.idxs = df.getCol('indexes');
|
|
93
93
|
this.scores = df.getCol('score');
|
|
94
94
|
}
|
|
95
95
|
|
|
96
96
|
private async computeByMM() {
|
|
97
|
-
|
|
97
|
+
let distanceArray = new Float32Array();
|
|
98
|
+
if (!this.distanceMatrixComputed && this.preComputeDistanceMatrix) {
|
|
98
99
|
this.mmDistanceMatrix = await calcMmDistanceMatrix(this.moleculeColumn!);
|
|
99
100
|
this.distanceMatrixComputed = true;
|
|
101
|
+
} else if (!this.preComputeDistanceMatrix) {
|
|
102
|
+
// use fast distance array calculation if matrix will take too much space
|
|
103
|
+
distanceArray = await calculateMMDistancesArray(this.moleculeColumn!, this.targetMoleculeIdx);
|
|
100
104
|
}
|
|
101
105
|
const len = this.moleculeColumn!.length;
|
|
102
106
|
const linearizeFunc = dmLinearIndex(len);
|
|
103
107
|
// array that keeps track of the indexes and scores together
|
|
104
108
|
const indexWScore = Array(len).fill(0)
|
|
105
109
|
.map((_, i) => ({idx: i, score: i === this.targetMoleculeIdx ? 1 :
|
|
106
|
-
1 - this.mmDistanceMatrix[linearizeFunc(this.targetMoleculeIdx, i)]
|
|
110
|
+
this.preComputeDistanceMatrix ? 1 - this.mmDistanceMatrix[linearizeFunc(this.targetMoleculeIdx, i)] :
|
|
111
|
+
1 - distanceArray[i]
|
|
112
|
+
}));
|
|
107
113
|
indexWScore.sort((a, b) => b.score - a.score);
|
|
108
114
|
// get the most similar molecules
|
|
109
115
|
const actualLimit = Math.min(this.limit, len);
|
|
@@ -127,7 +133,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
127
133
|
propPanel.append(ui.divV([
|
|
128
134
|
ui.divText(`Different sequence length:`, {style: {fontWeight: 'bold'}}),
|
|
129
135
|
ui.divText(`target: ${subParts1.length} monomers`),
|
|
130
|
-
ui.divText(`selected: ${subParts2.length} monomers`)
|
|
136
|
+
ui.divText(`selected: ${subParts2.length} monomers`),
|
|
131
137
|
], {style: {paddingBottom: '10px'}}));
|
|
132
138
|
}
|
|
133
139
|
propPanel.append(createDifferencesWithPositions(molDifferences));
|