@datagrok/bio 2.4.30 → 2.4.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. package/.eslintrc.json +6 -8
  2. package/README.md +22 -7
  3. package/detectors.js +21 -12
  4. package/dist/1.js +2 -0
  5. package/dist/1.js.map +1 -0
  6. package/dist/18.js +2 -0
  7. package/dist/18.js.map +1 -0
  8. package/dist/190.js +2 -0
  9. package/dist/190.js.map +1 -0
  10. package/dist/452.js +2 -0
  11. package/dist/452.js.map +1 -0
  12. package/dist/729.js +2 -0
  13. package/dist/729.js.map +1 -0
  14. package/dist/package-test.js +1 -1
  15. package/dist/package-test.js.map +1 -1
  16. package/dist/package.js +1 -1
  17. package/dist/package.js.map +1 -1
  18. package/files/libraries/broken-lib.sdf +136 -0
  19. package/files/libraries/group1/mock-lib-3.json +74 -0
  20. package/files/libraries/mock-lib-2.json +48 -0
  21. package/files/tests/100_3_clustests.csv +100 -0
  22. package/files/tests/100_3_clustests_empty_vals.csv +100 -0
  23. package/files/tests/peptides_motif-with-random_10000.csv +9998 -0
  24. package/package.json +4 -4
  25. package/scripts/sequence_generator.py +185 -48
  26. package/src/analysis/sequence-activity-cliffs.ts +9 -11
  27. package/src/analysis/sequence-diversity-viewer.ts +8 -3
  28. package/src/analysis/sequence-search-base-viewer.ts +4 -3
  29. package/src/analysis/sequence-similarity-viewer.ts +13 -7
  30. package/src/analysis/sequence-space.ts +15 -12
  31. package/src/analysis/workers/mm-distance-array-service.ts +48 -0
  32. package/src/analysis/workers/mm-distance-array-worker.ts +29 -0
  33. package/src/analysis/workers/mm-distance-worker-creator.ts +6 -9
  34. package/src/apps/web-logo-app.ts +34 -0
  35. package/src/calculations/monomerLevelMols.ts +10 -12
  36. package/src/demo/bio01-similarity-diversity.ts +4 -5
  37. package/src/demo/bio01a-hierarchical-clustering-and-sequence-space.ts +6 -7
  38. package/src/demo/bio01b-hierarchical-clustering-and-activity-cliffs.ts +8 -8
  39. package/src/demo/bio03-atomic-level.ts +1 -4
  40. package/src/demo/bio05-helm-msa-sequence-space.ts +8 -5
  41. package/src/demo/utils.ts +4 -3
  42. package/src/package-test.ts +1 -2
  43. package/src/package.ts +138 -83
  44. package/src/seq_align.ts +482 -483
  45. package/src/substructure-search/substructure-search.ts +3 -3
  46. package/src/tests/Palettes-test.ts +1 -1
  47. package/src/tests/WebLogo-positions-test.ts +12 -35
  48. package/src/tests/_first-tests.ts +1 -1
  49. package/src/tests/activity-cliffs-tests.ts +10 -6
  50. package/src/tests/activity-cliffs-utils.ts +6 -4
  51. package/src/tests/bio-tests.ts +20 -25
  52. package/src/tests/checkInputColumn-tests.ts +5 -11
  53. package/src/tests/converters-test.ts +19 -37
  54. package/src/tests/detectors-benchmark-tests.ts +35 -37
  55. package/src/tests/detectors-tests.ts +29 -34
  56. package/src/tests/detectors-weak-and-likely-tests.ts +11 -21
  57. package/src/tests/fasta-export-tests.ts +3 -3
  58. package/src/tests/fasta-handler-test.ts +2 -3
  59. package/src/tests/lib-tests.ts +2 -4
  60. package/src/tests/mm-distance-tests.ts +25 -17
  61. package/src/tests/monomer-libraries-tests.ts +1 -1
  62. package/src/tests/msa-tests.ts +12 -9
  63. package/src/tests/pepsea-tests.ts +6 -3
  64. package/src/tests/renderers-test.ts +13 -11
  65. package/src/tests/sequence-space-test.ts +10 -7
  66. package/src/tests/sequence-space-utils.ts +7 -3
  67. package/src/tests/similarity-diversity-tests.ts +47 -61
  68. package/src/tests/splitters-test.ts +14 -20
  69. package/src/tests/to-atomic-level-tests.ts +9 -17
  70. package/src/tests/units-handler-splitted-tests.ts +106 -0
  71. package/src/tests/units-handler-tests.ts +22 -26
  72. package/src/tests/utils/sequences-generators.ts +6 -2
  73. package/src/tests/utils.ts +10 -4
  74. package/src/tests/viewers.ts +1 -1
  75. package/src/utils/atomic-works.ts +49 -57
  76. package/src/utils/cell-renderer.ts +25 -8
  77. package/src/utils/check-input-column.ts +19 -4
  78. package/src/utils/constants.ts +3 -3
  79. package/src/utils/convert.ts +56 -23
  80. package/src/utils/monomer-lib.ts +83 -64
  81. package/src/utils/multiple-sequence-alignment-ui.ts +24 -21
  82. package/src/utils/multiple-sequence-alignment.ts +2 -2
  83. package/src/utils/pepsea.ts +17 -7
  84. package/src/utils/save-as-fasta.ts +11 -4
  85. package/src/utils/ui-utils.ts +1 -1
  86. package/src/viewers/vd-regions-viewer.ts +21 -22
  87. package/src/viewers/web-logo-viewer.ts +189 -154
  88. package/src/widgets/bio-substructure-filter.ts +9 -6
  89. package/src/widgets/representations.ts +11 -12
  90. package/tsconfig.json +1 -1
  91. package/dist/258.js +0 -2
  92. package/dist/258.js.map +0 -1
  93. package/dist/562.js +0 -2
  94. package/dist/562.js.map +0 -1
  95. package/dist/705.js +0 -2
  96. package/dist/705.js.map +0 -1
  97. package/dist/925.js +0 -2
  98. package/dist/925.js.map +0 -1
  99. package/src/analysis/workers/mm-distance-worker.ts +0 -16
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "name": "Leonid Stolbov",
6
6
  "email": "lstolbov@datagrok.ai"
7
7
  },
8
- "version": "2.4.30",
8
+ "version": "2.4.39",
9
9
  "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
10
10
  "repository": {
11
11
  "type": "git",
@@ -14,11 +14,11 @@
14
14
  },
15
15
  "dependencies": {
16
16
  "@biowasm/aioli": "^3.1.0",
17
- "@datagrok-libraries/bio": "^5.30.0",
17
+ "@datagrok-libraries/bio": "^5.32.1",
18
18
  "@datagrok-libraries/chem-meta": "^1.0.1",
19
- "@datagrok-libraries/ml": "^6.3.23",
19
+ "@datagrok-libraries/ml": "^6.3.37",
20
20
  "@datagrok-libraries/tutorials": "^1.3.2",
21
- "@datagrok-libraries/utils": "^4.0.7",
21
+ "@datagrok-libraries/utils": "^4.0.11",
22
22
  "cash-dom": "^8.0.0",
23
23
  "css-loader": "^6.7.3",
24
24
  "datagrok-api": "^1.13.3",
@@ -3,8 +3,8 @@
3
3
  # description: Create the model peptides/DNA sequences with peptides data
4
4
  # language: python
5
5
  # tags: template, demo
6
- # input: int clusters = 1 [Number of superclusters]
7
- # input: int num_sequences = 500 [Number of sequences in each supercluster]
6
+ # input: int clusters = 5 [Number of superclusters]
7
+ # input: int num_sequences = 50 [Number of sequences in each supercluster]
8
8
  # input: int motif_length = 12 [Average length of motif]
9
9
  # input: int max_variants_position = 3 [Maximum number of different letters in conservative position in motif]
10
10
  # input: int random_length = 3 [Average length of random sequence parts before and after motif]
@@ -13,21 +13,34 @@
13
13
  # input: bool disable_cliffs = False [Disable generation of cliffs]
14
14
  # input: double cliff_probability = 0.01 [Probability to make activity cliff of a sequence]
15
15
  # input: double cliff_strength = 4.0 [Strength of cliff]
16
+ # input: double fasta_separator = '' [Separator for a FASTA notation]
16
17
  # output: dataframe sequences
17
18
 
18
19
  import random
19
20
  import argparse
20
21
  import sys
22
+ from enum import Enum
21
23
 
22
24
  from typing import List, Tuple, Dict, Iterator, Any
23
25
 
24
- alphabet_type = List[str]
25
26
 
26
- letter_choice_type = List[str]
27
- motif_template_type = List[letter_choice_type]
27
+ # --- Type definitions ---
28
28
 
29
- sequence_record_type = Tuple[int, str, float, bool]
30
- sequence_record_cluster_type = Tuple[int, str, str, float, bool]
29
+ Letter = str
30
+ Alphabet = List[str]
31
+
32
+ LetterChoice = List[Letter]
33
+ MotifTemplate = List[LetterChoice]
34
+
35
+ Sequence = List[Letter] # The sequence in a form of list
36
+ SequenceSquashed = str # Sequence, joined together in string form
37
+
38
+ SequenceRecord = Tuple[int, Sequence, float, bool]
39
+ ClusterSequenceRecord = Tuple[int, str, Sequence, float, bool]
40
+
41
+ # --- constants ---
42
+
43
+ HelmConnectionMode = Enum("HelmConnectionMode", ["linear", "cyclic", "mixed"])
31
44
 
32
45
  alphabets: Dict[str, str] = {
33
46
  "PT": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
@@ -42,10 +55,10 @@ def mean_range(mean: int, disp: int) -> int:
42
55
 
43
56
  def generate_motif_template(
44
57
  motif_length: int,
45
- alphabet: alphabet_type,
58
+ alphabet: Alphabet,
46
59
  max_variants_cluster: int,
47
60
  prob_any: float = 0.2,
48
- ) -> motif_template_type:
61
+ ) -> MotifTemplate:
49
62
  motif_template = []
50
63
  for position in range(motif_length):
51
64
  # Selecting letters for position i
@@ -53,37 +66,75 @@ def generate_motif_template(
53
66
  letters = ["?"] # this stands for any symbol
54
67
  else:
55
68
  n_variants = random.randrange(max_variants_cluster) + 1
56
- letters = [random.choice(alphabet) for i in range(n_variants)]
69
+ letters = list(set((random.choice(alphabet) for i in range(n_variants))))
57
70
  motif_template.append(letters)
58
71
  return motif_template
59
72
 
60
73
 
61
- def generate_motif(template: motif_template_type, alphabet: alphabet_type) -> str:
62
- template_with_any = [(letters if not "?" in letters else alphabet) for letters in template]
63
- return "".join([random.choice(letters) for letters in template_with_any])
74
+ def generate_motif(template: MotifTemplate, alphabet: Alphabet) -> Sequence:
75
+ template_with_any = [
76
+ (letters if not "?" in letters else alphabet) for letters in template
77
+ ]
78
+ return [random.choice(letters) for letters in template_with_any]
64
79
 
65
80
 
66
- def motif_notation(motif_template: motif_template_type) -> str:
67
- def motif_notation_code(letter_choice: letter_choice_type) -> str:
81
+ def motif_notation(motif_template: MotifTemplate) -> str:
82
+ def motif_notation_code(letter_choice: LetterChoice) -> str:
68
83
  if len(letter_choice) == 1:
69
84
  return letter_choice[0]
70
85
  else:
71
86
  return f"[{''.join(letter_choice)}]"
72
87
 
73
- return "".join([motif_notation_code(letter_choice) for letter_choice in motif_template])
88
+ return "".join(
89
+ [motif_notation_code(letter_choice) for letter_choice in motif_template]
90
+ )
74
91
 
75
92
 
76
- def generate_random(n: int, alphabet: alphabet_type) -> str:
77
- return "".join([random.choice(alphabet) for i in range(n)])
93
+ def generate_random(n: int, alphabet: Alphabet) -> Sequence:
94
+ return [random.choice(alphabet) for i in range(n)]
78
95
 
79
96
 
80
- def make_cliff(motif_template: motif_template_type, alphabet: alphabet_type, motif: str) -> str:
97
+ def make_cliff(
98
+ motif_template: MotifTemplate, alphabet: Alphabet, motif: Sequence
99
+ ) -> Sequence:
81
100
  # Mutate conservative letter in motif
82
- pos = random.randrange(len(motif_template))
101
+ motif_len = len(motif_template)
102
+ pos = random.randrange(motif_len)
83
103
  while "?" in motif_template[pos]:
84
- pos = (pos + 1) % len(motif_template) # always will find letters since ends of motif can't be any symbol
104
+ pos = (
105
+ pos + 1
106
+ ) % motif_len # always will find letters since ends of motif can't be any symbol
85
107
  outlier_letters = list(set(alphabet) - set(motif_template[pos]))
86
- return motif[:pos] + random.choice(outlier_letters) + motif[pos + 1 :]
108
+ new_letter = random.choice(outlier_letters)
109
+ return (
110
+ motif[:pos]
111
+ + [
112
+ new_letter,
113
+ ]
114
+ + motif[pos + 1 :]
115
+ )
116
+
117
+
118
+ def sequence_to_fasta(sequence: Sequence, separator: str) -> SequenceSquashed:
119
+ return separator.join(sequence)
120
+
121
+
122
+ def sequence_to_helm(
123
+ sequence: Sequence, helm_connection_mode: str = HelmConnectionMode.linear.name
124
+ ) -> SequenceSquashed:
125
+ def is_cyclic(helm_connection_mode: str) -> bool:
126
+ return helm_connection_mode == HelmConnectionMode.cyclic.name or (
127
+ helm_connection_mode == HelmConnectionMode.mixed.name
128
+ and random.random() < 0.5
129
+ )
130
+
131
+ sequence_escaped: Sequence = [
132
+ f"[{letter}]" if len(letter) > 1 else letter for letter in sequence
133
+ ]
134
+ connection_format = ""
135
+ if is_cyclic(helm_connection_mode):
136
+ connection_format = f"PEPTIDE1,PEPTIDE1,{len(sequence_escaped)}:R2-1:R1"
137
+ return f"PEPTIDE1{{{sequence_to_fasta(sequence_escaped,'.')}}}${connection_format}$$$V2.0"
87
138
 
88
139
 
89
140
  def generate_cluster(
@@ -91,14 +142,17 @@ def generate_cluster(
91
142
  motif_length: int,
92
143
  prefix_length: int,
93
144
  suffix_length: int,
94
- max_variants_position: int,
145
+ max_variants_per_position: int,
95
146
  make_cliffs: bool,
96
- alphabet: alphabet_type,
147
+ alphabet: Alphabet,
97
148
  cliff_probability: float,
98
149
  cliff_strength: float,
99
- ) -> Iterator[sequence_record_type]:
100
- motif_template = generate_motif_template(motif_length, alphabet, max_variants_position)
101
-
150
+ ) -> Iterator[SequenceRecord]:
151
+ # Making a motif template
152
+ motif_template = generate_motif_template(
153
+ motif_length, alphabet, max_variants_per_position
154
+ )
155
+ # Setting average and dispersion for activity
102
156
  activity_average = random.random() * 10
103
157
  activity_dispersion = random.random()
104
158
  sys.stderr.write(f"Motif template: {motif_notation(motif_template)}\n")
@@ -110,11 +164,10 @@ def generate_cluster(
110
164
  prefix = generate_random(prefix_length, alphabet)
111
165
  suffix = generate_random(suffix_length, alphabet)
112
166
  seq = prefix + motif + suffix
113
-
114
- is_cliff = make_cliffs and (random.random() <= cliff_probability)
115
- sequence_record: sequence_record_type = (n_seq, seq, activity, is_cliff)
167
+ sequence_record: SequenceRecord = (n_seq, seq, activity, False)
116
168
  yield sequence_record
117
169
 
170
+ is_cliff = make_cliffs and (random.random() <= cliff_probability)
118
171
  if is_cliff:
119
172
  # Making activity cliff
120
173
  cliff_motif = make_cliff(motif_template, alphabet, motif)
@@ -136,16 +189,16 @@ def generate_sequences(
136
189
  n_clusters: int,
137
190
  n_sequences: int,
138
191
  average_motif_length: int,
139
- max_variants_position: int,
192
+ max_variants_per_position: int,
140
193
  average_random_length: int,
141
194
  dispersion: int,
142
- alphabet: alphabet_type,
195
+ alphabet: Alphabet,
143
196
  make_cliffs: bool,
144
197
  cliff_probability: float,
145
198
  cliff_strength: float,
146
- ) -> Tuple[List[str], List[sequence_record_cluster_type]]:
199
+ ) -> Tuple[List[str], List[ClusterSequenceRecord]]:
147
200
  headers: List[str] = ["cluster", "sequence_id", "sequence", "activity", "is_cliff"]
148
- sequences: List[sequence_record_cluster_type] = []
201
+ sequences: List[ClusterSequenceRecord] = []
149
202
 
150
203
  for n_cluster in range(n_clusters):
151
204
  motif_length = mean_range(average_motif_length, dispersion)
@@ -160,33 +213,82 @@ def generate_sequences(
160
213
  motif_length,
161
214
  prefix_length,
162
215
  suffix_length,
163
- max_variants_position,
216
+ max_variants_per_position,
164
217
  make_cliffs,
165
218
  alphabet,
166
219
  cliff_probability,
167
220
  cliff_strength,
168
221
  ):
169
- sequences.append((n_cluster, f"c{n_cluster}_s{n_seq}", seq, activity, is_cliff))
222
+ sequences.append(
223
+ (n_cluster, f"c{n_cluster}_s{n_seq:03d}", seq, activity, is_cliff)
224
+ )
170
225
  return headers, sequences
171
226
 
172
227
 
228
+ def convert_to_fasta(
229
+ cluster_sequence_records: List[ClusterSequenceRecord], separator: str
230
+ ) -> List[Tuple[int, str, str, float, bool]]:
231
+ return [
232
+ (n_cluster, name_cluster, sequence_to_fasta(seq, separator), activity, is_cliff)
233
+ for n_cluster, name_cluster, seq, activity, is_cliff in cluster_sequence_records
234
+ ]
235
+
236
+
237
+ def convert_to_helm(
238
+ cluster_sequence_records: List[ClusterSequenceRecord], helm_connection_mode: str
239
+ ) -> List[Tuple[int, str, str, float, bool]]:
240
+ return [
241
+ (
242
+ n_cluster,
243
+ name_cluster,
244
+ sequence_to_helm(seq, helm_connection_mode),
245
+ activity,
246
+ is_cliff,
247
+ )
248
+ for n_cluster, name_cluster, seq, activity, is_cliff in cluster_sequence_records
249
+ ]
250
+
251
+
252
+ def is_monomer_suitable(monomer: Any) -> bool:
253
+ return (
254
+ monomer["polymerType"] == "PEPTIDE"
255
+ and monomer["monomerType"] == "Backbone"
256
+ and len(monomer["rgroups"]) == 2
257
+ )
258
+
259
+
260
+ def alphabet_from_helm(helm_library_file: str) -> Alphabet:
261
+ import json
262
+
263
+ alphabet: Alphabet = []
264
+ with open(helm_library_file) as helm_library:
265
+ for monomer in json.load(helm_library):
266
+ if is_monomer_suitable(monomer):
267
+ alphabet.append(monomer["symbol"])
268
+ return alphabet
269
+
270
+
173
271
  def parse_command_line_args() -> Any:
174
272
  parser = argparse.ArgumentParser(
175
273
  prog="MotifSequencesGenerator",
176
274
  description="The program generates set of sequences containing sequence motifs "
177
- "for SAR fucntionality testing",
178
- epilog="Utility support: Gennadii Zakharov",
275
+ "for SAR functionality testing",
276
+ epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
179
277
  )
180
278
 
181
- parser.add_argument("-c", "--clusters", type=int, default=1, help="Number of superclusters")
279
+ parser.add_argument(
280
+ "-c", "--clusters", type=int, default=5, help="Number of clusters"
281
+ )
182
282
  parser.add_argument(
183
283
  "-s",
184
284
  "--sequences",
185
285
  type=int,
186
- default=500,
286
+ default=50,
187
287
  help="Number of sequences in each supercluster",
188
288
  )
189
- parser.add_argument("-m,", "--motif-length", type=int, default=12, help="Average length of motif")
289
+ parser.add_argument(
290
+ "-m,", "--motif-length", type=int, default=12, help="Average length of motif"
291
+ )
190
292
 
191
293
  parser.add_argument(
192
294
  "-r,",
@@ -203,12 +305,28 @@ def parse_command_line_args() -> Any:
203
305
  help="Variation of total sequence length",
204
306
  )
205
307
 
308
+ parser.add_argument(
309
+ "-h,",
310
+ "--helm-library-file",
311
+ type=str,
312
+ help="JSON file containing the HELM monomer library in the same format as used for Datagrok. "
313
+ + "The alphabet property is ignored when helm library is specified.",
314
+ )
315
+
316
+ parser.add_argument(
317
+ "--helm-connection-mode",
318
+ type=str,
319
+ default=HelmConnectionMode.linear.value,
320
+ help=f"HELM peptide generation mode: {'/'.join([mode.name for mode in HelmConnectionMode])}",
321
+ )
322
+
206
323
  available_alphabets = ",".join(list(alphabets.keys()) + ["custom"])
207
324
  parser.add_argument(
208
325
  "--alphabet",
209
326
  type=str,
210
327
  default=list(alphabets.keys())[0],
211
- help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated " f"by comma",
328
+ help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated "
329
+ f"by comma",
212
330
  )
213
331
  parser.add_argument(
214
332
  "--max-variants-position",
@@ -234,7 +352,12 @@ def parse_command_line_args() -> Any:
234
352
  default=False,
235
353
  help="Disable generation of cliffs",
236
354
  )
237
-
355
+ parser.add_argument(
356
+ "--fasta-separator",
357
+ type=str,
358
+ default="",
359
+ help="Separator symbol for FASTA sequence",
360
+ )
238
361
  command_line_args = parser.parse_args()
239
362
 
240
363
  return command_line_args
@@ -257,8 +380,18 @@ if not grok:
257
380
  disable_cliffs = args.disable_cliffs
258
381
  cliff_probability = args.cliff_probability
259
382
  cliff_strength = args.cliff_strength
260
-
261
- alphabet: alphabet_type = alphabets[alphabet_key].split(",") if alphabet_key in alphabets else alphabet_key.split(",")
383
+ fasta_separator = args.fasta_separator
384
+ helm_library_file = args.helm_library_file
385
+ helm_connection_mode = args.helm_connection_mode
386
+
387
+ if helm_library_file is None:
388
+ alphabet: Alphabet = (
389
+ alphabets[alphabet_key].split(",")
390
+ if alphabet_key in alphabets
391
+ else alphabet_key.split(",")
392
+ )
393
+ else:
394
+ alphabet = alphabet_from_helm(helm_library_file)
262
395
 
263
396
  # Running sequence generator
264
397
  header, data = generate_sequences(
@@ -273,17 +406,21 @@ header, data = generate_sequences(
273
406
  cliff_probability,
274
407
  cliff_strength,
275
408
  )
409
+ if helm_library_file is None:
410
+ data_formatted = convert_to_fasta(data, fasta_separator)
411
+ else:
412
+ data_formatted = convert_to_helm(data, helm_connection_mode)
276
413
 
277
414
  if grok:
278
- # Exporting data to Datagrok as a pandas dataframe
415
+ # Exporting data to Datagrok as a Pandas dataframe
279
416
  import pandas as pd
280
417
 
281
- sequences = pd.DataFrame.from_records(data, columns=header)
418
+ sequences = pd.DataFrame.from_records(data_formatted, columns=header)
282
419
  else:
283
420
  # Writing results to stdout - no need to work with big and heavy Pandas
284
421
  import csv
285
422
 
286
423
  csv_writer = csv.writer(sys.stdout, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
287
424
  csv_writer.writerow(header)
288
- for line in data:
425
+ for line in data_formatted:
289
426
  csv_writer.writerow(line)
@@ -4,10 +4,8 @@ import * as DG from 'datagrok-api/dg';
4
4
 
5
5
  import {ITooltipAndPanelParams} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
6
6
  import {getSimilarityFromDistance} from '@datagrok-libraries/ml/src/distance-metrics-methods';
7
- import {AvailableMetrics, AvailableMetricsTypes, StringMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
7
+ import {AvailableMetrics, DistanceMetricsSubjects, StringMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
8
8
  import {drawMoleculeDifferenceOnCanvas} from '../utils/cell-renderer';
9
- import * as C from '../utils/constants';
10
- import {GridColumn} from 'datagrok-api/dg';
11
9
  import {invalidateMols, MONOMERIC_COL_TAGS} from '../substructure-search/substructure-search';
12
10
  import {getSplitter, TAGS as bioTAGS} from '@datagrok-libraries/bio/src/utils/macromolecule';
13
11
 
@@ -15,7 +13,7 @@ export async function getDistances(col: DG.Column, seq: string): Promise<Array<n
15
13
  const stringArray = col.toList();
16
14
  const distances = new Array(stringArray.length).fill(0);
17
15
  const distanceMethod: (x: string, y: string) => number =
18
- AvailableMetrics[AvailableMetricsTypes.String][StringMetricsNames.Levenshtein];
16
+ AvailableMetrics[DistanceMetricsSubjects.String][StringMetricsNames.Levenshtein];
19
17
  for (let i = 0; i < stringArray.length; ++i) {
20
18
  const distance = stringArray[i] ? distanceMethod(stringArray[i], seq) : null;
21
19
  distances[i] = distance ? distance / Math.max((stringArray[i] as string).length, seq.length) : null;
@@ -24,7 +22,7 @@ export async function getDistances(col: DG.Column, seq: string): Promise<Array<n
24
22
  }
25
23
 
26
24
  export async function getSimilaritiesMatrix(
27
- dim: number, seqCol: DG.Column, df: DG.DataFrame, colName: string, simArr: DG.Column[]
25
+ dim: number, seqCol: DG.Column, df: DG.DataFrame, colName: string, simArr: DG.Column[],
28
26
  ): Promise<DG.Column[]> {
29
27
  const distances = new Array(simArr.length).fill(null);
30
28
  for (let i = 0; i != dim - 1; ++i) {
@@ -54,7 +52,7 @@ export async function getChemSimilaritiesMatrix(dim: number, seqCol: DG.Column,
54
52
  col: seqCol.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS],
55
53
  df: fpDf,
56
54
  colName: colName,
57
- simArr: simArr
55
+ simArr: simArr,
58
56
  });
59
57
  return res;
60
58
  }
@@ -69,7 +67,7 @@ export function createTooltipElement(params: ITooltipAndPanelParams): HTMLDivEle
69
67
  columnNames.style.display = 'flex';
70
68
  columnNames.style.justifyContent = 'space-between';
71
69
  tooltipElement.append(columnNames);
72
- params.line.mols.forEach((molIdx: number, idx: number) => {
70
+ params.line.mols.forEach((molIdx: number, _idx: number) => {
73
71
  const activity = ui.divText(params.activityCol.get(molIdx).toFixed(2));
74
72
  activity.style.display = 'flex';
75
73
  activity.style.justifyContent = 'left';
@@ -82,7 +80,7 @@ export function createTooltipElement(params: ITooltipAndPanelParams): HTMLDivEle
82
80
  return tooltipElement;
83
81
  }
84
82
 
85
- function moleculeInfo(df: DG.DataFrame, idx: number, seqColName: string): HTMLElement {
83
+ function _moleculeInfo(df: DG.DataFrame, idx: number, seqColName: string): HTMLElement {
86
84
  const dict: { [key: string]: string } = {};
87
85
  for (const col of df.columns) {
88
86
  if (col.name !== seqColName)
@@ -124,7 +122,7 @@ export function createPropPanelElement(params: ITooltipAndPanelParams): HTMLDivE
124
122
  function createPropPanelField(name: string, value: number): HTMLDivElement {
125
123
  return ui.divH([
126
124
  ui.divText(`${name}: `, {style: {fontWeight: 'bold', paddingRight: '5px'}}),
127
- ui.divText(value.toFixed(2))
125
+ ui.divText(value.toFixed(2)),
128
126
  ], {style: {paddingTop: '10px'}});
129
127
  }
130
128
 
@@ -147,13 +145,13 @@ export function createDifferencesWithPositions(
147
145
  const diffsPanel = ui.divV([]);
148
146
  diffsPanel.append(ui.divH([
149
147
  ui.divText('Pos', {style: {fontWeight: 'bold', width: '30px', borderBottom: '1px solid'}}),
150
- ui.divText('Difference', {style: {fontWeight: 'bold', borderBottom: '1px solid'}})
148
+ ui.divText('Difference', {style: {fontWeight: 'bold', borderBottom: '1px solid'}}),
151
149
  ]));
152
150
  for (const key of Object.keys(molDifferences)) {
153
151
  molDifferences[key as any].style.borderBottom = '1px solid lightgray';
154
152
  diffsPanel.append(ui.divH([
155
153
  ui.divText((parseInt(key) + 1).toString(), {style: {width: '30px', borderBottom: '1px solid lightgray'}}),
156
- molDifferences[key as any]
154
+ molDifferences[key as any],
157
155
  ]));
158
156
  }
159
157
  div.append(diffsPanel);
@@ -27,7 +27,7 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
27
27
  return;
28
28
  if (this.dataFrame) {
29
29
  if (computeData && this.moleculeColumn) {
30
- const uh = new UnitsHandler(this.moleculeColumn);
30
+ const uh = UnitsHandler.getOrCreate(this.moleculeColumn);
31
31
  await (uh.isFasta() ? this.computeByMM() : this.computeByChem());
32
32
 
33
33
  const diverseColumnName: string = this.diverseColumnLabel != null ? this.diverseColumnLabel :
@@ -37,6 +37,8 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
37
37
  resCol.semType = DG.SEMTYPE.MACROMOLECULE;
38
38
  this.tags.forEach((tag) => resCol.setTag(tag, this.moleculeColumn!.getTag(tag)));
39
39
  const resDf = DG.DataFrame.fromColumns([resCol]);
40
+ resDf.onCurrentRowChanged.subscribe(
41
+ (_) => { this.dataFrame.currentRowIdx = this.renderMolIds![resDf.currentRowIdx]; });
40
42
  updateDivInnerHTML(this.root, resDf.plot.grid().root);
41
43
  this.computeCompleted.next(true);
42
44
  }
@@ -51,7 +53,7 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
51
53
  col: monomericMols,
52
54
  metricName: this.distanceMetric,
53
55
  limit: this.limit,
54
- fingerprint: this.fingerprint
56
+ fingerprint: this.fingerprint,
55
57
  });
56
58
  }
57
59
 
@@ -60,6 +62,9 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
60
62
  const len = this.moleculeColumn!.length;
61
63
  const linearizeFunc = dmLinearIndex(len);
62
64
  this.renderMolIds = getDiverseSubset(len, Math.min(len, this.limit),
63
- (i1: number, i2: number) => distanceMatrixData[linearizeFunc(i1, i2)]);
65
+ (i1: number, i2: number) => {
66
+ return this.moleculeColumn!.isNone(i1) || this.moleculeColumn!.isNone(i2) ? 0 :
67
+ distanceMatrixData[linearizeFunc(i1, i2)];
68
+ });
64
69
  }
65
70
  }
@@ -4,8 +4,8 @@ import * as grok from 'datagrok-api/grok';
4
4
 
5
5
  import {CHEM_SIMILARITY_METRICS} from '@datagrok-libraries/ml/src/distance-metrics-methods';
6
6
  import {TAGS as bioTAGS} from '@datagrok-libraries/bio/src/utils/macromolecule';
7
- import * as C from '../utils/constants';
8
7
 
8
+ const MAX_ROWS_FOR_DISTANCE_MATRIX = 22000;
9
9
  export class SequenceSearchBaseViewer extends DG.JsViewer {
10
10
  name: string = '';
11
11
  distanceMetric: string;
@@ -17,7 +17,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
17
17
  moleculeColumnName: string;
18
18
  initialized: boolean = false;
19
19
  tags = [DG.TAGS.UNITS, bioTAGS.aligned, bioTAGS.separator, bioTAGS.alphabet];
20
-
20
+ preComputeDistanceMatrix: boolean = false;
21
21
  constructor(name: string) {
22
22
  super();
23
23
  this.fingerprint = this.string('fingerprint', this.fingerprintChoices[0], {choices: this.fingerprintChoices});
@@ -39,6 +39,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
39
39
  this.init();
40
40
 
41
41
  if (this.dataFrame) {
42
+ this.preComputeDistanceMatrix = this.dataFrame.rowCount <= MAX_ROWS_FOR_DISTANCE_MATRIX;
42
43
  this.subs.push(DG.debounce(this.dataFrame.onRowsRemoved, 50).subscribe(async (_: any) => await this.render()));
43
44
  const compute = this.name !== 'diversity';
44
45
  this.subs.push(DG.debounce(this.dataFrame.onCurrentRowChanged, 50)
@@ -66,7 +67,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
66
67
  this.render();
67
68
  }
68
69
 
69
- async render(computeData = true) {
70
+ async render(_computeData = true) {
70
71
 
71
72
  }
72
73
 
@@ -4,13 +4,13 @@ import * as DG from 'datagrok-api/dg';
4
4
 
5
5
  import {SequenceSearchBaseViewer} from './sequence-search-base-viewer';
6
6
  import {getMonomericMols} from '../calculations/monomerLevelMols';
7
- import * as C from '../utils/constants';
8
7
  import {createDifferenceCanvas, createDifferencesWithPositions} from './sequence-activity-cliffs';
9
8
  import {updateDivInnerHTML} from '../utils/ui-utils';
10
9
  import {Subject} from 'rxjs';
11
10
  import {TAGS as bioTAGS, getSplitter} from '@datagrok-libraries/bio/src/utils/macromolecule';
12
11
  import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
13
12
  import {calcMmDistanceMatrix, dmLinearIndex} from './workers/mm-distance-worker-creator';
13
+ import {calculateMMDistancesArray} from './workers/mm-distance-array-service';
14
14
 
15
15
  export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
16
16
  cutoff: number;
@@ -47,7 +47,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
47
47
  this.curIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
48
48
  if (computeData && !this.gridSelect) {
49
49
  this.targetMoleculeIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
50
- const uh = new UnitsHandler(this.moleculeColumn!);
50
+ const uh = UnitsHandler.getOrCreate(this.moleculeColumn!);
51
51
 
52
52
  await (uh.isFasta() ? this.computeByMM() : this.computeByChem());
53
53
  const similarColumnName: string = this.similarColumnLabel != null ? this.similarColumnLabel :
@@ -67,7 +67,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
67
67
  const targetMolRow = this.idxs?.getRawData().findIndex((it) => it == this.targetMoleculeIdx);
68
68
  const targetScoreCell = grid.cell('score', targetMolRow!);
69
69
  targetScoreCell.cell.value = null;
70
- (grok.shell.v as DG.TableView).grid.root.addEventListener('click', (event: MouseEvent) => {
70
+ (grok.shell.v as DG.TableView).grid.root.addEventListener('click', (_event: MouseEvent) => {
71
71
  this.gridSelect = false;
72
72
  });
73
73
  updateDivInnerHTML(this.root, grid.root);
@@ -87,23 +87,29 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
87
87
  metricName: this.distanceMetric,
88
88
  limit: this.limit,
89
89
  minScore: this.cutoff,
90
- fingerprint: this.fingerprint
90
+ fingerprint: this.fingerprint,
91
91
  });
92
92
  this.idxs = df.getCol('indexes');
93
93
  this.scores = df.getCol('score');
94
94
  }
95
95
 
96
96
  private async computeByMM() {
97
- if (!this.distanceMatrixComputed) {
97
+ let distanceArray = new Float32Array();
98
+ if (!this.distanceMatrixComputed && this.preComputeDistanceMatrix) {
98
99
  this.mmDistanceMatrix = await calcMmDistanceMatrix(this.moleculeColumn!);
99
100
  this.distanceMatrixComputed = true;
101
+ } else if (!this.preComputeDistanceMatrix) {
102
+ // use fast distance array calculation if matrix will take too much space
103
+ distanceArray = await calculateMMDistancesArray(this.moleculeColumn!, this.targetMoleculeIdx);
100
104
  }
101
105
  const len = this.moleculeColumn!.length;
102
106
  const linearizeFunc = dmLinearIndex(len);
103
107
  // array that keeps track of the indexes and scores together
104
108
  const indexWScore = Array(len).fill(0)
105
109
  .map((_, i) => ({idx: i, score: i === this.targetMoleculeIdx ? 1 :
106
- 1 - this.mmDistanceMatrix[linearizeFunc(this.targetMoleculeIdx, i)]}));
110
+ this.preComputeDistanceMatrix ? 1 - this.mmDistanceMatrix[linearizeFunc(this.targetMoleculeIdx, i)] :
111
+ 1 - distanceArray[i]
112
+ }));
107
113
  indexWScore.sort((a, b) => b.score - a.score);
108
114
  // get the most similar molecules
109
115
  const actualLimit = Math.min(this.limit, len);
@@ -127,7 +133,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
127
133
  propPanel.append(ui.divV([
128
134
  ui.divText(`Different sequence length:`, {style: {fontWeight: 'bold'}}),
129
135
  ui.divText(`target: ${subParts1.length} monomers`),
130
- ui.divText(`selected: ${subParts2.length} monomers`)
136
+ ui.divText(`selected: ${subParts2.length} monomers`),
131
137
  ], {style: {paddingBottom: '10px'}}));
132
138
  }
133
139
  propPanel.append(createDifferencesWithPositions(molDifferences));