@datagrok/bio 2.4.31 → 2.4.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. package/.eslintrc.json +6 -8
  2. package/README.md +22 -7
  3. package/detectors.js +21 -12
  4. package/dist/1.js +2 -0
  5. package/dist/1.js.map +1 -0
  6. package/dist/18.js +2 -0
  7. package/dist/18.js.map +1 -0
  8. package/dist/190.js +2 -0
  9. package/dist/190.js.map +1 -0
  10. package/dist/452.js +2 -0
  11. package/dist/452.js.map +1 -0
  12. package/dist/729.js +2 -0
  13. package/dist/729.js.map +1 -0
  14. package/dist/package-test.js +1 -1
  15. package/dist/package-test.js.map +1 -1
  16. package/dist/package.js +1 -1
  17. package/dist/package.js.map +1 -1
  18. package/files/libraries/broken-lib.sdf +136 -0
  19. package/files/libraries/group1/mock-lib-3.json +74 -0
  20. package/files/libraries/mock-lib-2.json +48 -0
  21. package/files/tests/100_3_clustests.csv +100 -0
  22. package/files/tests/100_3_clustests_empty_vals.csv +100 -0
  23. package/files/tests/peptides_motif-with-random_10000.csv +9998 -0
  24. package/package.json +4 -4
  25. package/scripts/sequence_generator.py +164 -48
  26. package/src/analysis/sequence-activity-cliffs.ts +7 -9
  27. package/src/analysis/sequence-diversity-viewer.ts +8 -3
  28. package/src/analysis/sequence-search-base-viewer.ts +4 -3
  29. package/src/analysis/sequence-similarity-viewer.ts +13 -7
  30. package/src/analysis/sequence-space.ts +15 -12
  31. package/src/analysis/workers/mm-distance-array-service.ts +48 -0
  32. package/src/analysis/workers/mm-distance-array-worker.ts +29 -0
  33. package/src/analysis/workers/mm-distance-worker-creator.ts +6 -9
  34. package/src/apps/web-logo-app.ts +34 -0
  35. package/src/calculations/monomerLevelMols.ts +10 -12
  36. package/src/demo/bio01-similarity-diversity.ts +4 -5
  37. package/src/demo/bio01a-hierarchical-clustering-and-sequence-space.ts +6 -7
  38. package/src/demo/bio01b-hierarchical-clustering-and-activity-cliffs.ts +7 -8
  39. package/src/demo/bio03-atomic-level.ts +1 -4
  40. package/src/demo/bio05-helm-msa-sequence-space.ts +6 -4
  41. package/src/demo/utils.ts +3 -4
  42. package/src/package-test.ts +1 -2
  43. package/src/package.ts +135 -82
  44. package/src/seq_align.ts +482 -483
  45. package/src/substructure-search/substructure-search.ts +3 -3
  46. package/src/tests/Palettes-test.ts +1 -1
  47. package/src/tests/WebLogo-positions-test.ts +12 -35
  48. package/src/tests/_first-tests.ts +1 -1
  49. package/src/tests/activity-cliffs-tests.ts +10 -7
  50. package/src/tests/activity-cliffs-utils.ts +6 -5
  51. package/src/tests/bio-tests.ts +20 -25
  52. package/src/tests/checkInputColumn-tests.ts +5 -11
  53. package/src/tests/converters-test.ts +19 -37
  54. package/src/tests/detectors-benchmark-tests.ts +35 -37
  55. package/src/tests/detectors-tests.ts +29 -34
  56. package/src/tests/detectors-weak-and-likely-tests.ts +11 -21
  57. package/src/tests/fasta-export-tests.ts +3 -3
  58. package/src/tests/fasta-handler-test.ts +2 -3
  59. package/src/tests/lib-tests.ts +2 -4
  60. package/src/tests/mm-distance-tests.ts +25 -17
  61. package/src/tests/monomer-libraries-tests.ts +1 -1
  62. package/src/tests/msa-tests.ts +12 -9
  63. package/src/tests/pepsea-tests.ts +6 -3
  64. package/src/tests/renderers-test.ts +13 -11
  65. package/src/tests/sequence-space-test.ts +10 -8
  66. package/src/tests/sequence-space-utils.ts +6 -4
  67. package/src/tests/similarity-diversity-tests.ts +47 -61
  68. package/src/tests/splitters-test.ts +14 -20
  69. package/src/tests/to-atomic-level-tests.ts +9 -17
  70. package/src/tests/units-handler-splitted-tests.ts +106 -0
  71. package/src/tests/units-handler-tests.ts +22 -26
  72. package/src/tests/utils/sequences-generators.ts +6 -2
  73. package/src/tests/utils.ts +10 -4
  74. package/src/tests/viewers.ts +1 -1
  75. package/src/utils/atomic-works.ts +49 -57
  76. package/src/utils/cell-renderer.ts +25 -8
  77. package/src/utils/check-input-column.ts +19 -4
  78. package/src/utils/constants.ts +3 -3
  79. package/src/utils/convert.ts +56 -23
  80. package/src/utils/monomer-lib.ts +83 -64
  81. package/src/utils/multiple-sequence-alignment-ui.ts +24 -21
  82. package/src/utils/multiple-sequence-alignment.ts +2 -2
  83. package/src/utils/pepsea.ts +17 -7
  84. package/src/utils/save-as-fasta.ts +11 -4
  85. package/src/utils/ui-utils.ts +1 -1
  86. package/src/viewers/vd-regions-viewer.ts +21 -22
  87. package/src/viewers/web-logo-viewer.ts +189 -154
  88. package/src/widgets/bio-substructure-filter.ts +9 -6
  89. package/src/widgets/representations.ts +11 -12
  90. package/tsconfig.json +1 -1
  91. package/dist/258.js +0 -2
  92. package/dist/258.js.map +0 -1
  93. package/dist/457.js +0 -2
  94. package/dist/457.js.map +0 -1
  95. package/dist/562.js +0 -2
  96. package/dist/562.js.map +0 -1
  97. package/dist/925.js +0 -2
  98. package/dist/925.js.map +0 -1
  99. package/src/analysis/workers/mm-distance-worker.ts +0 -16
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "name": "Leonid Stolbov",
6
6
  "email": "lstolbov@datagrok.ai"
7
7
  },
8
- "version": "2.4.31",
8
+ "version": "2.4.39",
9
9
  "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
10
10
  "repository": {
11
11
  "type": "git",
@@ -14,11 +14,11 @@
14
14
  },
15
15
  "dependencies": {
16
16
  "@biowasm/aioli": "^3.1.0",
17
- "@datagrok-libraries/bio": "^5.30.0",
17
+ "@datagrok-libraries/bio": "^5.32.1",
18
18
  "@datagrok-libraries/chem-meta": "^1.0.1",
19
- "@datagrok-libraries/ml": "^6.3.27",
19
+ "@datagrok-libraries/ml": "^6.3.37",
20
20
  "@datagrok-libraries/tutorials": "^1.3.2",
21
- "@datagrok-libraries/utils": "^4.0.8",
21
+ "@datagrok-libraries/utils": "^4.0.11",
22
22
  "cash-dom": "^8.0.0",
23
23
  "css-loader": "^6.7.3",
24
24
  "datagrok-api": "^1.13.3",
@@ -13,21 +13,34 @@
13
13
  # input: bool disable_cliffs = False [Disable generation of cliffs]
14
14
  # input: double cliff_probability = 0.01 [Probability to make activity cliff of a sequence]
15
15
  # input: double cliff_strength = 4.0 [Strength of cliff]
16
+ # input: double fasta_separator = '' [Separator for a FASTA notation]
16
17
  # output: dataframe sequences
17
18
 
18
19
  import random
19
20
  import argparse
20
21
  import sys
22
+ from enum import Enum
21
23
 
22
24
  from typing import List, Tuple, Dict, Iterator, Any
23
25
 
24
- alphabet_type = List[str]
25
26
 
26
- letter_choice_type = List[str]
27
- motif_template_type = List[letter_choice_type]
27
+ # --- Type definitions ---
28
28
 
29
- sequence_record_type = Tuple[int, str, float, bool]
30
- sequence_record_cluster_type = Tuple[int, str, str, float, bool]
29
+ Letter = str
30
+ Alphabet = List[str]
31
+
32
+ LetterChoice = List[Letter]
33
+ MotifTemplate = List[LetterChoice]
34
+
35
+ Sequence = List[Letter] # The sequence in a form of list
36
+ SequenceSquashed = str # Sequence, joined together in string form
37
+
38
+ SequenceRecord = Tuple[int, Sequence, float, bool]
39
+ ClusterSequenceRecord = Tuple[int, str, Sequence, float, bool]
40
+
41
+ # --- constants ---
42
+
43
+ HelmConnectionMode = Enum("HelmConnectionMode", ["linear", "cyclic", "mixed"])
31
44
 
32
45
  alphabets: Dict[str, str] = {
33
46
  "PT": "A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
@@ -42,10 +55,10 @@ def mean_range(mean: int, disp: int) -> int:
42
55
 
43
56
  def generate_motif_template(
44
57
  motif_length: int,
45
- alphabet: alphabet_type,
58
+ alphabet: Alphabet,
46
59
  max_variants_cluster: int,
47
60
  prob_any: float = 0.2,
48
- ) -> motif_template_type:
61
+ ) -> MotifTemplate:
49
62
  motif_template = []
50
63
  for position in range(motif_length):
51
64
  # Selecting letters for position i
@@ -53,20 +66,20 @@ def generate_motif_template(
53
66
  letters = ["?"] # this stands for any symbol
54
67
  else:
55
68
  n_variants = random.randrange(max_variants_cluster) + 1
56
- letters = [random.choice(alphabet) for i in range(n_variants)]
69
+ letters = list(set((random.choice(alphabet) for i in range(n_variants))))
57
70
  motif_template.append(letters)
58
71
  return motif_template
59
72
 
60
73
 
61
- def generate_motif(template: motif_template_type, alphabet: alphabet_type) -> str:
74
+ def generate_motif(template: MotifTemplate, alphabet: Alphabet) -> Sequence:
62
75
  template_with_any = [
63
76
  (letters if not "?" in letters else alphabet) for letters in template
64
77
  ]
65
- return "".join([random.choice(letters) for letters in template_with_any])
78
+ return [random.choice(letters) for letters in template_with_any]
66
79
 
67
80
 
68
- def motif_notation(motif_template: motif_template_type) -> str:
69
- def motif_notation_code(letter_choice: letter_choice_type) -> str:
81
+ def motif_notation(motif_template: MotifTemplate) -> str:
82
+ def motif_notation_code(letter_choice: LetterChoice) -> str:
70
83
  if len(letter_choice) == 1:
71
84
  return letter_choice[0]
72
85
  else:
@@ -77,21 +90,51 @@ def motif_notation(motif_template: motif_template_type) -> str:
77
90
  )
78
91
 
79
92
 
80
- def generate_random(n: int, alphabet: alphabet_type) -> str:
81
- return "".join([random.choice(alphabet) for i in range(n)])
93
+ def generate_random(n: int, alphabet: Alphabet) -> Sequence:
94
+ return [random.choice(alphabet) for i in range(n)]
82
95
 
83
96
 
84
97
  def make_cliff(
85
- motif_template: motif_template_type, alphabet: alphabet_type, motif: str
86
- ) -> str:
98
+ motif_template: MotifTemplate, alphabet: Alphabet, motif: Sequence
99
+ ) -> Sequence:
87
100
  # Mutate conservative letter in motif
88
- pos = random.randrange(len(motif_template))
101
+ motif_len = len(motif_template)
102
+ pos = random.randrange(motif_len)
89
103
  while "?" in motif_template[pos]:
90
- pos = (pos + 1) % len(
91
- motif_template
92
- ) # always will find letters since ends of motif can't be any symbol
104
+ pos = (
105
+ pos + 1
106
+ ) % motif_len # always will find letters since ends of motif can't be any symbol
93
107
  outlier_letters = list(set(alphabet) - set(motif_template[pos]))
94
- return motif[:pos] + random.choice(outlier_letters) + motif[pos + 1 :]
108
+ new_letter = random.choice(outlier_letters)
109
+ return (
110
+ motif[:pos]
111
+ + [
112
+ new_letter,
113
+ ]
114
+ + motif[pos + 1 :]
115
+ )
116
+
117
+
118
+ def sequence_to_fasta(sequence: Sequence, separator: str) -> SequenceSquashed:
119
+ return separator.join(sequence)
120
+
121
+
122
+ def sequence_to_helm(
123
+ sequence: Sequence, helm_connection_mode: str = HelmConnectionMode.linear.name
124
+ ) -> SequenceSquashed:
125
+ def is_cyclic(helm_connection_mode: str) -> bool:
126
+ return helm_connection_mode == HelmConnectionMode.cyclic.name or (
127
+ helm_connection_mode == HelmConnectionMode.mixed.name
128
+ and random.random() < 0.5
129
+ )
130
+
131
+ sequence_escaped: Sequence = [
132
+ f"[{letter}]" if len(letter) > 1 else letter for letter in sequence
133
+ ]
134
+ connection_format = ""
135
+ if is_cyclic(helm_connection_mode):
136
+ connection_format = f"PEPTIDE1,PEPTIDE1,{len(sequence_escaped)}:R2-1:R1"
137
+ return f"PEPTIDE1{{{sequence_to_fasta(sequence_escaped,'.')}}}${connection_format}$$$V2.0"
95
138
 
96
139
 
97
140
  def generate_cluster(
@@ -99,16 +142,17 @@ def generate_cluster(
99
142
  motif_length: int,
100
143
  prefix_length: int,
101
144
  suffix_length: int,
102
- max_variants_position: int,
145
+ max_variants_per_position: int,
103
146
  make_cliffs: bool,
104
- alphabet: alphabet_type,
147
+ alphabet: Alphabet,
105
148
  cliff_probability: float,
106
149
  cliff_strength: float,
107
- ) -> Iterator[sequence_record_type]:
150
+ ) -> Iterator[SequenceRecord]:
151
+ # Making a motif template
108
152
  motif_template = generate_motif_template(
109
- motif_length, alphabet, max_variants_position
153
+ motif_length, alphabet, max_variants_per_position
110
154
  )
111
-
155
+ # Setting average and dispersion for activity
112
156
  activity_average = random.random() * 10
113
157
  activity_dispersion = random.random()
114
158
  sys.stderr.write(f"Motif template: {motif_notation(motif_template)}\n")
@@ -120,11 +164,10 @@ def generate_cluster(
120
164
  prefix = generate_random(prefix_length, alphabet)
121
165
  suffix = generate_random(suffix_length, alphabet)
122
166
  seq = prefix + motif + suffix
123
-
124
- is_cliff = make_cliffs and (random.random() <= cliff_probability)
125
- sequence_record: sequence_record_type = (n_seq, seq, activity, is_cliff)
167
+ sequence_record: SequenceRecord = (n_seq, seq, activity, False)
126
168
  yield sequence_record
127
169
 
170
+ is_cliff = make_cliffs and (random.random() <= cliff_probability)
128
171
  if is_cliff:
129
172
  # Making activity cliff
130
173
  cliff_motif = make_cliff(motif_template, alphabet, motif)
@@ -146,16 +189,16 @@ def generate_sequences(
146
189
  n_clusters: int,
147
190
  n_sequences: int,
148
191
  average_motif_length: int,
149
- max_variants_position: int,
192
+ max_variants_per_position: int,
150
193
  average_random_length: int,
151
194
  dispersion: int,
152
- alphabet: alphabet_type,
195
+ alphabet: Alphabet,
153
196
  make_cliffs: bool,
154
197
  cliff_probability: float,
155
198
  cliff_strength: float,
156
- ) -> Tuple[List[str], List[sequence_record_cluster_type]]:
199
+ ) -> Tuple[List[str], List[ClusterSequenceRecord]]:
157
200
  headers: List[str] = ["cluster", "sequence_id", "sequence", "activity", "is_cliff"]
158
- sequences: List[sequence_record_cluster_type] = []
201
+ sequences: List[ClusterSequenceRecord] = []
159
202
 
160
203
  for n_cluster in range(n_clusters):
161
204
  motif_length = mean_range(average_motif_length, dispersion)
@@ -170,28 +213,71 @@ def generate_sequences(
170
213
  motif_length,
171
214
  prefix_length,
172
215
  suffix_length,
173
- max_variants_position,
216
+ max_variants_per_position,
174
217
  make_cliffs,
175
218
  alphabet,
176
219
  cliff_probability,
177
220
  cliff_strength,
178
221
  ):
179
222
  sequences.append(
180
- (n_cluster, f"c{n_cluster}_s{n_seq}", seq, activity, is_cliff)
223
+ (n_cluster, f"c{n_cluster}_s{n_seq:03d}", seq, activity, is_cliff)
181
224
  )
182
225
  return headers, sequences
183
226
 
184
227
 
228
+ def convert_to_fasta(
229
+ cluster_sequence_records: List[ClusterSequenceRecord], separator: str
230
+ ) -> List[Tuple[int, str, str, float, bool]]:
231
+ return [
232
+ (n_cluster, name_cluster, sequence_to_fasta(seq, separator), activity, is_cliff)
233
+ for n_cluster, name_cluster, seq, activity, is_cliff in cluster_sequence_records
234
+ ]
235
+
236
+
237
+ def convert_to_helm(
238
+ cluster_sequence_records: List[ClusterSequenceRecord], helm_connection_mode: str
239
+ ) -> List[Tuple[int, str, str, float, bool]]:
240
+ return [
241
+ (
242
+ n_cluster,
243
+ name_cluster,
244
+ sequence_to_helm(seq, helm_connection_mode),
245
+ activity,
246
+ is_cliff,
247
+ )
248
+ for n_cluster, name_cluster, seq, activity, is_cliff in cluster_sequence_records
249
+ ]
250
+
251
+
252
+ def is_monomer_suitable(monomer: Any) -> bool:
253
+ return (
254
+ monomer["polymerType"] == "PEPTIDE"
255
+ and monomer["monomerType"] == "Backbone"
256
+ and len(monomer["rgroups"]) == 2
257
+ )
258
+
259
+
260
+ def alphabet_from_helm(helm_library_file: str) -> Alphabet:
261
+ import json
262
+
263
+ alphabet: Alphabet = []
264
+ with open(helm_library_file) as helm_library:
265
+ for monomer in json.load(helm_library):
266
+ if is_monomer_suitable(monomer):
267
+ alphabet.append(monomer["symbol"])
268
+ return alphabet
269
+
270
+
185
271
  def parse_command_line_args() -> Any:
186
272
  parser = argparse.ArgumentParser(
187
273
  prog="MotifSequencesGenerator",
188
274
  description="The program generates set of sequences containing sequence motifs "
189
- "for SAR fucntionality testing",
190
- epilog="Utility support: Gennadii Zakharov",
275
+ "for SAR functionality testing",
276
+ epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
191
277
  )
192
278
 
193
279
  parser.add_argument(
194
- "-c", "--clusters", type=int, default=5, help="Number of superclusters"
280
+ "-c", "--clusters", type=int, default=5, help="Number of clusters"
195
281
  )
196
282
  parser.add_argument(
197
283
  "-s",
@@ -219,6 +305,21 @@ def parse_command_line_args() -> Any:
219
305
  help="Variation of total sequence length",
220
306
  )
221
307
 
308
+ parser.add_argument(
309
+ "-h,",
310
+ "--helm-library-file",
311
+ type=str,
312
+ help="JSON file containing the HELM monomer library in the same format as used for Datagrok. "
313
+ + "The alphabet property is ignored when helm library is specified.",
314
+ )
315
+
316
+ parser.add_argument(
317
+ "--helm-connection-mode",
318
+ type=str,
319
+ default=HelmConnectionMode.linear.value,
320
+ help=f"HELM peptide generation mode: {'/'.join([mode.name for mode in HelmConnectionMode])}",
321
+ )
322
+
222
323
  available_alphabets = ",".join(list(alphabets.keys()) + ["custom"])
223
324
  parser.add_argument(
224
325
  "--alphabet",
@@ -251,7 +352,12 @@ def parse_command_line_args() -> Any:
251
352
  default=False,
252
353
  help="Disable generation of cliffs",
253
354
  )
254
-
355
+ parser.add_argument(
356
+ "--fasta-separator",
357
+ type=str,
358
+ default="",
359
+ help="Separator symbol for FASTA sequence",
360
+ )
255
361
  command_line_args = parser.parse_args()
256
362
 
257
363
  return command_line_args
@@ -274,12 +380,18 @@ if not grok:
274
380
  disable_cliffs = args.disable_cliffs
275
381
  cliff_probability = args.cliff_probability
276
382
  cliff_strength = args.cliff_strength
277
-
278
- alphabet: alphabet_type = (
279
- alphabets[alphabet_key].split(",")
280
- if alphabet_key in alphabets
281
- else alphabet_key.split(",")
282
- )
383
+ fasta_separator = args.fasta_separator
384
+ helm_library_file = args.helm_library_file
385
+ helm_connection_mode = args.helm_connection_mode
386
+
387
+ if helm_library_file is None:
388
+ alphabet: Alphabet = (
389
+ alphabets[alphabet_key].split(",")
390
+ if alphabet_key in alphabets
391
+ else alphabet_key.split(",")
392
+ )
393
+ else:
394
+ alphabet = alphabet_from_helm(helm_library_file)
283
395
 
284
396
  # Running sequence generator
285
397
  header, data = generate_sequences(
@@ -294,17 +406,21 @@ header, data = generate_sequences(
294
406
  cliff_probability,
295
407
  cliff_strength,
296
408
  )
409
+ if helm_library_file is None:
410
+ data_formatted = convert_to_fasta(data, fasta_separator)
411
+ else:
412
+ data_formatted = convert_to_helm(data, helm_connection_mode)
297
413
 
298
414
  if grok:
299
- # Exporting data to Datagrok as a pandas dataframe
415
+ # Exporting data to Datagrok as a Pandas dataframe
300
416
  import pandas as pd
301
417
 
302
- sequences = pd.DataFrame.from_records(data, columns=header)
418
+ sequences = pd.DataFrame.from_records(data_formatted, columns=header)
303
419
  else:
304
420
  # Writing results to stdout - no need to work with big and heavy Pandas
305
421
  import csv
306
422
 
307
423
  csv_writer = csv.writer(sys.stdout, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
308
424
  csv_writer.writerow(header)
309
- for line in data:
425
+ for line in data_formatted:
310
426
  csv_writer.writerow(line)
@@ -6,8 +6,6 @@ import {ITooltipAndPanelParams} from '@datagrok-libraries/ml/src/viewers/activit
6
6
  import {getSimilarityFromDistance} from '@datagrok-libraries/ml/src/distance-metrics-methods';
7
7
  import {AvailableMetrics, DistanceMetricsSubjects, StringMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
8
8
  import {drawMoleculeDifferenceOnCanvas} from '../utils/cell-renderer';
9
- import * as C from '../utils/constants';
10
- import {GridColumn} from 'datagrok-api/dg';
11
9
  import {invalidateMols, MONOMERIC_COL_TAGS} from '../substructure-search/substructure-search';
12
10
  import {getSplitter, TAGS as bioTAGS} from '@datagrok-libraries/bio/src/utils/macromolecule';
13
11
 
@@ -24,7 +22,7 @@ export async function getDistances(col: DG.Column, seq: string): Promise<Array<n
24
22
  }
25
23
 
26
24
  export async function getSimilaritiesMatrix(
27
- dim: number, seqCol: DG.Column, df: DG.DataFrame, colName: string, simArr: DG.Column[]
25
+ dim: number, seqCol: DG.Column, df: DG.DataFrame, colName: string, simArr: DG.Column[],
28
26
  ): Promise<DG.Column[]> {
29
27
  const distances = new Array(simArr.length).fill(null);
30
28
  for (let i = 0; i != dim - 1; ++i) {
@@ -54,7 +52,7 @@ export async function getChemSimilaritiesMatrix(dim: number, seqCol: DG.Column,
54
52
  col: seqCol.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS],
55
53
  df: fpDf,
56
54
  colName: colName,
57
- simArr: simArr
55
+ simArr: simArr,
58
56
  });
59
57
  return res;
60
58
  }
@@ -69,7 +67,7 @@ export function createTooltipElement(params: ITooltipAndPanelParams): HTMLDivEle
69
67
  columnNames.style.display = 'flex';
70
68
  columnNames.style.justifyContent = 'space-between';
71
69
  tooltipElement.append(columnNames);
72
- params.line.mols.forEach((molIdx: number, idx: number) => {
70
+ params.line.mols.forEach((molIdx: number, _idx: number) => {
73
71
  const activity = ui.divText(params.activityCol.get(molIdx).toFixed(2));
74
72
  activity.style.display = 'flex';
75
73
  activity.style.justifyContent = 'left';
@@ -82,7 +80,7 @@ export function createTooltipElement(params: ITooltipAndPanelParams): HTMLDivEle
82
80
  return tooltipElement;
83
81
  }
84
82
 
85
- function moleculeInfo(df: DG.DataFrame, idx: number, seqColName: string): HTMLElement {
83
+ function _moleculeInfo(df: DG.DataFrame, idx: number, seqColName: string): HTMLElement {
86
84
  const dict: { [key: string]: string } = {};
87
85
  for (const col of df.columns) {
88
86
  if (col.name !== seqColName)
@@ -124,7 +122,7 @@ export function createPropPanelElement(params: ITooltipAndPanelParams): HTMLDivE
124
122
  function createPropPanelField(name: string, value: number): HTMLDivElement {
125
123
  return ui.divH([
126
124
  ui.divText(`${name}: `, {style: {fontWeight: 'bold', paddingRight: '5px'}}),
127
- ui.divText(value.toFixed(2))
125
+ ui.divText(value.toFixed(2)),
128
126
  ], {style: {paddingTop: '10px'}});
129
127
  }
130
128
 
@@ -147,13 +145,13 @@ export function createDifferencesWithPositions(
147
145
  const diffsPanel = ui.divV([]);
148
146
  diffsPanel.append(ui.divH([
149
147
  ui.divText('Pos', {style: {fontWeight: 'bold', width: '30px', borderBottom: '1px solid'}}),
150
- ui.divText('Difference', {style: {fontWeight: 'bold', borderBottom: '1px solid'}})
148
+ ui.divText('Difference', {style: {fontWeight: 'bold', borderBottom: '1px solid'}}),
151
149
  ]));
152
150
  for (const key of Object.keys(molDifferences)) {
153
151
  molDifferences[key as any].style.borderBottom = '1px solid lightgray';
154
152
  diffsPanel.append(ui.divH([
155
153
  ui.divText((parseInt(key) + 1).toString(), {style: {width: '30px', borderBottom: '1px solid lightgray'}}),
156
- molDifferences[key as any]
154
+ molDifferences[key as any],
157
155
  ]));
158
156
  }
159
157
  div.append(diffsPanel);
@@ -27,7 +27,7 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
27
27
  return;
28
28
  if (this.dataFrame) {
29
29
  if (computeData && this.moleculeColumn) {
30
- const uh = new UnitsHandler(this.moleculeColumn);
30
+ const uh = UnitsHandler.getOrCreate(this.moleculeColumn);
31
31
  await (uh.isFasta() ? this.computeByMM() : this.computeByChem());
32
32
 
33
33
  const diverseColumnName: string = this.diverseColumnLabel != null ? this.diverseColumnLabel :
@@ -37,6 +37,8 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
37
37
  resCol.semType = DG.SEMTYPE.MACROMOLECULE;
38
38
  this.tags.forEach((tag) => resCol.setTag(tag, this.moleculeColumn!.getTag(tag)));
39
39
  const resDf = DG.DataFrame.fromColumns([resCol]);
40
+ resDf.onCurrentRowChanged.subscribe(
41
+ (_) => { this.dataFrame.currentRowIdx = this.renderMolIds![resDf.currentRowIdx]; });
40
42
  updateDivInnerHTML(this.root, resDf.plot.grid().root);
41
43
  this.computeCompleted.next(true);
42
44
  }
@@ -51,7 +53,7 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
51
53
  col: monomericMols,
52
54
  metricName: this.distanceMetric,
53
55
  limit: this.limit,
54
- fingerprint: this.fingerprint
56
+ fingerprint: this.fingerprint,
55
57
  });
56
58
  }
57
59
 
@@ -60,6 +62,9 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
60
62
  const len = this.moleculeColumn!.length;
61
63
  const linearizeFunc = dmLinearIndex(len);
62
64
  this.renderMolIds = getDiverseSubset(len, Math.min(len, this.limit),
63
- (i1: number, i2: number) => distanceMatrixData[linearizeFunc(i1, i2)]);
65
+ (i1: number, i2: number) => {
66
+ return this.moleculeColumn!.isNone(i1) || this.moleculeColumn!.isNone(i2) ? 0 :
67
+ distanceMatrixData[linearizeFunc(i1, i2)];
68
+ });
64
69
  }
65
70
  }
@@ -4,8 +4,8 @@ import * as grok from 'datagrok-api/grok';
4
4
 
5
5
  import {CHEM_SIMILARITY_METRICS} from '@datagrok-libraries/ml/src/distance-metrics-methods';
6
6
  import {TAGS as bioTAGS} from '@datagrok-libraries/bio/src/utils/macromolecule';
7
- import * as C from '../utils/constants';
8
7
 
8
+ const MAX_ROWS_FOR_DISTANCE_MATRIX = 22000;
9
9
  export class SequenceSearchBaseViewer extends DG.JsViewer {
10
10
  name: string = '';
11
11
  distanceMetric: string;
@@ -17,7 +17,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
17
17
  moleculeColumnName: string;
18
18
  initialized: boolean = false;
19
19
  tags = [DG.TAGS.UNITS, bioTAGS.aligned, bioTAGS.separator, bioTAGS.alphabet];
20
-
20
+ preComputeDistanceMatrix: boolean = false;
21
21
  constructor(name: string) {
22
22
  super();
23
23
  this.fingerprint = this.string('fingerprint', this.fingerprintChoices[0], {choices: this.fingerprintChoices});
@@ -39,6 +39,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
39
39
  this.init();
40
40
 
41
41
  if (this.dataFrame) {
42
+ this.preComputeDistanceMatrix = this.dataFrame.rowCount <= MAX_ROWS_FOR_DISTANCE_MATRIX;
42
43
  this.subs.push(DG.debounce(this.dataFrame.onRowsRemoved, 50).subscribe(async (_: any) => await this.render()));
43
44
  const compute = this.name !== 'diversity';
44
45
  this.subs.push(DG.debounce(this.dataFrame.onCurrentRowChanged, 50)
@@ -66,7 +67,7 @@ export class SequenceSearchBaseViewer extends DG.JsViewer {
66
67
  this.render();
67
68
  }
68
69
 
69
- async render(computeData = true) {
70
+ async render(_computeData = true) {
70
71
 
71
72
  }
72
73
 
@@ -4,13 +4,13 @@ import * as DG from 'datagrok-api/dg';
4
4
 
5
5
  import {SequenceSearchBaseViewer} from './sequence-search-base-viewer';
6
6
  import {getMonomericMols} from '../calculations/monomerLevelMols';
7
- import * as C from '../utils/constants';
8
7
  import {createDifferenceCanvas, createDifferencesWithPositions} from './sequence-activity-cliffs';
9
8
  import {updateDivInnerHTML} from '../utils/ui-utils';
10
9
  import {Subject} from 'rxjs';
11
10
  import {TAGS as bioTAGS, getSplitter} from '@datagrok-libraries/bio/src/utils/macromolecule';
12
11
  import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
13
12
  import {calcMmDistanceMatrix, dmLinearIndex} from './workers/mm-distance-worker-creator';
13
+ import {calculateMMDistancesArray} from './workers/mm-distance-array-service';
14
14
 
15
15
  export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
16
16
  cutoff: number;
@@ -47,7 +47,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
47
47
  this.curIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
48
48
  if (computeData && !this.gridSelect) {
49
49
  this.targetMoleculeIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
50
- const uh = new UnitsHandler(this.moleculeColumn!);
50
+ const uh = UnitsHandler.getOrCreate(this.moleculeColumn!);
51
51
 
52
52
  await (uh.isFasta() ? this.computeByMM() : this.computeByChem());
53
53
  const similarColumnName: string = this.similarColumnLabel != null ? this.similarColumnLabel :
@@ -67,7 +67,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
67
67
  const targetMolRow = this.idxs?.getRawData().findIndex((it) => it == this.targetMoleculeIdx);
68
68
  const targetScoreCell = grid.cell('score', targetMolRow!);
69
69
  targetScoreCell.cell.value = null;
70
- (grok.shell.v as DG.TableView).grid.root.addEventListener('click', (event: MouseEvent) => {
70
+ (grok.shell.v as DG.TableView).grid.root.addEventListener('click', (_event: MouseEvent) => {
71
71
  this.gridSelect = false;
72
72
  });
73
73
  updateDivInnerHTML(this.root, grid.root);
@@ -87,23 +87,29 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
87
87
  metricName: this.distanceMetric,
88
88
  limit: this.limit,
89
89
  minScore: this.cutoff,
90
- fingerprint: this.fingerprint
90
+ fingerprint: this.fingerprint,
91
91
  });
92
92
  this.idxs = df.getCol('indexes');
93
93
  this.scores = df.getCol('score');
94
94
  }
95
95
 
96
96
  private async computeByMM() {
97
- if (!this.distanceMatrixComputed) {
97
+ let distanceArray = new Float32Array();
98
+ if (!this.distanceMatrixComputed && this.preComputeDistanceMatrix) {
98
99
  this.mmDistanceMatrix = await calcMmDistanceMatrix(this.moleculeColumn!);
99
100
  this.distanceMatrixComputed = true;
101
+ } else if (!this.preComputeDistanceMatrix) {
102
+ // use fast distance array calculation if matrix will take too much space
103
+ distanceArray = await calculateMMDistancesArray(this.moleculeColumn!, this.targetMoleculeIdx);
100
104
  }
101
105
  const len = this.moleculeColumn!.length;
102
106
  const linearizeFunc = dmLinearIndex(len);
103
107
  // array that keeps track of the indexes and scores together
104
108
  const indexWScore = Array(len).fill(0)
105
109
  .map((_, i) => ({idx: i, score: i === this.targetMoleculeIdx ? 1 :
106
- 1 - this.mmDistanceMatrix[linearizeFunc(this.targetMoleculeIdx, i)]}));
110
+ this.preComputeDistanceMatrix ? 1 - this.mmDistanceMatrix[linearizeFunc(this.targetMoleculeIdx, i)] :
111
+ 1 - distanceArray[i]
112
+ }));
107
113
  indexWScore.sort((a, b) => b.score - a.score);
108
114
  // get the most similar molecules
109
115
  const actualLimit = Math.min(this.limit, len);
@@ -127,7 +133,7 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
127
133
  propPanel.append(ui.divV([
128
134
  ui.divText(`Different sequence length:`, {style: {fontWeight: 'bold'}}),
129
135
  ui.divText(`target: ${subParts1.length} monomers`),
130
- ui.divText(`selected: ${subParts2.length} monomers`)
136
+ ui.divText(`selected: ${subParts2.length} monomers`),
131
137
  ], {style: {paddingBottom: '10px'}}));
132
138
  }
133
139
  propPanel.append(createDifferencesWithPositions(molDifferences));
@@ -1,19 +1,17 @@
1
1
  import * as DG from 'datagrok-api/dg';
2
- import {AvailableMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
3
2
  import {reduceDimensinalityWithNormalization} from '@datagrok-libraries/ml/src/sequence-space';
4
3
  import {BitArrayMetrics, StringMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
5
4
  import {Matrix} from '@datagrok-libraries/utils/src/type-declarations';
6
- import BitArray from '@datagrok-libraries/utils/src/bit-array';
7
5
  import {ISequenceSpaceParams} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
8
6
  import {invalidateMols, MONOMERIC_COL_TAGS} from '../substructure-search/substructure-search';
9
7
  import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
10
8
  import * as grok from 'datagrok-api/grok';
11
- import { NotationConverter } from '@datagrok-libraries/bio/src/utils/notation-converter';
12
- import { ALPHABET, NOTATION } from '@datagrok-libraries/bio/src/utils/macromolecule';
13
- import { MmDistanceFunctionsNames } from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
9
+ import {NotationConverter} from '@datagrok-libraries/bio/src/utils/notation-converter';
10
+ import {ALPHABET, NOTATION} from '@datagrok-libraries/bio/src/utils/macromolecule';
11
+ import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
14
12
 
15
13
  export interface ISequenceSpaceResult {
16
- distance: Matrix;
14
+ distance?: Float32Array;
17
15
  coordinates: DG.ColumnList;
18
16
  }
19
17
 
@@ -44,7 +42,8 @@ export async function sequenceSpace(spaceParams: ISequenceSpaceParams): Promise<
44
42
 
45
43
  export async function sequenceSpaceByFingerprints(spaceParams: ISequenceSpaceParams): Promise<ISequenceSpaceResult> {
46
44
  if (spaceParams.seqCol.version !== spaceParams.seqCol.temp[MONOMERIC_COL_TAGS.LAST_INVALIDATED_VERSION])
47
- await invalidateMols(spaceParams.seqCol as unknown as DG.Column<string>, false); //we expect only string columns here
45
+ //we expect only string columns here
46
+ await invalidateMols(spaceParams.seqCol as unknown as DG.Column<string>, false);
48
47
 
49
48
  const result = await grok.functions.call('Chem:getChemSpaceEmbeddings', {
50
49
  col: spaceParams.seqCol.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS],
@@ -52,7 +51,7 @@ export async function sequenceSpaceByFingerprints(spaceParams: ISequenceSpacePar
52
51
  similarityMetric: spaceParams.similarityMetric,
53
52
  xAxis: spaceParams.embedAxesNames[0],
54
53
  yAxis: spaceParams.embedAxesNames[1],
55
- options: spaceParams.options
54
+ options: spaceParams.options,
56
55
  });
57
56
  return result;
58
57
  }
@@ -65,17 +64,21 @@ export async function getSequenceSpace(spaceParams: ISequenceSpaceParams): Promi
65
64
  if (nc.isSeparator()) {
66
65
  const fastaCol = nc.convert(NOTATION.FASTA);
67
66
  seqList = fastaCol.toList();
68
- const uh = new UnitsHandler(fastaCol);
67
+ const uh = UnitsHandler.getOrCreate(fastaCol);
69
68
  distanceFName = uh.getDistanceFunctionName();
70
- }
71
- else {
69
+ } else {
72
70
  distanceFName = nc.getDistanceFunctionName();
73
71
  }
72
+ for (let i = 0; i < seqList.length; i++) {
73
+ // toList puts empty values in array and it causes downstream errors. replace with null
74
+ seqList[i] = spaceParams.seqCol.isNone(i) ? null : seqList[i];
75
+ }
74
76
  const sequenceSpaceResult = await reduceDimensinalityWithNormalization(
75
77
  seqList,
76
78
  spaceParams.methodName,
77
79
  distanceFName,
78
- spaceParams.options);
80
+ spaceParams.options,
81
+ true);
79
82
  const cols: DG.Column[] = spaceParams.embedAxesNames.map(
80
83
  (name: string, index: number) => DG.Column.fromFloat32Array(name, sequenceSpaceResult.embedding[index]));
81
84
  return {distance: sequenceSpaceResult.distance, coordinates: new DG.ColumnList(cols)};