@datagrok/bio 1.9.0 → 1.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/detectors.js CHANGED
@@ -8,6 +8,30 @@
8
8
  * TODO: Use detectors from WebLogo pickUp.. methods
9
9
  */
10
10
 
11
/**
 * Enum-like set of notation names, so the "user-friendly" notation is set from one place
 * instead of from scattered string literals.
 *
 * The values are written to a column's DG.TAGS.UNITS tag by the detectors in this file.
 * The object is frozen so the shared values cannot be changed at runtime.
 */
const NOTATION = Object.freeze({
  FASTA: 'fasta',
  SEPARATOR: 'separator',
  HELM: 'helm',
});
17
+
18
/**
 * Enum-like set of alphabet names used by the detectors in this file.
 *
 * DNA, RNA and PT label the candidate alphabets checked during detection (PT is paired with
 * the peptide alphabet); UN is the name detectAlphabet falls back to when it does not pick
 * any candidate. The object is frozen so the shared values cannot be changed at runtime.
 */
const ALPHABET = Object.freeze({
  DNA: 'DNA',
  RNA: 'RNA',
  PT: 'PT',
  UN: 'UN',
});
24
+
25
/** Class for handling notation units in Macromolecule columns */
class UnitsHandler {
  /**
   * Names of the column tags written by the macromolecule detectors.
   * Frozen so the shared lookup table cannot be modified at runtime.
   */
  static TAGS = Object.freeze({
    /** Alignment kind of the column; the detectors write 'SEQ' or 'SEQ.MSA'. */
    aligned: 'aligned',
    /** Name of the detected alphabet. */
    alphabet: 'alphabet',
    /** Number of distinct monomers found, stored as a string. */
    alphabetSize: '.alphabetSize',
    /** 'true' when at least one monomer name is longer than one character, otherwise 'false'. */
    alphabetIsMultichar: '.alphabetIsMultichar',
    /** Separator string, written only when a separator was detected. */
    separator: 'separator',
  });
}
11
35
 
12
36
  class BioPackageDetectors extends DG.Package {
13
37
 
@@ -52,7 +76,14 @@ class BioPackageDetectors extends DG.Package {
52
76
  !(col.categories.length == 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category value
53
77
  DG.Detector.sampleCategories(col, (s) => BioPackageDetectors.isHelm(s), 1)
54
78
  ) {
55
- col.setTag(DG.TAGS.UNITS, 'helm');
79
+ const statsAsHelm = BioPackageDetectors.getStats(col, 2, BioPackageDetectors.splitterAsHelm);
80
+ col.setTag(DG.TAGS.UNITS, NOTATION.HELM);
81
+
82
+ const alphabetSize = Object.keys(statsAsHelm.freq).length;
83
+ const alphabetIsMultichar = Object.keys(statsAsHelm.freq).some((m) => m.length > 1);
84
+ col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
85
+ col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
86
+
56
87
  return DG.SEMTYPE.MACROMOLECULE;
57
88
  }
58
89
 
@@ -62,9 +93,9 @@ class BioPackageDetectors extends DG.Package {
62
93
  ];
63
94
 
64
95
  const candidateAlphabets = [
65
- ['PT', BioPackageDetectors.PeptideFastaAlphabet, 0.55],
66
- ['DNA', BioPackageDetectors.DnaFastaAlphabet, 0.55],
67
- ['RNA', BioPackageDetectors.RnaFastaAlphabet, 0.55],
96
+ [ALPHABET.PT, BioPackageDetectors.PeptideFastaAlphabet, 0.55],
97
+ [ALPHABET.DNA, BioPackageDetectors.DnaFastaAlphabet, 0.55],
98
+ [ALPHABET.RNA, BioPackageDetectors.RnaFastaAlphabet, 0.55],
68
99
  ];
69
100
 
70
101
  // Check for url column, maybe it is too heavy check
@@ -87,23 +118,24 @@ class BioPackageDetectors extends DG.Package {
87
118
  // if (Object.keys(statsAsChars.freq).length === 0) return;
88
119
 
89
120
  const decoy = BioPackageDetectors.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
90
- if (decoy != 'UN') return null;
121
+ if (decoy != ALPHABET.UN) return null;
91
122
 
92
123
  if (statsAsChars.sameLength) {
93
124
  if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
94
125
  const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, candidateAlphabets, '-');
95
- if (alphabet === 'UN') return null;
126
+ if (alphabet === ALPHABET.UN) return null;
96
127
 
97
- const units = 'fasta';
128
+ const units = NOTATION.FASTA;
98
129
  col.setTag(DG.TAGS.UNITS, units);
99
- col.setTag('aligned', 'SEQ.MSA');
100
- col.setTag('alphabet', alphabet);
130
+ col.setTag(UnitsHandler.TAGS.aligned, 'SEQ.MSA');
131
+ col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
101
132
  return DG.SEMTYPE.MACROMOLECULE;
102
133
  }
103
134
  } else {
104
135
  const separator = BioPackageDetectors.detectSeparator(statsAsChars.freq);
105
136
  const gapSymbol = separator ? '' : '-';
106
- const splitter = separator ? BioPackageDetectors.getSplitterWithSeparator(separator) : BioPackageDetectors.splitterAsFasta;
137
+ const splitter = separator ? BioPackageDetectors.getSplitterWithSeparator(separator) :
138
+ BioPackageDetectors.splitterAsFasta;
107
139
 
108
140
  const stats = BioPackageDetectors.getStats(col, 5, splitter);
109
141
  // Empty monomer alphabet is not allowed
@@ -111,7 +143,7 @@ class BioPackageDetectors extends DG.Package {
111
143
  // Long monomer names for sequences with separators have constraints
112
144
  if (separator && BioPackageDetectors.checkForbiddenWithSeparators(stats.freq)) return null;
113
145
 
114
- const format = separator ? 'separator' : 'fasta';
146
+ const format = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
115
147
  const seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';
116
148
 
117
149
  // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
@@ -120,9 +152,15 @@ class BioPackageDetectors extends DG.Package {
120
152
  // const forbidden = BioPackageDetectors.checkForbiddenWoSeparator(stats.freq);
121
153
  if (separator || alphabet != 'UN') {
122
154
  col.setTag(DG.TAGS.UNITS, format);
123
- col.setTag('aligned', seqType);
124
- col.setTag('alphabet', alphabet);
125
- if (separator) col.setTag('separator', separator);
155
+ col.setTag(UnitsHandler.TAGS.aligned, seqType);
156
+ col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
157
+ if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
158
+ if (alphabet === ALPHABET.UN) {
159
+ const alphabetSize = Object.keys(stats.freq).length;
160
+ const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
161
+ col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
162
+ col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
163
+ }
126
164
  return DG.SEMTYPE.MACROMOLECULE;
127
165
  }
128
166
  }
@@ -218,7 +256,7 @@ class BioPackageDetectors extends DG.Package {
218
256
  const sim = candidatesSims.find((cs) => cs[4] == maxSim);
219
257
  alphabetName = sim[0];
220
258
  } else {
221
- alphabetName = 'UN';
259
+ alphabetName = ALPHABET.UN;
222
260
  }
223
261
  return alphabetName;
224
262
  }
@@ -297,4 +335,28 @@ class BioPackageDetectors extends DG.Package {
297
335
  '[MeNle]': 'L', // Nle - norleucine
298
336
  '[MeA]': 'A', '[MeG]': 'G', '[MeF]': 'F',
299
337
  };
338
+
339
+ static helmRe = /(PEPTIDE1|DNA1|RNA1)\{([^}]+)}/g;
340
+ static helmPp1Re = /\[([^\[\]]+)]/g;
341
+
342
+ /** Splits Helm string to monomers, but does not replace monomer names to other notation (e.g. for RNA). */
343
+ static splitterAsHelm(seq) {
344
+ BioPackageDetectors.helmRe.lastIndex = 0;
345
+ const ea = BioPackageDetectors.helmRe.exec(seq.toString());
346
+ const inSeq = ea ? ea[2] : null;
347
+
348
+ const mmPostProcess = (mm) => {
349
+ BioPackageDetectors.helmPp1Re.lastIndex = 0;
350
+ const pp1M = BioPackageDetectors.helmPp1Re.exec(mm);
351
+ if (pp1M && pp1M.length >= 2) {
352
+ return pp1M[1];
353
+ } else {
354
+ return mm;
355
+ }
356
+ };
357
+
358
+ const mmList = inSeq ? inSeq.split('.') : [];
359
+ const mmListRes = mmList.map(mmPostProcess);
360
+ return mmListRes;
361
+ }
300
362
  }