@datagrok/bio 1.5.8 → 1.6.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/detectors.js CHANGED
@@ -22,6 +22,10 @@ class BioPackageDetectors extends DG.Package {
22
22
 
23
23
  static RnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
24
24
 
25
+ static SmilesRawAlphabet = new Set([
26
+ 'O', 'C', 'c', 'N', 'S', 'F', '(', ')',
27
+ '1', '2', '3', '4', '5', '6', '7',
28
+ '+', '-', '@', '[', ']', '/', '\\', '#', '=']);
25
29
 
26
30
  /** @param s {String} - string to check
27
31
  * @returns {boolean} */
@@ -40,7 +44,11 @@ class BioPackageDetectors extends DG.Package {
40
44
  return BioPackageDetectors.mmSemType;
41
45
  }
42
46
 
43
- const alphabetCandidates = [
47
+ const decoyAlphabets = [
48
+ ['SMILES', BioPackageDetectors.SmilesRawAlphabet],
49
+ ];
50
+
51
+ const candidateAlphabets = [
44
52
  ['PT', BioPackageDetectors.PeptideFastaAlphabet],
45
53
  ['DNA', BioPackageDetectors.DnaFastaAlphabet],
46
54
  ['RNA', BioPackageDetectors.RnaFastaAlphabet],
@@ -49,9 +57,13 @@ class BioPackageDetectors extends DG.Package {
49
57
  // TODO: Detect HELM sequence
50
58
  // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
51
59
  const statsAsChars = BioPackageDetectors.getStats(col, 5, BioPackageDetectors.splitterAsChars);
60
+
61
+ const decoy = BioPackageDetectors.detectAlphabet(statsAsChars.freq, decoyAlphabets, null, 0.5);
62
+ if (decoy != 'UN') return null;
63
+
52
64
  if (statsAsChars.sameLength) {
53
65
  if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
54
- const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, alphabetCandidates, '-');
66
+ const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, candidateAlphabets, '-');
55
67
  if (alphabet === 'UN') return null;
56
68
 
57
69
  const units = `fasta:SEQ.MSA:${alphabet}`;
@@ -73,10 +85,10 @@ class BioPackageDetectors extends DG.Package {
73
85
  const seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';
74
86
 
75
87
  // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
76
- const alphabet = BioPackageDetectors.detectAlphabet(stats.freq, alphabetCandidates, gapSymbol);
88
+ const alphabet = BioPackageDetectors.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
77
89
 
78
- const forbidden = BioPackageDetectors.checkForbiddenWoSeparator(stats.freq);
79
- if (separator || !forbidden) {
90
+ // const forbidden = BioPackageDetectors.checkForbiddenWoSeparator(stats.freq);
91
+ if (separator || alphabet != 'UN') {
80
92
  const units = `${format}:${seqType}:${alphabet}`;
81
93
  col.setTag(DG.TAGS.UNITS, units);
82
94
  if (separator) col.setTag('separator', separator);
@@ -123,11 +135,11 @@ class BioPackageDetectors extends DG.Package {
123
135
  return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
124
136
  }
125
137
 
126
- /** Without a separator, special symbols or digits are not allowed as monomers. */
127
- static checkForbiddenWoSeparator(freq) {
128
- const forbiddenRe = /[\d!@#$%^&*()_+\-=\[\]{};':"\\|,.<>\/?]/i;
129
- return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
130
- }
138
+ // /** Without a separator, special symbols or digits are not allowed as monomers. */
139
+ // static checkForbiddenWoSeparator(freq) {
140
+ // const forbiddenRe = /[\d!@#$%^&*()_+\-=\[\]{};':"\\|,.<>\/?]/i;
141
+ // return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
142
+ // }
131
143
 
132
144
  /** Stats of sequences with specified splitter func, returns { freq, sameLength } */
133
145
  static getStats(seqCol, minLength, splitter) {
@@ -160,7 +172,7 @@ class BioPackageDetectors extends DG.Package {
160
172
  * @param freq frequencies of monomers in sequence set
161
173
  * @param candidates an array of pairs [name, monomer set]
162
174
  * */
163
- static detectAlphabet(freq, candidates, gapSymbol) {
175
+ static detectAlphabet(freq, candidates, gapSymbol, cut = 0.65) {
164
176
  const candidatesSims = candidates.map((c) => {
165
177
  const sim = BioPackageDetectors.getAlphabetSimilarity(freq, c[1], gapSymbol);
166
178
  return [c[0], c[1], freq, sim];
@@ -168,7 +180,7 @@ class BioPackageDetectors extends DG.Package {
168
180
 
169
181
  let alphabetName;
170
182
  const maxSim = Math.max(...candidatesSims.map((cs) => cs[3]));
171
- if (maxSim > 0.65) {
183
+ if (maxSim > cut) {
172
184
  const sim = candidatesSims.find((cs) => cs[3] == maxSim);
173
185
  alphabetName = sim[0];
174
186
  } else {