@datagrok/bio 2.11.3 → 2.11.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # Bio changelog
2
2
 
3
+ ## 2.12.0 (WIP)
4
+
5
+ ### Features
6
+
7
+ ### Bug fixes
8
+ * Fix detectMacromolecule allowing double quoted sequences and gaps.
9
+ * Fix for min seq length 10, tests.
10
+
3
11
  ## 2.11.0 (2023-10-25)
4
12
 
5
13
  ### Features
@@ -8,6 +16,7 @@
8
16
  * Add ToAtomicLevel for non-linear HELM structures.
9
17
  * Add WebLogo aggregation function.
10
18
  * Add WebLogo position tooltip with composition table (for count).
19
+ * Add PolyTool with Helm2Molfile support
11
20
 
12
21
  ### Bug fixes
13
22
 
package/detectors.js CHANGED
@@ -81,6 +81,19 @@ class BioPackageDetectors extends DG.Package {
81
81
  s.startsWith('RNA1{') || s.startsWith('DNA1{');
82
82
  }
83
83
 
84
+ //name: detectMacromoleculeEnableStore
85
+ //output: object result
86
+ detectMacromoleculeEnableStore() {
87
+ return window.$detectMacromoleculeStore = {last: null};
88
+ }
89
+
90
+ /** Returns last object (stores it if enabled earlier). */
91
+ detectMacromoleculeStoreLast() {
92
+ const last = {};
93
+ if (window.$detectMacromoleculeStore) window.$detectMacromoleculeStore.last = last;
94
+ return last;
95
+ }
96
+
84
97
  //tags: semTypeDetector
85
98
  //input: column col
86
99
  //output: string semType
@@ -89,13 +102,18 @@ class BioPackageDetectors extends DG.Package {
89
102
  console.debug(`Bio: detectMacromolecule( table: ${tableName}.${col.name} ), start`);
90
103
  const t1 = Date.now();
91
104
  try {
105
+ const last = this.detectMacromoleculeStoreLast();
92
106
  const colName = col.name;
93
107
  const colNameLikely = this.likelyColNamePartList.some(
94
108
  (requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart));
95
- const seqMinLength = colNameLikely ? 3 : 5;
109
+ const seqMinLength = colNameLikely ? 7 : 10;
110
+ const maxBadRatio = colNameLikely ? 0.05 : 0.005;
96
111
 
97
112
  // Fail early
98
- if (col.type !== DG.TYPE.STRING) return null;
113
+ if (col.type !== DG.TYPE.STRING) {
114
+ last.rejectReason = `The column must be of type '${DG.TYPE.STRING}'.`;
115
+ return null;
116
+ }
99
117
 
100
118
  const categoriesSample = [...new Set((col.length < SEQ_SAMPLE_LIMIT ?
101
119
  wu.count(0).take(Math.min(SEQ_SAMPLE_LIMIT, col.length)).map((rowI) => col.get(rowI)) :
@@ -103,6 +121,7 @@ class BioPackageDetectors extends DG.Package {
103
121
  .map((seq) => !!seq ? seq.substring(0, SEQ_SAMPLE_LENGTH_LIMIT * 5) : '')
104
122
  .filter((seq) => seq.length !== 0/* skip empty values for detector */),
105
123
  )];
124
+ last.categoriesSample = categoriesSample;
106
125
 
107
126
  // To collect alphabet freq three strategies can be used:
108
127
  // as chars, as fasta (single or within square brackets), as with the separator.
@@ -147,31 +166,47 @@ class BioPackageDetectors extends DG.Package {
147
166
  return res;
148
167
  // return isUrlRe.test(s);
149
168
  };
150
- const isUrl = categoriesSample.every((v) => { return !v || isUrlCheck(v); });
151
- if (isUrl) return null;
169
+ const isUrl = categoriesSample.every((v) => !v || isUrlCheck(v));
170
+ if (isUrl) {
171
+ last.rejectReason = 'URL detected.';
172
+ return null;
173
+ }
152
174
 
153
175
  // TODO: Detect HELM sequence
154
176
  // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
155
177
  const statsAsChars = this.getStats(categoriesSample, seqMinLength,
156
178
  this.getSplitterAsChars(SEQ_SAMPLE_LENGTH_LIMIT));
157
179
  // Empty statsAsChars.freq alphabet means no strings of enough length are present in the data
158
- if (Object.keys(statsAsChars.freq).length === 0) return null;
180
+ if (Object.keys(statsAsChars.freq).length === 0) {
181
+ last.rejectReason = 'Monomer set (alphabet) is empty.';
182
+ return null;
183
+ }
159
184
 
160
185
  const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null, colNameLikely ? -0.05 : 0);
161
- if (decoy !== ALPHABET.UN) return null;
186
+ if (decoy !== ALPHABET.UN) {
187
+ last.rejectReason = `Decoy alphabet '${decoy}' detected.`;
188
+ return null;
189
+ }
162
190
 
163
- const separator = this.detectSeparator(statsAsChars.freq, categoriesSample);
164
- if (this.checkForbiddenSeparator(separator)) return null;
191
+ const separator = this.detectSeparator(statsAsChars.freq, categoriesSample, seqMinLength);
192
+ const checkForbiddenSeparatorRes = this.checkForbiddenSeparator(separator);
193
+ if (checkForbiddenSeparatorRes) {
194
+ last.rejectReason = `Separator '${separator}' is forbidden.`;
195
+ return null;
196
+ }
165
197
 
166
198
  const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
167
199
  const gapSymbol = separator ? '' : '-';
168
200
  const splitter = separator ? this.getSplitterWithSeparator(separator, SEQ_SAMPLE_LENGTH_LIMIT) :
169
201
  this.getSplitterAsFasta(SEQ_SAMPLE_LENGTH_LIMIT);
170
202
 
171
- if (statsAsChars.sameLength) {
203
+ if (statsAsChars.sameLength) { // MSA FASTA single character
172
204
  const stats = this.getStats(categoriesSample, seqMinLength, splitter);
173
205
  const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-', colNameLikely ? 0.20 : 0);
174
- if (alphabet === ALPHABET.UN) return null;
206
+ if (alphabet === ALPHABET.UN) {
207
+ last.rejectReason = `MSA FASTA single character alphabet is not allowed to be 'UN'.`;
208
+ return null;
209
+ }
175
210
 
176
211
  col.setTag(DG.TAGS.UNITS, units);
177
212
  if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
@@ -186,22 +221,38 @@ class BioPackageDetectors extends DG.Package {
186
221
  const stats = this.getStats(categoriesSample, seqMinLength, splitter);
187
222
  const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
188
223
  // Empty monomer alphabet is not allowed
189
- if (Object.keys(stats.freq).length === 0) return null;
190
- // Long monomer names for sequences with separators have constraints
191
- if (
192
- ((units === NOTATION.SEPARATOR || (units === NOTATION.FASTA && alphabetIsMultichar)) &&
193
- this.checkForbiddenMultichar(stats.freq)) ||
194
- ((units === NOTATION.FASTA && !alphabetIsMultichar) &&
195
- this.checkForbiddenSinglechar(stats.freq))
196
- ) {
224
+ if (Object.keys(stats.freq).length === 0) {
225
+ last.rejectReason = 'Monomer set (alphabet) is empty';
197
226
  return null;
198
227
  }
228
+ // Single- and multi-char monomer names for sequences with separators have constraints
229
+ if (units === NOTATION.SEPARATOR || (units === NOTATION.FASTA && alphabetIsMultichar)) {
230
+ const badSet = this.checkBadMultichar(stats.freq);
231
+ const [badCount, allCount] = this.calcBad(stats.freq, badSet);
232
+ if (badCount / allCount > maxBadRatio) {
233
+ last.rejectReason = `Forbidden multi-char monomers: ` +
234
+ `${wu(badSet.keys()).map((m) => `'${m}'`).toArray().join(', ')}`;
235
+ return null;
236
+ }
237
+ }
238
+ if (units === NOTATION.FASTA && !alphabetIsMultichar) {
239
+ const badSet = this.checkBadSinglechar(stats.freq);
240
+ const [badCount, allCount] = this.calcBad(stats.freq, badSet);
241
+ if (badCount / allCount > maxBadRatio) {
242
+ last.rejectReason = `Forbidden single-char monomers: ` +
243
+ `${wu(badSet.keys()).map((m) => `'${m}'`).toArray().join(', ')}`;
244
+ return null;
245
+ }
246
+ }
199
247
 
200
248
  const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
201
249
 
202
250
  // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
203
251
  const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol, colNameLikely ? 0.15 : 0);
204
- if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar) return null;
252
+ if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar) {
253
+ last.rejectReason = `FASTA single character alphabet is not allowed to be 'UN'.`;
254
+ return null;
255
+ }
205
256
 
206
257
  // const forbidden = this.checkForbiddenWoSeparator(stats.freq);
207
258
  col.setTag(DG.TAGS.UNITS, units);
@@ -216,11 +267,12 @@ class BioPackageDetectors extends DG.Package {
216
267
  return DG.SEMTYPE.MACROMOLECULE;
217
268
  }
218
269
  } catch (err) {
219
- let errMsg = err instanceof Error ? err.message : err.toString();
270
+ const errMsg = err instanceof Error ? err.message : err.toString();
271
+ const errStack = err instanceof Error ? err.stack : undefined;
220
272
  const colTops = wu.count(0).take(Math.max(col.length, 4)).map((rowI) => col.get(rowI))
221
273
  .reduce((a, b) => a === undefined ? b : a + '\n' + b, undefined);
222
- errMsg += `\n${colTops}`;
223
- console.error(`Bio: detectMacromolecule( table: ${tableName}.${col.name} ), error:\n${errMsg}`);
274
+ console.error(`Bio: detectMacromolecule( table: ${tableName}.${col.name} ), error:\n${errMsg}` +
275
+ `${errStack ? '\n' + errStack : ''}` + `\n${colTops}`);
224
276
  } finally {
225
277
  const t2 = Date.now();
226
278
  console.debug(`Bio: detectMacromolecule( table: ${tableName}.${col.name} ), ` + `ET = ${t2 - t1} ms.`);
@@ -232,7 +284,7 @@ class BioPackageDetectors extends DG.Package {
232
284
  * @param freq Dictionary of characters freqs
233
285
  * @param categoriesSample A string array of sampled sequences
234
286
  */
235
- detectSeparator(freq, categoriesSample) {
287
+ detectSeparator(freq, categoriesSample, seqMinLength) {
236
288
  // To detect a separator we analyze col's sequences character frequencies.
237
289
  // If there is an exceptionally frequent symbol, then we will call it the separator.
238
290
  // The most frequent symbol should occur with a rate of at least 0.15
@@ -261,7 +313,7 @@ class BioPackageDetectors extends DG.Package {
261
313
 
262
314
  // Splitter with separator test application
263
315
  const splitter = this.getSplitterWithSeparator(sep, SEQ_SAMPLE_LENGTH_LIMIT);
264
- const stats = this.getStats(categoriesSample, 0, splitter);
316
+ const stats = this.getStats(categoriesSample, seqMinLength, splitter);
265
317
  // TODO: Test for Gamma/Erlang distribution
266
318
  const totalMonomerCount = wu(Object.values(stats.freq)).reduce((sum, a) => sum + a, 0);
267
319
  const mLengthAvg = wu.entries(stats.freq)
@@ -285,19 +337,28 @@ class BioPackageDetectors extends DG.Package {
285
337
  return forbiddenSepRe.test(separator);
286
338
  }
287
339
 
288
- /** Spaces, dots and colons are nor allowed in multichar monomer names.
289
- * The monomer name/label cannot contain digits only.
340
+ /** Dots and colons are not allowed in multichar monomer names (but space is allowed).
341
+ * The monomer name/label cannot contain digits only (but single digit is allowed).
290
342
  */
291
- checkForbiddenMultichar(freq) {
292
- const forbiddenRe = /[ .:]|^\d+$/i;
293
- const forbiddenMonomerList = Object.keys(freq).filter((m) => forbiddenRe.test(m));
294
- return forbiddenMonomerList.length > 0;
343
+ checkBadMultichar(freq) {
344
+ const badRe = /[ .:]|^\d+$/i;
345
+ return new Set(Object.keys(freq).filter((m) => badRe.test(m)));
295
346
  }
296
347
 
297
348
  /** Space, dot, colon, semicolon, digit, underscore are not allowed as single char monomer names.*/
298
- checkForbiddenSinglechar(freq) {
299
- const forbiddenRe = /[ .:;\d_]/i;
300
- return Object.keys(freq).some((m) => forbiddenRe.test(m));
349
+ checkBadSinglechar(freq) {
350
+ const badRe = /[ .:;\d_]/i;
351
+ return new Set(Object.keys(freq).filter((m) => badRe.test(m)));
352
+ }
353
+
354
+ calcBad(freq, forbiddenSet) {
355
+ let allCount = 0;
356
+ let forbiddenCount = 0;
357
+ for (const [m, count] of Object.entries(freq)) {
358
+ if (forbiddenSet.has(m)) forbiddenCount += freq[m];
359
+ allCount += freq[m];
360
+ }
361
+ return [forbiddenCount, allCount];
301
362
  }
302
363
 
303
364
  // /** Without a separator, special symbols or digits are not allowed as monomers. */
@@ -397,22 +458,12 @@ class BioPackageDetectors extends DG.Package {
397
458
  };
398
459
  }
399
460
 
400
- getSplitterWithSeparator(separator, lengthLimit) {
461
+ getSplitterWithSeparator(separator, limit) {
401
462
  return function(seq) {
402
- // if (!!lengthLimit) {
403
- // const res = new Array(lengthLimit);
404
- // let pos = 0, count = 0;
405
- // while (pos < seq.length && count < lengthLimit) {
406
- // const newPos = seq.indexOf(separator, pos);
407
- // res[count] = seq.substring(pos, newPos);
408
- // count++;
409
- // pos = newPos;
410
- // }
411
- //
412
- // return res.slice(0, count);
413
- // } else {
414
- return seq.split(separator, lengthLimit);
415
- // }
463
+ const seq1 = !seq ? '' :
464
+ (seq.startsWith('"') && seq.endsWith('"')) ? seq.slice(1, -1).replaceAll('""-""', '') :
465
+ seq;
466
+ return !seq1 ? [] : seq1.split(separator, limit);
416
467
  };
417
468
  }
418
469