@datagrok/bio 2.25.0 → 2.25.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  # Bio changelog
2
2
 
3
+ ## 2.25.2 (2025-11-03)
4
+
5
+ * Update Bio Lib API
6
+ * Sequence header: Enable for shorter and non-MSA sequences
7
+ * Monomer manager: Check both capped and uncapped monomers when matching mols with libraries, provide multiple sources for matching
8
+ * Mol-To-Helm converter: Add initial version of Molecules to sequence converter
9
+
10
+ ## 2.25.1 (2025-10-30)
11
+
12
+ * Rework User lib settings storage to accommodate shortened duplicate preferences
13
+ * Detectors: Improve BILN with SMILES/CHEMS detection
14
+ * Support BILN with SMILES/CHEMS rendering, conversion and Helm converter
15
+ * Monomer libraries: Fix dialogs for adding/removing libraries
16
+
3
17
  ## 2.25.0 (2025-10-29)
4
18
 
5
19
  * CHEMS and SMILES support in HELM
package/detectors.js CHANGED
@@ -90,7 +90,7 @@ class BioPackageDetectors extends DG.Package {
90
90
  }
91
91
 
92
92
  /** Parts of the column name required in the column's name under the detector. It must be in lowercase. */
93
- likelyColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein', 'pep', 'enumerated'];
93
+ likelyColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein', 'pep', 'enumerated', 'biln'];
94
94
 
95
95
  veryLikelyColNamePartList = ['peptide', 'oligo', 'sequence', 'enumerated',
96
96
  'heavy_chain', 'light_chain', 'heay-chain', 'light-chain', 'heavychain', 'lightchain',
@@ -156,10 +156,11 @@ class BioPackageDetectors extends DG.Package {
156
156
  try {
157
157
  const last = this.detectMacromoleculeStoreLast();
158
158
  const colName = col.name;
159
+ const colNameLower = colName.toLowerCase();
159
160
  const colNameLikely = this.likelyColNamePartList.some(
160
- (requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart));
161
+ (requiredColNamePart) => colNameLower.includes(requiredColNamePart));
161
162
  const colNameVeryLikely = this.veryLikelyColNamePartList.some(
162
- (requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart));
163
+ (requiredColNamePart) => colNameLower.includes(requiredColNamePart));
163
164
  const seqMinLength = colNameVeryLikely ? 3 : colNameLikely ? 7 : 10;
164
165
  const maxBadRatio = colNameLikely ? 0.05 : 0.005;
165
166
 
@@ -172,7 +173,7 @@ class BioPackageDetectors extends DG.Package {
172
173
  const categoriesSample = [...new Set((col.length < SEQ_SAMPLE_LIMIT ?
173
174
  wu.count(0).take(Math.min(SEQ_SAMPLE_LIMIT, col.length)).map((rowI) => col.get(rowI)) :
174
175
  this.sample(col, SEQ_SAMPLE_LIMIT))
175
- .map((seq) => !!seq ? seq.substring(0, SEQ_SAMPLE_LENGTH_LIMIT * 5) : '')
176
+ .map((seq) => !!seq ? seq.substring(0, SEQ_SAMPLE_LENGTH_LIMIT * 10) : '')
176
177
  .filter((seq) => seq.length !== 0/* skip empty values for detector */),
177
178
  )].map((s) => s?.trim());
178
179
  last.categoriesSample = categoriesSample;
@@ -198,11 +199,14 @@ class BioPackageDetectors extends DG.Package {
198
199
  }
199
200
 
200
201
  //not HELM
201
- const dotIsLikelyBilnSplitter = categoriesSample.every((s) => {
202
+ let hasDots = false;
203
+ let dotIsLikelyBilnSplitter = categoriesSample.every((s) => {
202
204
  const parts = s.split('.');
203
205
  // each part should be connected
206
+ hasDots = hasDots || parts.length > 1;
204
207
  return parts.length == 1 || parts.every((p) => /\(\d{1,2},\d{1,2}\)/g.test(p));
205
208
  });
209
+ dotIsLikelyBilnSplitter = dotIsLikelyBilnSplitter && hasDots;
206
210
  // if the dot (disallowed character for macromolecules) is likely a biln separator,
207
211
  // we can just replace it with '-' and remove all connection parts to help detector detect it as separator
208
212
  if (dotIsLikelyBilnSplitter) {
@@ -257,7 +261,16 @@ class BioPackageDetectors extends DG.Package {
257
261
  return null;
258
262
  }
259
263
 
260
- const separator = this.detectSeparator(statsAsChars.freq, categoriesSample, seqMinLength);
264
+ // for BILN, there might be smiles in there, with bunch of special characters
265
+ let isPossiblyBiln = colNameLower.includes('biln') || dotIsLikelyBilnSplitter;
266
+ if (isPossiblyBiln) {
267
+ for (const symbol of ['@', '$', ';', '*'])
268
+ delete statsAsChars.freq[symbol];
269
+ }
270
+
271
+ const separator = this.detectSeparator(statsAsChars.freq, categoriesSample, seqMinLength, isPossiblyBiln);
272
+ if (separator !== '-')
273
+ isPossiblyBiln = false;
261
274
  const checkForbiddenSeparatorRes = this.checkForbiddenSeparator(separator);
262
275
  if (checkForbiddenSeparatorRes) {
263
276
  last.rejectReason = `Separator '${separator}' is forbidden.`;
@@ -300,7 +313,7 @@ class BioPackageDetectors extends DG.Package {
300
313
  }
301
314
  // Single- and multi-char monomer names for sequences with separators have constraints
302
315
  if (units === NOTATION.SEPARATOR || (units === NOTATION.FASTA && alphabetIsMultichar)) {
303
- const badSymbol /*: string | null*/ = this.checkBadMultichar(stats.freq);
316
+ const badSymbol /*: string | null*/ = this.checkBadMultichar(stats.freq, isPossiblyBiln);
304
317
  if (badSymbol) {
305
318
  last.rejectReason = `Forbidden multi-char monomer: '${badSymbol}'.`;
306
319
  return null;
@@ -349,7 +362,7 @@ class BioPackageDetectors extends DG.Package {
349
362
  * @param categoriesSample A string array of seqs sample
350
363
  * @param seqMinLength A threshold on min seq length for contributing to stats
351
364
  */
352
- detectSeparator(freq, categoriesSample, seqMinLength) {
365
+ detectSeparator(freq, categoriesSample, seqMinLength, isPossiblyBiln = false) {
353
366
  // To detect a separator we analyze col's sequences character frequencies.
354
367
  // If there is an exceptionally frequent symbol, then we will call it the separator.
355
368
  // The most frequent symbol should occur with a rate of at least 0.15
@@ -371,7 +384,7 @@ class BioPackageDetectors extends DG.Package {
371
384
 
372
385
  const maxFreq = Math.max(...Object.values(cleanFreq));
373
386
 
374
- const sep = Object.entries(freq).find(([k, v]) => v === maxFreq)[0];
387
+ const sep = Object.entries(cleanFreq).find(([k, v]) => v === maxFreq)[0];
375
388
  const sepFreq = freq[sep];
376
389
  const otherSumFreq = Object.entries(freq).filter((kv) => kv[0] !== sep)
377
390
  .map((kv) => kv[1]).reduce((pSum, a) => pSum + a, 0);
@@ -379,7 +392,7 @@ class BioPackageDetectors extends DG.Package {
379
392
  // Splitter with separator test application
380
393
  const splitter = this.getSplitterWithSeparator(sep, SEQ_SAMPLE_LENGTH_LIMIT);
381
394
  const stats = this.getStats(categoriesSample, seqMinLength, splitter);
382
- const badSymbol = this.checkBadMultichar(stats.freq);
395
+ const badSymbol = this.checkBadMultichar(stats.freq, isPossiblyBiln);
383
396
  if (badSymbol) return null;
384
397
  // TODO: Test for Gamma/Erlang distribution
385
398
  const totalMonomerCount = wu(Object.values(stats.freq)).reduce((sum, a) => sum + a, 0);
@@ -407,11 +420,12 @@ class BioPackageDetectors extends DG.Package {
407
420
  /** Dots and colons are not allowed in multichar monomer names (but space is allowed).
408
421
  * The monomer name/label cannot contain digits only (but single digit is allowed).
409
422
  */
410
- checkBadMultichar(freq) /* : string | null */ {
423
+ checkBadMultichar(freq, isPossiblyBiln = false) /* : string | null */ {
411
424
  for (const symbol of Object.keys(freq)) {
412
425
  if (symbol && !isNaN(symbol))
413
426
  return symbol; // performance evaluated better with RegExp
414
-
427
+ if (isPossiblyBiln && symbol.startsWith('[') && symbol.endsWith(']'))
428
+ continue; // biln monomer smiles can contain forbidden characters within []
415
429
  const symbolLen = symbol.length;
416
430
  if (this.forbiddenMulticharFirst.includes(symbol[0]))
417
431
  return symbol;