@datagrok/bio 2.25.0 → 2.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/detectors.js +26 -12
- package/dist/package-test.js +2 -2
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +2 -2
- package/dist/package.js.map +1 -1
- package/package.json +2 -2
- package/src/utils/monomer-lib/library-file-manager/ui.ts +21 -4
- package/src/utils/seq-helper/seq-handler.ts +15 -6
- package/test-console-output-1.log +785 -773
- package/test-record-1.mp4 +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# Bio changelog
|
|
2
2
|
|
|
3
|
+
## 2.25.1 (2025-10-30)
|
|
4
|
+
|
|
5
|
+
* Rework User lib settings storage to accommodate shortened duplicate preferences
|
|
6
|
+
* Detectors: Improve BILN with SMILES/CHEMS detection
|
|
7
|
+
* Support BILN with SMILES/CHEMS rendering, conversion and Helm converter
|
|
8
|
+
* Monomer libraries: Fix dialogs for adding/removing libraries
|
|
9
|
+
|
|
3
10
|
## 2.25.0 (2025-10-29)
|
|
4
11
|
|
|
5
12
|
* CHEMS and SMILES support in HELM
|
package/detectors.js
CHANGED
|
@@ -90,7 +90,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
90
90
|
}
|
|
91
91
|
|
|
92
92
|
/** Parts of the column name required in the column's name under the detector. It must be in lowercase. */
|
|
93
|
-
likelyColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein', 'pep', 'enumerated'];
|
|
93
|
+
likelyColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein', 'pep', 'enumerated', 'biln'];
|
|
94
94
|
|
|
95
95
|
veryLikelyColNamePartList = ['peptide', 'oligo', 'sequence', 'enumerated',
|
|
96
96
|
'heavy_chain', 'light_chain', 'heay-chain', 'light-chain', 'heavychain', 'lightchain',
|
|
@@ -156,10 +156,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
156
156
|
try {
|
|
157
157
|
const last = this.detectMacromoleculeStoreLast();
|
|
158
158
|
const colName = col.name;
|
|
159
|
+
const colNameLower = colName.toLowerCase();
|
|
159
160
|
const colNameLikely = this.likelyColNamePartList.some(
|
|
160
|
-
(requiredColNamePart) =>
|
|
161
|
+
(requiredColNamePart) => colNameLower.includes(requiredColNamePart));
|
|
161
162
|
const colNameVeryLikely = this.veryLikelyColNamePartList.some(
|
|
162
|
-
(requiredColNamePart) =>
|
|
163
|
+
(requiredColNamePart) => colNameLower.includes(requiredColNamePart));
|
|
163
164
|
const seqMinLength = colNameVeryLikely ? 3 : colNameLikely ? 7 : 10;
|
|
164
165
|
const maxBadRatio = colNameLikely ? 0.05 : 0.005;
|
|
165
166
|
|
|
@@ -172,7 +173,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
172
173
|
const categoriesSample = [...new Set((col.length < SEQ_SAMPLE_LIMIT ?
|
|
173
174
|
wu.count(0).take(Math.min(SEQ_SAMPLE_LIMIT, col.length)).map((rowI) => col.get(rowI)) :
|
|
174
175
|
this.sample(col, SEQ_SAMPLE_LIMIT))
|
|
175
|
-
.map((seq) => !!seq ? seq.substring(0, SEQ_SAMPLE_LENGTH_LIMIT *
|
|
176
|
+
.map((seq) => !!seq ? seq.substring(0, SEQ_SAMPLE_LENGTH_LIMIT * 10) : '')
|
|
176
177
|
.filter((seq) => seq.length !== 0/* skip empty values for detector */),
|
|
177
178
|
)].map((s) => s?.trim());
|
|
178
179
|
last.categoriesSample = categoriesSample;
|
|
@@ -198,11 +199,14 @@ class BioPackageDetectors extends DG.Package {
|
|
|
198
199
|
}
|
|
199
200
|
|
|
200
201
|
//not HELM
|
|
201
|
-
|
|
202
|
+
let hasDots = false;
|
|
203
|
+
let dotIsLikelyBilnSplitter = categoriesSample.every((s) => {
|
|
202
204
|
const parts = s.split('.');
|
|
203
205
|
// each part should be connected
|
|
206
|
+
hasDots = hasDots || parts.length > 1;
|
|
204
207
|
return parts.length == 1 || parts.every((p) => /\(\d{1,2},\d{1,2}\)/g.test(p));
|
|
205
208
|
});
|
|
209
|
+
dotIsLikelyBilnSplitter = dotIsLikelyBilnSplitter && hasDots;
|
|
206
210
|
// if the dot (disallowed character for macromolecules) is likely a biln separator,
|
|
207
211
|
// we can just replace it with '-' and remove all connection parts to help detector detect it as separator
|
|
208
212
|
if (dotIsLikelyBilnSplitter) {
|
|
@@ -257,7 +261,16 @@ class BioPackageDetectors extends DG.Package {
|
|
|
257
261
|
return null;
|
|
258
262
|
}
|
|
259
263
|
|
|
260
|
-
|
|
264
|
+
// for BILN, there might be smiles in there, with bunch of special characters
|
|
265
|
+
let isPossiblyBiln = colNameLower.includes('biln') || dotIsLikelyBilnSplitter;
|
|
266
|
+
if (isPossiblyBiln) {
|
|
267
|
+
for (const symbol of ['@', '$', ';', '*'])
|
|
268
|
+
delete statsAsChars.freq[symbol];
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
const separator = this.detectSeparator(statsAsChars.freq, categoriesSample, seqMinLength, isPossiblyBiln);
|
|
272
|
+
if (separator !== '-')
|
|
273
|
+
isPossiblyBiln = false;
|
|
261
274
|
const checkForbiddenSeparatorRes = this.checkForbiddenSeparator(separator);
|
|
262
275
|
if (checkForbiddenSeparatorRes) {
|
|
263
276
|
last.rejectReason = `Separator '${separator}' is forbidden.`;
|
|
@@ -300,7 +313,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
300
313
|
}
|
|
301
314
|
// Single- and multi-char monomer names for sequences with separators have constraints
|
|
302
315
|
if (units === NOTATION.SEPARATOR || (units === NOTATION.FASTA && alphabetIsMultichar)) {
|
|
303
|
-
const badSymbol /*: string | null*/ = this.checkBadMultichar(stats.freq);
|
|
316
|
+
const badSymbol /*: string | null*/ = this.checkBadMultichar(stats.freq, isPossiblyBiln);
|
|
304
317
|
if (badSymbol) {
|
|
305
318
|
last.rejectReason = `Forbidden multi-char monomer: '${badSymbol}'.`;
|
|
306
319
|
return null;
|
|
@@ -349,7 +362,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
349
362
|
* @param categoriesSample A string array of seqs sample
|
|
350
363
|
* @param seqMinLength A threshold on min seq length for contributing to stats
|
|
351
364
|
*/
|
|
352
|
-
detectSeparator(freq, categoriesSample, seqMinLength) {
|
|
365
|
+
detectSeparator(freq, categoriesSample, seqMinLength, isPossiblyBiln = false) {
|
|
353
366
|
// To detect a separator we analyze col's sequences character frequencies.
|
|
354
367
|
// If there is an exceptionally frequent symbol, then we will call it the separator.
|
|
355
368
|
// The most frequent symbol should occur with a rate of at least 0.15
|
|
@@ -371,7 +384,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
371
384
|
|
|
372
385
|
const maxFreq = Math.max(...Object.values(cleanFreq));
|
|
373
386
|
|
|
374
|
-
const sep = Object.entries(
|
|
387
|
+
const sep = Object.entries(cleanFreq).find(([k, v]) => v === maxFreq)[0];
|
|
375
388
|
const sepFreq = freq[sep];
|
|
376
389
|
const otherSumFreq = Object.entries(freq).filter((kv) => kv[0] !== sep)
|
|
377
390
|
.map((kv) => kv[1]).reduce((pSum, a) => pSum + a, 0);
|
|
@@ -379,7 +392,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
379
392
|
// Splitter with separator test application
|
|
380
393
|
const splitter = this.getSplitterWithSeparator(sep, SEQ_SAMPLE_LENGTH_LIMIT);
|
|
381
394
|
const stats = this.getStats(categoriesSample, seqMinLength, splitter);
|
|
382
|
-
const badSymbol = this.checkBadMultichar(stats.freq);
|
|
395
|
+
const badSymbol = this.checkBadMultichar(stats.freq, isPossiblyBiln);
|
|
383
396
|
if (badSymbol) return null;
|
|
384
397
|
// TODO: Test for Gamma/Erlang distribution
|
|
385
398
|
const totalMonomerCount = wu(Object.values(stats.freq)).reduce((sum, a) => sum + a, 0);
|
|
@@ -407,11 +420,12 @@ class BioPackageDetectors extends DG.Package {
|
|
|
407
420
|
/** Dots and colons are not allowed in multichar monomer names (but space is allowed).
|
|
408
421
|
* The monomer name/label cannot contain digits only (but single digit is allowed).
|
|
409
422
|
*/
|
|
410
|
-
checkBadMultichar(freq) /* : string | null */ {
|
|
423
|
+
checkBadMultichar(freq, isPossiblyBiln = false) /* : string | null */ {
|
|
411
424
|
for (const symbol of Object.keys(freq)) {
|
|
412
425
|
if (symbol && !isNaN(symbol))
|
|
413
426
|
return symbol; // performance evaluated better with RegExp
|
|
414
|
-
|
|
427
|
+
if (isPossiblyBiln && symbol.startsWith('[') && symbol.endsWith(']'))
|
|
428
|
+
continue; // biln monomer smiles can contain forbidden characters within []
|
|
415
429
|
const symbolLen = symbol.length;
|
|
416
430
|
if (this.forbiddenMulticharFirst.includes(symbol[0]))
|
|
417
431
|
return symbol;
|