@datagrok/bio 2.11.3 → 2.11.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/detectors.js +99 -48
- package/dist/196.js +1 -1
- package/dist/196.js.map +1 -1
- package/dist/361.js +1 -1
- package/dist/361.js.map +1 -1
- package/dist/381.js +1 -1
- package/dist/381.js.map +1 -1
- package/dist/770.js +1 -1
- package/dist/770.js.map +1 -1
- package/dist/79.js.map +1 -1
- package/dist/868.js +1 -1
- package/dist/868.js.map +1 -1
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +3 -3
- package/src/analysis/sequence-space.ts +34 -12
- package/src/demo/bio01b-hierarchical-clustering-and-activity-cliffs.ts +2 -1
- package/src/package.ts +51 -29
- package/src/tests/activity-cliffs-tests.ts +5 -3
- package/src/tests/activity-cliffs-utils.ts +5 -2
- package/src/tests/converters-test.ts +72 -72
- package/src/tests/detectors-benchmark-tests.ts +2 -2
- package/src/tests/detectors-tests.ts +36 -36
- package/src/tests/detectors-weak-and-likely-tests.ts +24 -24
- package/src/tests/mm-distance-tests.ts +10 -9
- package/src/tests/units-handler-splitted-tests.ts +33 -36
- package/src/tests/units-handler-tests.ts +9 -9
- package/src/utils/detect-macromolecule-probe.ts +44 -0
- package/src/utils/monomer-lib.ts +4 -9
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
# Bio changelog
|
|
2
2
|
|
|
3
|
+
## 2.12.0 (WIP)
|
|
4
|
+
|
|
5
|
+
### Features
|
|
6
|
+
|
|
7
|
+
### Bug fixes
|
|
8
|
+
* Fix detectMacromolecule allowing double quoted sequences and gaps.
|
|
9
|
+
* Fix for min seq length 10, tests.
|
|
10
|
+
|
|
3
11
|
## 2.11.0 (2023-10-25)
|
|
4
12
|
|
|
5
13
|
### Features
|
|
@@ -8,6 +16,7 @@
|
|
|
8
16
|
* Add ToAtomicLevel for non-linear HELM structures.
|
|
9
17
|
* Add WebLogo aggregation function.
|
|
10
18
|
* Add WebLogo position tooltip with composition table (for count).
|
|
19
|
+
* Add PolyTool with Helm2Molfile support
|
|
11
20
|
|
|
12
21
|
### Bug fixes
|
|
13
22
|
|
package/detectors.js
CHANGED
|
@@ -81,6 +81,19 @@ class BioPackageDetectors extends DG.Package {
|
|
|
81
81
|
s.startsWith('RNA1{') || s.startsWith('DNA1{');
|
|
82
82
|
}
|
|
83
83
|
|
|
84
|
+
//name: detectMacromoleculeEnableStore
|
|
85
|
+
//output: object result
|
|
86
|
+
detectMacromoleculeEnableStore() {
|
|
87
|
+
return window.$detectMacromoleculeStore = {last: null};
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
/** Returns last object (stores it if enabled earlier). */
|
|
91
|
+
detectMacromoleculeStoreLast() {
|
|
92
|
+
const last = {};
|
|
93
|
+
if (window.$detectMacromoleculeStore) window.$detectMacromoleculeStore.last = last;
|
|
94
|
+
return last;
|
|
95
|
+
}
|
|
96
|
+
|
|
84
97
|
//tags: semTypeDetector
|
|
85
98
|
//input: column col
|
|
86
99
|
//output: string semType
|
|
@@ -89,13 +102,18 @@ class BioPackageDetectors extends DG.Package {
|
|
|
89
102
|
console.debug(`Bio: detectMacromolecule( table: ${tableName}.${col.name} ), start`);
|
|
90
103
|
const t1 = Date.now();
|
|
91
104
|
try {
|
|
105
|
+
const last = this.detectMacromoleculeStoreLast();
|
|
92
106
|
const colName = col.name;
|
|
93
107
|
const colNameLikely = this.likelyColNamePartList.some(
|
|
94
108
|
(requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart));
|
|
95
|
-
const seqMinLength = colNameLikely ?
|
|
109
|
+
const seqMinLength = colNameLikely ? 7 : 10;
|
|
110
|
+
const maxBadRatio = colNameLikely ? 0.05 : 0.005;
|
|
96
111
|
|
|
97
112
|
// Fail early
|
|
98
|
-
if (col.type !== DG.TYPE.STRING)
|
|
113
|
+
if (col.type !== DG.TYPE.STRING) {
|
|
114
|
+
last.rejectReason = `The column must be of type '${DG.TYPE.STRING}'.`;
|
|
115
|
+
return null;
|
|
116
|
+
}
|
|
99
117
|
|
|
100
118
|
const categoriesSample = [...new Set((col.length < SEQ_SAMPLE_LIMIT ?
|
|
101
119
|
wu.count(0).take(Math.min(SEQ_SAMPLE_LIMIT, col.length)).map((rowI) => col.get(rowI)) :
|
|
@@ -103,6 +121,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
103
121
|
.map((seq) => !!seq ? seq.substring(0, SEQ_SAMPLE_LENGTH_LIMIT * 5) : '')
|
|
104
122
|
.filter((seq) => seq.length !== 0/* skip empty values for detector */),
|
|
105
123
|
)];
|
|
124
|
+
last.categoriesSample = categoriesSample;
|
|
106
125
|
|
|
107
126
|
// To collect alphabet freq three strategies can be used:
|
|
108
127
|
// as chars, as fasta (single or within square brackets), as with the separator.
|
|
@@ -147,31 +166,47 @@ class BioPackageDetectors extends DG.Package {
|
|
|
147
166
|
return res;
|
|
148
167
|
// return isUrlRe.test(s);
|
|
149
168
|
};
|
|
150
|
-
const isUrl = categoriesSample.every((v) =>
|
|
151
|
-
if (isUrl)
|
|
169
|
+
const isUrl = categoriesSample.every((v) => !v || isUrlCheck(v));
|
|
170
|
+
if (isUrl) {
|
|
171
|
+
last.rejectReason = 'URL detected.';
|
|
172
|
+
return null;
|
|
173
|
+
}
|
|
152
174
|
|
|
153
175
|
// TODO: Detect HELM sequence
|
|
154
176
|
// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
|
|
155
177
|
const statsAsChars = this.getStats(categoriesSample, seqMinLength,
|
|
156
178
|
this.getSplitterAsChars(SEQ_SAMPLE_LENGTH_LIMIT));
|
|
157
179
|
// Empty statsAsChars.freq alphabet means no strings of sufficient length are present in the data
|
|
158
|
-
if (Object.keys(statsAsChars.freq).length === 0)
|
|
180
|
+
if (Object.keys(statsAsChars.freq).length === 0) {
|
|
181
|
+
last.rejectReason = 'Monomer set (alphabet) is empty.';
|
|
182
|
+
return null;
|
|
183
|
+
}
|
|
159
184
|
|
|
160
185
|
const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null, colNameLikely ? -0.05 : 0);
|
|
161
|
-
if (decoy !== ALPHABET.UN)
|
|
186
|
+
if (decoy !== ALPHABET.UN) {
|
|
187
|
+
last.rejectReason = `Decoy alphabet '${decoy}' detected.`;
|
|
188
|
+
return null;
|
|
189
|
+
}
|
|
162
190
|
|
|
163
|
-
const separator = this.detectSeparator(statsAsChars.freq, categoriesSample);
|
|
164
|
-
|
|
191
|
+
const separator = this.detectSeparator(statsAsChars.freq, categoriesSample, seqMinLength);
|
|
192
|
+
const checkForbiddenSeparatorRes = this.checkForbiddenSeparator(separator);
|
|
193
|
+
if (checkForbiddenSeparatorRes) {
|
|
194
|
+
last.rejectReason = `Separator '${separator}' is forbidden.`;
|
|
195
|
+
return null;
|
|
196
|
+
}
|
|
165
197
|
|
|
166
198
|
const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
|
|
167
199
|
const gapSymbol = separator ? '' : '-';
|
|
168
200
|
const splitter = separator ? this.getSplitterWithSeparator(separator, SEQ_SAMPLE_LENGTH_LIMIT) :
|
|
169
201
|
this.getSplitterAsFasta(SEQ_SAMPLE_LENGTH_LIMIT);
|
|
170
202
|
|
|
171
|
-
if (statsAsChars.sameLength) {
|
|
203
|
+
if (statsAsChars.sameLength) { // MSA FASTA single character
|
|
172
204
|
const stats = this.getStats(categoriesSample, seqMinLength, splitter);
|
|
173
205
|
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-', colNameLikely ? 0.20 : 0);
|
|
174
|
-
if (alphabet === ALPHABET.UN)
|
|
206
|
+
if (alphabet === ALPHABET.UN) {
|
|
207
|
+
last.rejectReason = `MSA FASTA single character alphabet is not allowed to be 'UN'.`;
|
|
208
|
+
return null;
|
|
209
|
+
}
|
|
175
210
|
|
|
176
211
|
col.setTag(DG.TAGS.UNITS, units);
|
|
177
212
|
if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
|
|
@@ -186,22 +221,38 @@ class BioPackageDetectors extends DG.Package {
|
|
|
186
221
|
const stats = this.getStats(categoriesSample, seqMinLength, splitter);
|
|
187
222
|
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
188
223
|
// Empty monomer alphabet is not allowed
|
|
189
|
-
if (Object.keys(stats.freq).length === 0)
|
|
190
|
-
|
|
191
|
-
if (
|
|
192
|
-
((units === NOTATION.SEPARATOR || (units === NOTATION.FASTA && alphabetIsMultichar)) &&
|
|
193
|
-
this.checkForbiddenMultichar(stats.freq)) ||
|
|
194
|
-
((units === NOTATION.FASTA && !alphabetIsMultichar) &&
|
|
195
|
-
this.checkForbiddenSinglechar(stats.freq))
|
|
196
|
-
) {
|
|
224
|
+
if (Object.keys(stats.freq).length === 0) {
|
|
225
|
+
last.rejectReason = 'Monomer set (alphabet) is empty';
|
|
197
226
|
return null;
|
|
198
227
|
}
|
|
228
|
+
// Single- and multi-char monomer names for sequences with separators have constraints
|
|
229
|
+
if (units === NOTATION.SEPARATOR || (units === NOTATION.FASTA && alphabetIsMultichar)) {
|
|
230
|
+
const badSet = this.checkBadMultichar(stats.freq);
|
|
231
|
+
const [badCount, allCount] = this.calcBad(stats.freq, badSet);
|
|
232
|
+
if (badCount / allCount > maxBadRatio) {
|
|
233
|
+
last.rejectReason = `Forbidden multi-char monomers: ` +
|
|
234
|
+
`${wu(badSet.keys()).map((m) => `'${m}'`).toArray().join(', ')}`;
|
|
235
|
+
return null;
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
if (units === NOTATION.FASTA && !alphabetIsMultichar) {
|
|
239
|
+
const badSet = this.checkBadSinglechar(stats.freq);
|
|
240
|
+
const [badCount, allCount] = this.calcBad(stats.freq, badSet);
|
|
241
|
+
if (badCount / allCount > maxBadRatio) {
|
|
242
|
+
last.rejectReason = `Forbidden single-char monomers: ` +
|
|
243
|
+
`${wu(badSet.keys()).map((m) => `'${m}'`).toArray().join(', ')}`;
|
|
244
|
+
return null;
|
|
245
|
+
}
|
|
246
|
+
}
|
|
199
247
|
|
|
200
248
|
const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
|
|
201
249
|
|
|
202
250
|
// TODO: If separator detected, then extra efforts to detect alphabet are allowed.
|
|
203
251
|
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol, colNameLikely ? 0.15 : 0);
|
|
204
|
-
if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar)
|
|
252
|
+
if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar) {
|
|
253
|
+
last.rejectReason = `FASTA single character alphabet is not allowed to be 'UN'.`;
|
|
254
|
+
return null;
|
|
255
|
+
}
|
|
205
256
|
|
|
206
257
|
// const forbidden = this.checkForbiddenWoSeparator(stats.freq);
|
|
207
258
|
col.setTag(DG.TAGS.UNITS, units);
|
|
@@ -216,11 +267,12 @@ class BioPackageDetectors extends DG.Package {
|
|
|
216
267
|
return DG.SEMTYPE.MACROMOLECULE;
|
|
217
268
|
}
|
|
218
269
|
} catch (err) {
|
|
219
|
-
|
|
270
|
+
const errMsg = err instanceof Error ? err.message : err.toString();
|
|
271
|
+
const errStack = err instanceof Error ? err.stack : undefined;
|
|
220
272
|
const colTops = wu.count(0).take(Math.max(col.length, 4)).map((rowI) => col.get(rowI))
|
|
221
273
|
.reduce((a, b) => a === undefined ? b : a + '\n' + b, undefined);
|
|
222
|
-
|
|
223
|
-
|
|
274
|
+
console.error(`Bio: detectMacromolecule( table: ${tableName}.${col.name} ), error:\n${errMsg}` +
|
|
275
|
+
`${errStack ? '\n' + errStack : ''}` + `\n${colTops}`);
|
|
224
276
|
} finally {
|
|
225
277
|
const t2 = Date.now();
|
|
226
278
|
console.debug(`Bio: detectMacromolecule( table: ${tableName}.${col.name} ), ` + `ET = ${t2 - t1} ms.`);
|
|
@@ -232,7 +284,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
232
284
|
* @param freq Dictionary of characters freqs
|
|
233
285
|
* @param sample A string array of seqs sample
|
|
234
286
|
*/
|
|
235
|
-
detectSeparator(freq, categoriesSample) {
|
|
287
|
+
detectSeparator(freq, categoriesSample, seqMinLength) {
|
|
236
288
|
// To detect a separator we analyze col's sequences character frequencies.
|
|
237
289
|
// If there is an exceptionally frequent symbol, then we will call it the separator.
|
|
238
290
|
// The most frequent symbol should occur with a rate of at least 0.15
|
|
@@ -261,7 +313,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
261
313
|
|
|
262
314
|
// Splitter with separator test application
|
|
263
315
|
const splitter = this.getSplitterWithSeparator(sep, SEQ_SAMPLE_LENGTH_LIMIT);
|
|
264
|
-
const stats = this.getStats(categoriesSample,
|
|
316
|
+
const stats = this.getStats(categoriesSample, seqMinLength, splitter);
|
|
265
317
|
// TODO: Test for Gamma/Erlang distribution
|
|
266
318
|
const totalMonomerCount = wu(Object.values(stats.freq)).reduce((sum, a) => sum + a, 0);
|
|
267
319
|
const mLengthAvg = wu.entries(stats.freq)
|
|
@@ -285,19 +337,28 @@ class BioPackageDetectors extends DG.Package {
|
|
|
285
337
|
return forbiddenSepRe.test(separator);
|
|
286
338
|
}
|
|
287
339
|
|
|
288
|
-
/**
|
|
289
|
-
* The monomer name/label cannot contain digits only.
|
|
340
|
+
/** Dots and colons are not allowed in multichar monomer names (but space is allowed).
|
|
341
|
+
* The monomer name/label cannot contain digits only (but single digit is allowed).
|
|
290
342
|
*/
|
|
291
|
-
|
|
292
|
-
const
|
|
293
|
-
|
|
294
|
-
return forbiddenMonomerList.length > 0;
|
|
343
|
+
checkBadMultichar(freq) {
|
|
344
|
+
const badRe = /[ .:]|^\d+$/i;
|
|
345
|
+
return new Set(Object.keys(freq).filter((m) => badRe.test(m)));
|
|
295
346
|
}
|
|
296
347
|
|
|
297
348
|
/** Space, dot, colon, semicolon, digit, underscore are not allowed as single char monomer names.*/
|
|
298
|
-
|
|
299
|
-
const
|
|
300
|
-
return Object.keys(freq).
|
|
349
|
+
checkBadSinglechar(freq) {
|
|
350
|
+
const badRe = /[ .:;\d_]/i;
|
|
351
|
+
return new Set(Object.keys(freq).filter((m) => badRe.test(m)));
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
calcBad(freq, forbiddenSet) {
|
|
355
|
+
let allCount = 0;
|
|
356
|
+
let forbiddenCount = 0;
|
|
357
|
+
for (const [m, count] of Object.entries(freq)) {
|
|
358
|
+
if (forbiddenSet.has(m)) forbiddenCount += freq[m];
|
|
359
|
+
allCount += freq[m];
|
|
360
|
+
}
|
|
361
|
+
return [forbiddenCount, allCount];
|
|
301
362
|
}
|
|
302
363
|
|
|
303
364
|
// /** Without a separator, special symbols or digits are not allowed as monomers. */
|
|
@@ -397,22 +458,12 @@ class BioPackageDetectors extends DG.Package {
|
|
|
397
458
|
};
|
|
398
459
|
}
|
|
399
460
|
|
|
400
|
-
getSplitterWithSeparator(separator,
|
|
461
|
+
getSplitterWithSeparator(separator, limit) {
|
|
401
462
|
return function(seq) {
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
// const newPos = seq.indexOf(separator, pos);
|
|
407
|
-
// res[count] = seq.substring(pos, newPos);
|
|
408
|
-
// count++;
|
|
409
|
-
// pos = newPos;
|
|
410
|
-
// }
|
|
411
|
-
//
|
|
412
|
-
// return res.slice(0, count);
|
|
413
|
-
// } else {
|
|
414
|
-
return seq.split(separator, lengthLimit);
|
|
415
|
-
// }
|
|
463
|
+
const seq1 = !seq ? '' :
|
|
464
|
+
(seq.startsWith('"') && seq.endsWith('"')) ? seq.slice(1, -1).replaceAll('""-""', '') :
|
|
465
|
+
seq;
|
|
466
|
+
return !seq1 ? [] : seq1.split(separator, limit);
|
|
416
467
|
};
|
|
417
468
|
}
|
|
418
469
|
|