@datagrok/bio 2.5.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/detectors.js +23 -5
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +2 -2
- package/src/package-test.ts +1 -3
- package/src/tests/WebLogo-positions-test.ts +10 -14
- package/src/tests/detectors-benchmark-tests.ts +9 -9
- package/src/tests/detectors-tests.ts +30 -0
- package/src/tests/msa-tests.ts +4 -4
- package/src/tests/renderers-test.ts +26 -44
- package/src/tests/similarity-diversity-tests.ts +35 -52
- package/src/tests/splitters-test.ts +16 -6
- package/src/tests/utils/sequences-generators.ts +7 -3
- package/src/tests/viewers.ts +3 -7
- package/src/utils/convert.ts +0 -2
- package/src/viewers/web-logo-viewer.ts +35 -32
package/detectors.js
CHANGED
|
@@ -158,7 +158,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
158
158
|
const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null, colNameLikely ? -0.05 : 0);
|
|
159
159
|
if (decoy !== ALPHABET.UN) return null;
|
|
160
160
|
|
|
161
|
-
const separator = this.detectSeparator(statsAsChars.freq);
|
|
161
|
+
const separator = this.detectSeparator(statsAsChars.freq, categoriesSample);
|
|
162
162
|
if (this.checkForbiddenSeparator(separator)) return null;
|
|
163
163
|
|
|
164
164
|
const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
|
|
@@ -227,8 +227,10 @@ class BioPackageDetectors extends DG.Package {
|
|
|
227
227
|
|
|
228
228
|
/** Detects the most frequent char with a rate of at least 0.15 of others in sum.
|
|
229
229
|
* Does not use any splitting strategies, estimates just by single characters.
|
|
230
|
-
*
|
|
231
|
-
|
|
230
|
+
* @param freq Dictionary of character frequencies
|
|
231
|
+
* @param categoriesSample A string array of sampled sequences
|
|
232
|
+
*/
|
|
233
|
+
detectSeparator(freq, categoriesSample) {
|
|
232
234
|
// To detect a separator we analyze col's sequences character frequencies.
|
|
233
235
|
// If there is an exceptionally frequent symbol, then we will call it the separator.
|
|
234
236
|
// The most frequent symbol should occur with a rate of at least 0.15
|
|
@@ -254,8 +256,24 @@ class BioPackageDetectors extends DG.Package {
|
|
|
254
256
|
const sepFreq = freq[sep];
|
|
255
257
|
const otherSumFreq = Object.entries(freq).filter((kv) => kv[0] !== sep)
|
|
256
258
|
.map((kv) => kv[1]).reduce((pSum, a) => pSum + a, 0);
|
|
257
|
-
|
|
258
|
-
|
|
259
|
+
|
|
260
|
+
// Splitter with separator test application
|
|
261
|
+
const splitter = this.getSplitterWithSeparator(sep, SEQ_SAMPLE_LENGTH_LIMIT);
|
|
262
|
+
const stats = this.getStats(categoriesSample, 0, splitter);
|
|
263
|
+
// TODO: Test for Gamma/Erlang distribution
|
|
264
|
+
const totalMonomerCount = wu(Object.values(stats.freq)).reduce((sum, a) => sum + a, 0);
|
|
265
|
+
const mLengthAvg = wu.entries(stats.freq)
|
|
266
|
+
.reduce((sum, [m, c]) => sum + m.length * c, 0) / totalMonomerCount;
|
|
267
|
+
const mLengthVarN = Math.sqrt(wu.entries(stats.freq)
|
|
268
|
+
.reduce((sum, [m, c]) => sum + Math.pow(m.length - mLengthAvg, 2) * c, 0) / (totalMonomerCount - 1),
|
|
269
|
+
) / mLengthAvg;
|
|
270
|
+
|
|
271
|
+
const sepRate = sepFreq / (sepFreq + otherSumFreq);
|
|
272
|
+
const expSepRate = 1 / Object.keys(freq).length; // expected
|
|
273
|
+
// const freqThreshold = (1 / (Math.log2(Object.keys(freq).length) + 2));
|
|
274
|
+
|
|
275
|
+
return (sepRate / expSepRate > 2.2 && mLengthVarN < 0.7) ||
|
|
276
|
+
(sepRate / expSepRate > 4) ? sep : null;
|
|
259
277
|
}
|
|
260
278
|
|
|
261
279
|
checkForbiddenSeparator(separator) {
|