npm - @datagrok/bio - Versions diffs - 2.5.0 → 2.6.0 - Mend

@datagrok/bio 2.5.0 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/detectors.js +23 -5
package/dist/package-test.js +1 -1
package/dist/package-test.js.map +1 -1
package/dist/package.js +1 -1
package/dist/package.js.map +1 -1
package/package.json +2 -2
package/src/package-test.ts +1 -3
package/src/tests/WebLogo-positions-test.ts +10 -14
package/src/tests/detectors-benchmark-tests.ts +9 -9
package/src/tests/detectors-tests.ts +30 -0
package/src/tests/msa-tests.ts +4 -4
package/src/tests/renderers-test.ts +26 -44
package/src/tests/similarity-diversity-tests.ts +35 -52
package/src/tests/splitters-test.ts +16 -6
package/src/tests/utils/sequences-generators.ts +7 -3
package/src/tests/viewers.ts +3 -7
package/src/utils/convert.ts +0 -2
package/src/viewers/web-logo-viewer.ts +35 -32

package/detectors.js CHANGED Viewed

@@ -158,7 +158,7 @@ class BioPackageDetectors extends DG.Package {
       const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null, colNameLikely ? -0.05 : 0);
       if (decoy !== ALPHABET.UN) return null;
-      const separator = this.detectSeparator(statsAsChars.freq);
+      const separator = this.detectSeparator(statsAsChars.freq, categoriesSample);
       if (this.checkForbiddenSeparator(separator)) return null;
       const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
@@ -227,8 +227,10 @@ class BioPackageDetectors extends DG.Package {
   /** Detects the most frequent char with a rate of at least 0.15 of others in sum.
    * Does not use any splitting strategies, estimates just by single characters.
-   * */
-  detectSeparator(freq) {
+   * @param freq Dictionary of characters freqs
+   * @param categoriesSample A string array with a sample of the column's sequences
+   */
+  detectSeparator(freq, categoriesSample) {
     // To detect a separator we analyze col's sequences character frequencies.
     // If there is an exceptionally frequent symbol, then we will call it the separator.
     // The most frequent symbol should occur with a rate of at least 0.15
@@ -254,8 +256,24 @@ class BioPackageDetectors extends DG.Package {
     const sepFreq = freq[sep];
     const otherSumFreq = Object.entries(freq).filter((kv) => kv[0] !== sep)
       .map((kv) => kv[1]).reduce((pSum, a) => pSum + a, 0);
-    const freqThreshold = 3.5 * (1 / Object.keys(freq).length);
-    return sepFreq / otherSumFreq > freqThreshold ? sep : null;
+    // Splitter with separator test application
+    const splitter = this.getSplitterWithSeparator(sep, SEQ_SAMPLE_LENGTH_LIMIT);
+    const stats = this.getStats(categoriesSample, 0, splitter);
+    // TODO: Test for Gamma/Erlang distribution
+    const totalMonomerCount = wu(Object.values(stats.freq)).reduce((sum, a) => sum + a, 0);
+    const mLengthAvg = wu.entries(stats.freq)
+      .reduce((sum, [m, c]) => sum + m.length * c, 0) / totalMonomerCount;
+    const mLengthVarN = Math.sqrt(wu.entries(stats.freq)
+      .reduce((sum, [m, c]) => sum + Math.pow(m.length - mLengthAvg, 2) * c, 0) / (totalMonomerCount - 1),
+    ) / mLengthAvg;
+    const sepRate = sepFreq / (sepFreq + otherSumFreq);
+    const expSepRate = 1 / Object.keys(freq).length; // expected
+    // const freqThreshold = (1 / (Math.log2(Object.keys(freq).length) + 2));
+    return (sepRate / expSepRate > 2.2 && mLengthVarN < 0.7) ||
+    (sepRate / expSepRate > 4) ? sep : null;
   }
   checkForbiddenSeparator(separator) {