npm - @datagrok/bio - Versions diffs - 2.4.2 → 2.4.3 - Mend

@datagrok/bio 2.4.2 → 2.4.3

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries and as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (8)

package/.eslintrc.json +1 -1
package/detectors.js +21 -17
package/dist/package-test.js +1 -1
package/dist/package-test.js.map +1 -1
package/package.json +1 -1
package/src/package-test.ts +1 -0
package/src/tests/detectors-weak-and-likely-tests.ts +129 -0
package/tsconfig.json +1 -1

package/.eslintrc.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "env": {
     "browser": true,
-    "es2021": true
+    "es2022": true
   },
   "extends": [
     "google"

package/detectors.js CHANGED Viewed

@@ -46,7 +46,7 @@ const isUrlRe = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9
 class BioPackageDetectors extends DG.Package {
   /** Parts of the column name required in the column's name under the detector. It must be in lowercase. */
-  requiredColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein'];
+  likelyColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein'];
   peptideFastaAlphabet = new Set([
     'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
@@ -89,9 +89,9 @@ class BioPackageDetectors extends DG.Package {
     const t1 = Date.now();
     try {
       const colName = col.name;
-      if (!this.requiredColNamePartList.some(
-        (requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart),
-      )) return null;
+      const colNameLikely = this.likelyColNamePartList.some(
+        (requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart));
+      const seqMinLength = colNameLikely ? 3 : 5;
       // Fail early
       if (col.type !== DG.TYPE.STRING) return null;
@@ -147,7 +147,7 @@ class BioPackageDetectors extends DG.Package {
       // TODO: Detect HELM sequence
       // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
-      const statsAsChars = this.getStats(categoriesSample, 5,
+      const statsAsChars = this.getStats(categoriesSample, seqMinLength,
         this.getSplitterAsChars(SEQ_SAMPLE_LENGTH_LIMIT));
       // Empty statsAsShars.freq alphabet means no strings of enough length presented in the data
       if (Object.keys(statsAsChars.freq).length === 0) return null;
@@ -164,17 +164,21 @@ class BioPackageDetectors extends DG.Package {
         this.getSplitterAsFasta(SEQ_SAMPLE_LENGTH_LIMIT);
       if (statsAsChars.sameLength) {
-        const stats = this.getStats(categoriesSample, 5, splitter);
-        const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-');
-        if (alphabet === ALPHABET.UN) return null;
+        const stats = this.getStats(categoriesSample, seqMinLength, splitter);
+        const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-', colNameLikely);
+        if (alphabet === ALPHABET.UN && !colNameLikely) return null;
         col.setTag(DG.TAGS.UNITS, units);
         if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
         col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
         col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
+        if (alphabet === ALPHABET.UN) {
+          const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
+          col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
+        }
         return DG.SEMTYPE.MACROMOLECULE;
       } else {
-        const stats = this.getStats(categoriesSample, 5, splitter);
+        const stats = this.getStats(categoriesSample, seqMinLength, splitter);
         const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
         // Empty monomer alphabet is not allowed
         if (Object.keys(stats.freq).length === 0) return null;
@@ -189,8 +193,9 @@ class BioPackageDetectors extends DG.Package {
         const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
         // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
-        const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
-        if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar) return null;
+        const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol, colNameLikely);
+        /* Likely column name allows detecting 'fasta' notation with 'UN' alphabet, 2023-04-13, atanas, askalkin */
+        if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar && !colNameLikely) return null;
         // const forbidden = this.checkForbiddenWoSeparator(stats.freq);
         col.setTag(DG.TAGS.UNITS, units);
@@ -199,9 +204,7 @@ class BioPackageDetectors extends DG.Package {
         col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
         if (alphabet === ALPHABET.UN) {
           // alphabetSize calculated on (sub)sample of data is incorrect
-          // const alphabetSize = Object.keys(stats.freq).length;
           const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
-          // col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
           col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
         }
         return DG.SEMTYPE.MACROMOLECULE;
@@ -287,7 +290,7 @@ class BioPackageDetectors extends DG.Package {
         sameLength = false;
       }
-      if (mSeq.length > minLength) {
+      if (mSeq.length >= minLength) {
         for (const m of mSeq) {
           if (!(m in freq)) freq[m] = 0;
           freq[m] += 1;
@@ -300,10 +303,11 @@ class BioPackageDetectors extends DG.Package {
   /** Detects alphabet for freq by freq similarity to alphabet monomer set.
    * @param freq       frequencies of monomers in sequence set
    * @param candidates  an array of pairs [name, monomer set]
-   * */
-  detectAlphabet(freq, candidates, gapSymbol) {
+   * @param {boolean} colNameLikely The column name suggests the column is Macromolecule more likely
+   */
+  detectAlphabet(freq, candidates, gapSymbol, colNameLikely = false) {
     const candidatesSims = candidates.map((c) => {
-      const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol);
+      const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol) + (colNameLikely ? 0.15 : 0);
       return [c[0], c[1], c[2], freq, sim];
     });