npm - @datagrok/bio - Versions diffs - 2.1.12 → 2.4.2 - Mend

@datagrok/bio 2.1.12 → 2.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

package/README.md +11 -12
package/css/helm.css +10 -0
package/detectors.js +83 -59
package/dist/package-test.js +2 -13168
package/dist/package-test.js.map +1 -0
package/dist/package.js +2 -10560
package/dist/package.js.map +1 -0
package/dockerfiles/Dockerfile +86 -0
package/files/icons/composition-analysis.svg +17 -0
package/files/icons/sequence-diversity-viewer.svg +4 -0
package/files/icons/sequence-similarity-viewer.svg +4 -0
package/files/icons/vdregions-viewer.svg +22 -0
package/files/icons/weblogo-viewer.svg +7 -0
package/files/tests/testUrl.csv +11 -0
package/files/tests/toAtomicLevelTest.csv +4 -0
package/package.json +24 -25
package/src/analysis/sequence-activity-cliffs.ts +11 -9
package/src/analysis/sequence-search-base-viewer.ts +2 -1
package/src/analysis/sequence-similarity-viewer.ts +3 -3
package/src/analysis/sequence-space.ts +2 -1
package/src/calculations/monomerLevelMols.ts +4 -4
package/src/package-test.ts +9 -2
package/src/package.ts +215 -131
package/src/substructure-search/substructure-search.ts +19 -16
package/src/tests/Palettes-test.ts +1 -1
package/src/tests/WebLogo-positions-test.ts +113 -57
package/src/tests/_first-tests.ts +9 -0
package/src/tests/activity-cliffs-tests.ts +8 -7
package/src/tests/activity-cliffs-utils.ts +17 -9
package/src/tests/bio-tests.ts +4 -5
package/src/tests/checkInputColumn-tests.ts +1 -1
package/src/tests/converters-test.ts +52 -17
package/src/tests/detectors-benchmark-tests.ts +3 -2
package/src/tests/detectors-tests.ts +177 -172
package/src/tests/fasta-export-tests.ts +1 -1
package/src/tests/monomer-libraries-tests.ts +34 -0
package/src/tests/pepsea-tests.ts +21 -0
package/src/tests/renderers-test.ts +21 -19
package/src/tests/sequence-space-test.ts +6 -4
package/src/tests/similarity-diversity-tests.ts +4 -4
package/src/tests/splitters-test.ts +4 -5
package/src/tests/substructure-filters-tests.ts +23 -1
package/src/tests/utils/sequences-generators.ts +1 -1
package/src/tests/utils.ts +2 -1
package/src/tests/viewers.ts +16 -0
package/src/utils/cell-renderer.ts +88 -35
package/src/utils/constants.ts +7 -6
package/src/utils/convert.ts +8 -2
package/src/utils/monomer-lib.ts +174 -0
package/src/utils/multiple-sequence-alignment.ts +44 -20
package/src/utils/pepsea.ts +78 -0
package/src/utils/save-as-fasta.ts +2 -1
package/src/utils/ui-utils.ts +15 -3
package/src/viewers/vd-regions-viewer.ts +113 -72
package/src/viewers/web-logo-viewer.ts +1031 -0
package/src/widgets/bio-substructure-filter.ts +38 -24
package/tsconfig.json +71 -72
package/webpack.config.js +4 -11
package/dist/vendors-node_modules_datagrok-libraries_ml_src_workers_dimensionality-reducer_js.js +0 -9039

package/README.md CHANGED Viewed

@@ -4,7 +4,7 @@ Bio is a bioinformatics support [package](https://datagrok.ai/help/develop/devel
 [Datagrok](https://datagrok.ai) platform with an extensive toolset supporting SAR analysis for small molecules
 and antibodies.
-# Notations
+## Notations
 [@datagrok/bio](https://github.com/datagrok-ai/public/tree/master/packages/Bio) can ingest data in multiple file
 formats (such as fasta or csv) and multiple notations for natural and modified residues, aligned and non-aligned forms,
@@ -18,7 +18,7 @@ See:
 * [detectMacromolecule()](../Bio/detectors.js)
 * [class NotationConverter](../../libraries/bio/src/utils/notation-converter.ts)
-# Atomic-Level structures from sequences
+## Atomic-Level structures from sequences
 For linear sequences, the linear form (see the illustration below) of molecules is reproduced. This is useful
 for better visual inspection of sequence and duplex comparison. Structure at atomic level could be saved in available
@@ -34,11 +34,10 @@ See:
 * [getMolfilesFromSeq()](./src/utils/atomic-works.ts)
-# MSA
+## MSA
 For multiple-sequence alignment, Datagrok uses the “kalign” that relies on Wu-Manber string-matching algorithm
-[Lassmann, Timo. _Kalign 3: multiple sequence alignment of large data sets._ **Bioinformatics** (2019).[pdf](
-https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btz795/30314127/btz795.pdf)].
+[Lassmann, Timo. _Kalign 3: multiple sequence alignment of large data sets._ **Bioinformatics** (2019).pdf](https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btz795/30314127/btz795.pdf).
 “kalign” is suited for sequences containing only natural monomers. Sequences of a particular column can be analyzed
 using MSA algorithm available at the top menu. Aligned sequences can be inspected for base composition
 at the position of MSA result.
@@ -52,7 +51,7 @@ See:
 TODO: MSA with PepSeA
-# Splitting to monomers
+## Splitting to monomers
 Splitting to monomers allows splitting aligned sequences in separate monomers.
@@ -62,7 +61,7 @@ See:
 * [splitAlignedSequences()](../../libraries/bio/src/utils/splitter.ts)
-# Web Logo
+## Web Logo
 Web Logo visualizes a graphical representation of multiple sequence alignment (amino acids or nucleotides or
 modified residues with multi-char labels). Each logo consists of stacks of symbols, one for each position
@@ -81,13 +80,13 @@ You can customize the look of the viewer with properties. Properties ```startPos
 allow to display multiple alignment partially. If property  ```startPosition``` (```endPosition```)
 is not specified, then the Logo will be plotted from the first (till the last) position of sequences.
-## General
+### General
 |             |              |
 |-------------|--------------|
 | Right click | Context menu |
-## Properties
+### Properties
 | Property name        | Default  | Description                                                                                                             |
 |----------------------|----------|-------------------------------------------------------------------------------------------------------------------------|
@@ -116,7 +115,7 @@ See also:
 * [Viewers](../../help/visualize/viewers.md)
 * [Table view](../../help/datagrok/table-view.md)
-# Sequence space
+## Sequence space
 Datagrok allows visualizing multidimensional sequence space using a dimensionality reduction approach.
 Several distance-based dimensionality reduction algorithms are available, such as UMAP or t-SNE.
@@ -132,7 +131,7 @@ See:
 * [sequenceSpace()](src/utils/sequence-space.ts)
-# Sequence activity cliffs
+## Sequence activity cliffs
 Activity cliffs tool finds pairs of sequences where small changes in the sequence yield significant
 changes in activity or any other numerical property. open the tool from a top menu by selecting.
@@ -145,4 +144,4 @@ To launch the analysis from the top menu, select Bio | Sequence Activity Cliffs.
 See:
-* [getActivityCliffs()](../../libraries/ml/src/viewers/activity-cliffs.ts)
+* [getActivityCliffs()](../../libraries/ml/src/viewers/activity-cliffs.ts)

package/css/helm.css CHANGED Viewed

@@ -1,3 +1,13 @@
 .d4-g-cell[semType="Macromolecule"] * {
     pointer-events: none !important;
 }
+.helm-substructure-filter {
+    border: 1px solid var(--grey-2);
+    height: 25px;
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    padding-left: 0px;
+    margin-left: 0px;
+}

package/detectors.js CHANGED Viewed

@@ -45,23 +45,28 @@ const isUrlRe = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9
 class BioPackageDetectors extends DG.Package {
-  PeptideFastaAlphabet = new Set([
+  /** Parts of the column name required in the column's name under the detector. It must be in lowercase. */
+  requiredColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein'];
+  peptideFastaAlphabet = new Set([
     'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
     'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T',
     'MeNle', 'MeA', 'MeG', 'MeF',
   ]);
-  DnaFastaAlphabet = new Set(['A', 'C', 'G', 'T']);
+  dnaFastaAlphabet = new Set(['A', 'C', 'G', 'T']);
+  rnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
-  RnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
+  numbersRawAlphabet = new Set(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']);
-  SmilesRawAlphabet = new Set([
+  smilesRawAlphabet = new Set([
     'A', 'B', 'C', 'E', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'Z',
     'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'r', 's', 't', 'u',
     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
     '+', '-', '.', , '/', '\\', '@', '[', ']', '(', ')', '#', '%', '=']);
-  SmartsRawAlphabet = new Set([
+  smartsRawAlphabet = new Set([
     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
     '!', '#', '$', '&', '(', ')', '*', '+', ',', '-', '.', ':', ';', '=', '@', '~', '[', ']',
     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
@@ -83,6 +88,11 @@ class BioPackageDetectors extends DG.Package {
   detectMacromolecule(col) {
     const t1 = Date.now();
     try {
+      const colName = col.name;
+      if (!this.requiredColNamePartList.some(
+        (requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart),
+      )) return null;
       // Fail early
       if (col.type !== DG.TYPE.STRING) return null;
@@ -92,7 +102,7 @@ class BioPackageDetectors extends DG.Package {
       // To collect alphabet freq three strategies can be used:
       // as chars, as fasta (single or within square brackets), as with the separator.
       if (
-        !(col.categories.length == 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category value
+        !(col.categories.length === 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category
         DG.Detector.sampleCategories(col, (s) => this.isHelm(s), 1, SEQ_SAMPLE_LIMIT)
       ) {
         const statsAsHelm = this.getStats(categoriesSample, 2,
@@ -109,14 +119,15 @@ class BioPackageDetectors extends DG.Package {
       }
       const decoyAlphabets = [
-        ['SMILES', this.SmilesRawAlphabet, 0.30],
-        ['SMARTS', this.SmartsRawAlphabet, 0.43],
+        ['NUMBERS', this.numbersRawAlphabet, 0.25],
+        ['SMILES', this.smilesRawAlphabet, 0.25],
+        ['SMARTS', this.smartsRawAlphabet, 0.43],
       ];
       const candidateAlphabets = [
-        [ALPHABET.PT, this.PeptideFastaAlphabet, 0.50],
-        [ALPHABET.DNA, this.DnaFastaAlphabet, 0.55],
-        [ALPHABET.RNA, this.RnaFastaAlphabet, 0.55],
+        [ALPHABET.PT, this.peptideFastaAlphabet, 0.50],
+        [ALPHABET.DNA, this.dnaFastaAlphabet, 0.55],
+        [ALPHABET.RNA, this.rnaFastaAlphabet, 0.55],
       ];
       // Check for url column, maybe it is too heavy check
@@ -142,9 +153,11 @@ class BioPackageDetectors extends DG.Package {
       if (Object.keys(statsAsChars.freq).length === 0) return null;
       const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
-      if (decoy != ALPHABET.UN) return null;
+      if (decoy !== ALPHABET.UN) return null;
       const separator = this.detectSeparator(statsAsChars.freq);
+      if (this.checkForbiddenSeparator(separator)) return null;
       const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
       const gapSymbol = separator ? '' : '-';
       const splitter = separator ? this.getSplitterWithSeparator(separator, SEQ_SAMPLE_LENGTH_LIMIT) :
@@ -162,31 +175,36 @@ class BioPackageDetectors extends DG.Package {
         return DG.SEMTYPE.MACROMOLECULE;
       } else {
         const stats = this.getStats(categoriesSample, 5, splitter);
+        const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
         // Empty monomer alphabet is not allowed
         if (Object.keys(stats.freq).length === 0) return null;
         // Long monomer names for sequences with separators have constraints
-        if (separator && this.checkForbiddenWithSeparators(stats.freq)) return null;
+        if (
+          ((units === NOTATION.SEPARATOR || (units === NOTATION.FASTA && alphabetIsMultichar)) &&
+            this.checkForbiddenMultichar(stats.freq)) ||
+          ((units === NOTATION.FASTA && !alphabetIsMultichar) &&
+            this.checkForbiddenSinglechar(stats.freq))
+        ) return null;
         const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
         // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
         const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
+        if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar) return null;
         // const forbidden = this.checkForbiddenWoSeparator(stats.freq);
-        if (separator || alphabet != 'UN') {
-          col.setTag(DG.TAGS.UNITS, units);
-          if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
-          col.setTag(UnitsHandler.TAGS.aligned, aligned);
-          col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
-          if (alphabet === ALPHABET.UN) {
-            // alphabetSize calculated on (sub)sample of data is incorrect
-            // const alphabetSize = Object.keys(stats.freq).length;
-            const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
-            // col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
-            col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
-          }
-          return DG.SEMTYPE.MACROMOLECULE;
+        col.setTag(DG.TAGS.UNITS, units);
+        if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
+        col.setTag(UnitsHandler.TAGS.aligned, aligned);
+        col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
+        if (alphabet === ALPHABET.UN) {
+          // alphabetSize calculated on (sub)sample of data is incorrect
+          // const alphabetSize = Object.keys(stats.freq).length;
+          const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
+          // col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
+          col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
         }
+        return DG.SEMTYPE.MACROMOLECULE;
       }
     } finally {
       const t2 = Date.now();
@@ -207,15 +225,15 @@ class BioPackageDetectors extends DG.Package {
     // !!! What is the difference between the gap symbol and separator symbol in stats terms?
     // const noSeparatorRe = /[a-z\d]+$/i;
     const noSeparatorChemRe = /[HBCNOFPSKVYI]/i; // Mendeleev's periodic table single char elements
-    const noSeparatorAlphaDigitRe = /[\dA-Z,& _\r\n]/i; // ..., comma, ampersand, space, underscore, CR, LF
+    const noSeparatorAlphaDigitRe = /[\dA-Z]/i;
     const noSeparatorBracketsRe = /[\[\]()<>{}]/i;
     const cleanFreq = Object.assign({}, ...Object.entries(freq)
       .filter(([m, f]) =>
         !noSeparatorChemRe.test(m) && !noSeparatorAlphaDigitRe.test(m) && !noSeparatorBracketsRe.test(m) &&
-        !this.PeptideFastaAlphabet.has(m) &&
-        !this.DnaFastaAlphabet.has(m))
+        !this.peptideFastaAlphabet.has(m) &&
+        !this.dnaFastaAlphabet.has(m))
       .map(([m, f]) => ({[m]: f})));
-    if (Object.keys(cleanFreq).length == 0) return null;
+    if (Object.keys(cleanFreq).length === 0) return null;
     const maxFreq = Math.max(...Object.values(cleanFreq));
@@ -227,12 +245,24 @@ class BioPackageDetectors extends DG.Package {
     return sepFreq / otherSumFreq > freqThreshold ? sep : null;
   }
-  /** With a separator, spaces are nor allowed in monomer names.
+  checkForbiddenSeparator(separator) {
+    // dot, comma, ampersand, space, underscore, CR, LF
+    const forbiddenSepRe = / |\.|,|&|_|\r\n|\n/i;
+    return forbiddenSepRe.test(separator);
+  }
+  /** Spaces, dots and colons are not allowed in multichar monomer names.
    * The monomer name/label cannot contain digits only.
    */
-  checkForbiddenWithSeparators(freq) {
-    const forbiddenRe = /[ ]|^\d+$/i;
-    return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
+  checkForbiddenMultichar(freq) {
+    const forbiddenRe = /[ .:]|^\d+$/i;
+    return Object.keys(freq).some((m) => forbiddenRe.test(m));
+  }
+  /** Space, dot, colon, semicolon, digit, underscore are not allowed as single char monomer names.*/
+  checkForbiddenSinglechar(freq) {
+    const forbiddenRe = /[ .:;\d_]/i;
+    return Object.keys(freq).some((m) => forbiddenRe.test(m));
   }
   // /** Without a separator, special symbols or digits are not allowed as monomers. */
@@ -250,7 +280,8 @@ class BioPackageDetectors extends DG.Package {
     for (const seq of values) {
       const mSeq = splitter(seq);
-      if (firstLength == null) {
+      if (firstLength === null) {
+        //
         firstLength = mSeq.length;
       } else if (mSeq.length !== firstLength) {
         sameLength = false;
@@ -258,9 +289,7 @@ class BioPackageDetectors extends DG.Package {
       if (mSeq.length > minLength) {
         for (const m of mSeq) {
-          if (!(m in freq)) {
-            freq[m] = 0;
-          }
+          if (!(m in freq)) freq[m] = 0;
           freq[m] += 1;
         }
       }
@@ -281,7 +310,7 @@ class BioPackageDetectors extends DG.Package {
     let alphabetName;
     const maxSim = Math.max(...candidatesSims.map((cs) => cs[4] > cs[2] ? cs[4] : -1));
     if (maxSim > 0) {
-      const sim = candidatesSims.find((cs) => cs[4] == maxSim);
+      const sim = candidatesSims.find((cs) => cs[4] === maxSim);
       alphabetName = sim[0];
     } else {
       alphabetName = ALPHABET.UN;
@@ -306,20 +335,19 @@ class BioPackageDetectors extends DG.Package {
   vectorLength(v) {
     let sqrSum = 0;
-    for (let i = 0; i < v.length; i++) {
+    for (let i = 0; i < v.length; i++)
       sqrSum += v[i] * v[i];
-    }
     return Math.sqrt(sqrSum);
   }
   vectorDotProduct(v1, v2) {
-    if (v1.length != v2.length) {
+    if (v1.length !== v2.length)
       throw Error('The dimensionality of the vectors must match');
-    }
     let prod = 0;
-    for (let i = 0; i < v1.length; i++) {
+    for (let i = 0; i < v1.length; i++)
       prod += v1[i] * v2[i];
-    }
     return prod;
   }
@@ -327,7 +355,7 @@ class BioPackageDetectors extends DG.Package {
   getSplitterAsChars(lengthLimit) {
     return function(seq) {
       return seq.split('', lengthLimit);
-    }.bind(this);
+    };
   }
   getSplitterWithSeparator(separator, lengthLimit) {
@@ -346,11 +374,11 @@ class BioPackageDetectors extends DG.Package {
       // } else {
       return seq.split(separator, lengthLimit);
       // }
-    }.bind(this);
+    };
   }
   // Multichar monomer names in square brackets, single char monomers or gap symbol
-  monomerRe = /\[(\w+)\]|(\w)|(-)/g;
+  monomerRe = /\[(\w+)\]|(.)/g;
   /** Split sequence for single character monomers, square brackets multichar monomer names or gap symbol. */
   getSplitterAsFasta(lengthLimit) {
@@ -360,11 +388,11 @@ class BioPackageDetectors extends DG.Package {
         .map((ma) => {
           let mRes;
           const m = ma[0];
-          if (m.length > 1) {
+          if (m.length > 1)
             mRes = ma[1];
-          } else {
+          else
             mRes = m;
-          }
           return mRes;
         }).toArray();
@@ -391,11 +419,10 @@ class BioPackageDetectors extends DG.Package {
       const mmPostProcess = (mm) => {
         this.helmPp1Re.lastIndex = 0;
         const pp1M = this.helmPp1Re.exec(mm);
-        if (pp1M && pp1M.length >= 2) {
+        if (pp1M && pp1M.length >= 2)
           return pp1M[1];
-        } else {
+        else
           return mm;
-        }
       };
       const mmList = inSeq ? inSeq.split('.') : [];
@@ -405,16 +432,13 @@ class BioPackageDetectors extends DG.Package {
   }
   sample(src, n) {
-    if (src.length < n) {
+    if (src.length < n)
       throw new Error('Sample source is less than n requested.');
-    }
     const idxSet = new Set();
     while (idxSet.size < n) {
       const idx = Math.floor(Math.random() * src.length);
-      if (!idxSet.has(idx)) {
-        idxSet.add(idx);
-      }
+      if (!idxSet.has(idx)) idxSet.add(idx);
     }
     return [...idxSet].map((idx) => src[idx]);