npm - @datagrok/bio - Versions diffs - 2.11.13 → 2.11.15 - Mend

@datagrok/bio 2.11.13 → 2.11.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54)

package/dist/23.js +2 -0
package/dist/23.js.map +1 -0
package/dist/282.js +2 -0
package/dist/282.js.map +1 -0
package/dist/356.js +2 -0
package/dist/356.js.map +1 -0
package/dist/361.js +1 -1
package/dist/361.js.map +1 -1
package/dist/40.js +2 -0
package/dist/40.js.map +1 -0
package/dist/562.js +2 -0
package/dist/562.js.map +1 -0
package/dist/586.js +2 -0
package/dist/586.js.map +1 -0
package/dist/65.js +2 -0
package/dist/65.js.map +1 -0
package/dist/796.js +2 -0
package/dist/796.js.map +1 -0
package/dist/8473fcbfb6e85ca6c852.wasm +0 -0
package/dist/{931.js → 935.js} +3 -3
package/dist/935.js.map +1 -0
package/dist/9a8fbf37666e32487835.wasm +0 -0
package/dist/package-test.js +1 -1
package/dist/package-test.js.map +1 -1
package/dist/package.js +1 -1
package/dist/package.js.map +1 -1
package/package.json +4 -4
package/scripts/sequence_generator.py +24 -22
package/src/demo/bio05-helm-msa-sequence-space.ts +53 -13
package/src/demo/utils.ts +2 -1
package/src/function-edtiors/split-to-monomers-editor.ts +2 -2
package/src/package.ts +82 -177
package/src/tests/pepsea-tests.ts +1 -1
package/src/tests/sequence-space-utils.ts +6 -2
package/src/tests/similarity-diversity-tests.ts +16 -4
package/src/utils/cell-renderer.ts +1 -1
package/src/utils/helm-to-molfile.ts +1 -1
package/src/utils/monomer-lib.ts +1 -1
package/src/utils/pepsea.ts +16 -1
package/src/viewers/vd-regions-viewer.ts +42 -16
package/src/viewers/web-logo-viewer.ts +40 -20
package/dist/1.js +0 -2
package/dist/1.js.map +0 -1
package/dist/190.js +0 -2
package/dist/190.js.map +0 -1
package/dist/381.js +0 -2
package/dist/381.js.map +0 -1
package/dist/770.js +0 -2
package/dist/770.js.map +0 -1
package/dist/868.js +0 -2
package/dist/868.js.map +0 -1
package/dist/931.js.map +0 -1
package/src/utils/err-info.ts +0 -28
package/dist/{931.js.LICENSE.txt → 935.js.LICENSE.txt} +0 -0

package/package.json CHANGED Viewed

@@ -5,7 +5,7 @@
     "name": "Leonid Stolbov",
     "email": "lstolbov@datagrok.ai"
   },
-  "version": "2.11.13",
+  "version": "2.11.15",
   "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
   "repository": {
     "type": "git",
@@ -34,11 +34,11 @@
   ],
   "dependencies": {
     "@biowasm/aioli": "^3.1.0",
-    "@datagrok-libraries/bio": "^5.39.9",
+    "@datagrok-libraries/bio": "^5.39.10",
     "@datagrok-libraries/chem-meta": "^1.2.1",
-    "@datagrok-libraries/ml": "^6.3.56",
+    "@datagrok-libraries/ml": "^6.3.62",
     "@datagrok-libraries/tutorials": "^1.3.11",
-    "@datagrok-libraries/utils": "^4.1.28",
+    "@datagrok-libraries/utils": "^4.1.34",
     "cash-dom": "^8.0.0",
     "css-loader": "^6.7.3",
     "datagrok-api": "^1.16.0",

package/scripts/sequence_generator.py CHANGED Viewed

@@ -3,25 +3,28 @@
 # description: Create the model peptides/DNA sequences with peptides data
 # language: python
 # tags: template, demo
-# input: int clusters = 5 [Number of superclusters]
-# input: int num_sequences = 50 [Number of sequences in each supercluster]
-# input: int motif_length = 12 [Average length of motif]
-# input: int max_variants_position = 3 [Maximum number of different letters in conservative position in motif]
-# input: int random_length = 3 [Average length of random sequence parts before and after motif]
-# input: int dispersion = 2 [Variation of total sequence length]
-# input: string alphabet_key = 'PT' [Sequence alphabet: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma]
-# input: bool disable_cliffs = False [Disable generation of cliffs]
-# input: double cliff_probability = 0.01 [Probability to make activity cliff of a sequence]
-# input: double cliff_strength = 4.0 [Strength of cliff]
-# input: string fasta_separator = '' {nullable: true}
+# input: int clusters = 5 { caption: Number of clusters; category: Clusters }
+# input: int num_sequences = 50 { caption: Number of sequences in each cluster; category: Clusters }
+# input: int motif_length = 12 { caption: Average length of motif; category: Motif }
+# input: int max_variants_position = 3 { caption: Maximum number of different letters in conservative position in motif; category: Motif }
+# input: int random_length = 3 { caption: Average length of random sequence parts before and after motif; category: Motif }
+# input: int dispersion = 2 { caption: Variation of total sequence length; category: Motif }
+# input: bool enable_cliffs = true { caption: Enable activity cliffs; category: Activity cliffs }
+# input: double cliff_probability = 0.01 { caption: Probability to make activity cliff of a sequence; category: Activity cliffs; format: 0.000}
+# input: double cliff_strength = 4.0 { caption: Strength of cliff; category: Activity cliffs }
+# input: string alphabet_key = "PT" { caption: Sequence alphabet; category: Output format; hint: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma}
+# input: string fasta_separator = "" { caption: Fasta format separator; nullable: true; category: Output format}
+# input: file helm_library_file { caption: HELM library to produce HELM output; nullable: true; category: Output format}
+# input: string helm_connection_mode = "linear" { choices: ["linear", "cyclic", "mixed"]; caption: Peptides connection mode (HELM only); category: Output format}
 # output: dataframe sequences
-"""
-The most simple options set running from command line
-  python sequence_generator.py -c 4 -s 50 > output_file.tsv
-Basic options:
-  -с number of clusters
-  -s cluster size (number of sequences per cluster)
+description="""The utility generates clusters of macromolecule sequences to test SAR fucntionality.
+Each cluster contains randomly generated sequence motif.
+Each sequence has activity - a Gauss-distributed random value.
+All sequences in the cluster has activities from the same distibution.
+The utility can simulate activity cliffs - random changes in the conservative motif letters,
+leading to drastical change in the activity.
 """
 import random
@@ -280,8 +283,7 @@ def alphabet_from_helm(helm_library_file: str) -> Alphabet:
 def parse_command_line_args() -> Any:
     parser = argparse.ArgumentParser(
         prog="MotifSequencesGenerator",
-        description="The program generates set of sequences containing sequence motifs "
-        "for SAR functionality testing",
+        description=description,
         epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
     )
@@ -386,14 +388,14 @@ if not grok:
     random_length = args.random_length
     dispersion = args.dispersion
     alphabet_key = args.alphabet
-    disable_cliffs = args.disable_cliffs
+    enable_cliffs = not args.disable_cliffs
     cliff_probability = args.cliff_probability
     cliff_strength = args.cliff_strength
     fasta_separator = args.fasta_separator
     helm_library_file = args.helm_library_file
     helm_connection_mode = args.helm_connection_mode
-helm_init = "helm_library_file" in globals() and helm_library_file is not None
+helm_init = "helm_library_file" in globals() and helm_library_file is not None and helm_library_file != ''
 if not helm_init:
     alphabet: Alphabet = (
@@ -413,7 +415,7 @@ header, data = generate_sequences(
     random_length,
     dispersion,
     alphabet,
-    not disable_cliffs,
+    enable_cliffs,
     cliff_probability,
     cliff_strength,
 )

package/src/demo/bio05-helm-msa-sequence-space.ts CHANGED Viewed

@@ -2,15 +2,18 @@ import * as grok from 'datagrok-api/grok';
 import * as ui from 'datagrok-api/ui';
 import * as DG from 'datagrok-api/dg';
-import {_package, sequenceSpaceTopMenu} from '../package';
-import {handleError} from './utils';
 import {IWebLogoViewer} from '@datagrok-libraries/bio/src/viewers/web-logo';
-import {pepseaMethods, runPepsea} from '../utils/pepsea';
+import {awaitStatus, DockerContainerStatus} from '@datagrok-libraries/bio/src/utils/docker';
 import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
 import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
 import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
+import {Pepsea, pepseaMethods, runPepsea} from '../utils/pepsea';
+import {sequenceSpaceTopMenu} from '../package';
+import {handleError} from './utils';
+import {_package} from '../package';
 const helmFn: string = 'samples/HELM.csv';
 export async function demoBio05UI(): Promise<void> {
@@ -25,36 +28,73 @@ export async function demoBio05UI(): Promise<void> {
   const msaHelmColName: string = 'msa(HELM)';
   const dimRedMethod: DimReductionMethods = DimReductionMethods.UMAP;
+  const pepseaDcId = (await Pepsea.getDockerContainer()).id;
+  // // region For test: Stop container to test auto-start
+  // await grok.dapi.docker.dockerContainers.stop(pepseaDcId);
+  // await Pepsea.awaitStatus(pepseaDcId, 'stopped', 15000);
+  // // endregion
+  const pepseaDcPromise: Promise<DG.DockerContainer> = Pepsea.getDockerContainer();
+  let pepseaDcStatus: DockerContainerStatus;
+  let pepseaDcStartPromise: Promise<void>;
   try {
     const demoScript = new DemoScript(
       'Helm, MSA, Sequence Space',
       'MSA and composition analysis on Helm data');
     await demoScript
       .step(`Load peptides with non-natural aminoacids in 'HELM' notation`, async () => {
-        view = grok.shell.addTableView(df = await _package.files.readCsv(helmFn));
+        [pepseaDcStatus, df] = await Promise.all([
+          (async () => { return (await pepseaDcPromise).status; })(),
+          _package.files.readCsv(helmFn)
+        ]);
+        view = grok.shell.addTableView(df);
         grok.shell.windows.showContextPanel = false;
         grok.shell.windows.showProperties = false;
+        if (pepseaDcStatus === 'started' || pepseaDcStatus === 'checking') {
+          _package.logger.debug(
+            `demoBio05UI(), PepSeA ('${Pepsea.dcName}') docker container status = '${pepseaDcStatus}'.`);
+          pepseaDcStartPromise = Promise.resolve();
+        } else {
+          _package.logger.warning(
+            `demoBio05UI(), PepSeA ('${Pepsea.dcName}') docker container is trying to start...`);
+          await grok.dapi.docker.dockerContainers.run(pepseaDcId);
+          pepseaDcStartPromise = awaitStatus(pepseaDcId, 'started', 30000, _package.logger);
+        }
       }, {
         description: 'Load dataset with macromolecules of \'Helm\' notation.',
         delay: 2000,
       })
       .step('Align peptides with non-natural aminoacids with PepSeA', async () => {
-        helmCol = df.getCol(helmColName);
-        const method: string = pepseaMethods[0];
-        const gapOpen: number = 1.53;
-        const gapExtend: number = 0;
-        msaHelmCol = (await runPepsea(helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
-        df.columns.add(msaHelmCol);
-        await grok.data.detectSemanticTypes(df);
+        const pi = DG.TaskBarProgressIndicator.create('MSA by PepSeA ...');
+        try {
+          // TODO: Show splash if pepseaDcStartPromise is not resolved still
+          await pepseaDcStartPromise; // throws timeout
+          // Hide splash
+          helmCol = df.getCol(helmColName);
+          const method: string = pepseaMethods[0];
+          const gapOpen: number = 1.53;
+          const gapExtend: number = 0;
+          msaHelmCol = (await runPepsea(helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
+          if (!msaHelmCol)
+            throw new Error(`Empty MSA result.`);
+          df.columns.add(msaHelmCol);
+          await grok.data.detectSemanticTypes(df);
+        } finally {
+          pi.close();
+        }
       }, {
         // eslint-disable-next-line max-len
         description: 'Multiple sequence alignment (MSA) performed with PepSeA tool operating on non-natural aminoacids as well.',
         delay: 2000,
       })
       .step('Build sequence space', async () => {
+        const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
         ssViewer = (await sequenceSpaceTopMenu(df, msaHelmCol,
-          dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN, true)) as DG.ScatterPlotViewer;
+          dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN, true, preprocessingFunc)) as DG.ScatterPlotViewer;
         view.dockManager.dock(ssViewer, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
       }, {
         description: 'Reduce sequence space dimensionality to display on 2D representation.',

package/src/demo/utils.ts CHANGED Viewed

@@ -63,8 +63,9 @@ export async function demoSequenceSpace(
       'lassoTool': true,
     })) as DG.ScatterPlotViewer;
   } else {
+    const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
     resSpaceViewer = (await sequenceSpaceTopMenu(df, df.getCol(colName),
-      DimReductionMethods.UMAP, MmDistanceFunctionsNames.LEVENSHTEIN, true)) as DG.ScatterPlotViewer;
+      DimReductionMethods.UMAP, MmDistanceFunctionsNames.LEVENSHTEIN, true, preprocessingFunc)) as DG.ScatterPlotViewer;
   }
   view.dockManager.dock(resSpaceViewer!, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
   return resSpaceViewer;

package/src/function-edtiors/split-to-monomers-editor.ts CHANGED Viewed

@@ -6,7 +6,7 @@ export class SplitToMonomersFunctionEditor {
   tableInput: DG.InputBase;
   seqColInput: DG.InputBase;
-  funcParamsDiv: HTMLDivElement;
+  funcParamsDiv: HTMLElement;
   get funcParams(): {} {
     return {
@@ -15,7 +15,7 @@ export class SplitToMonomersFunctionEditor {
     };
   }
-  get paramsUI(): HTMLDivElement {
+  get paramsUI(): HTMLElement {
     return this.funcParamsDiv;
   }

package/src/package.ts CHANGED Viewed

@@ -9,12 +9,15 @@ import {removeEmptyStringRows} from '@datagrok-libraries/utils/src/dataframe-uti
 import {Options} from '@datagrok-libraries/utils/src/type-declarations';
 import {DimReductionMethods, ITSNEOptions, IUMAPOptions} from '@datagrok-libraries/ml/src/reduce-dimensionality';
 import {SequenceSpaceFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/seq-space-editor';
+import {DimReductionBaseEditor, PreprocessFunctionReturnType}
+  from '@datagrok-libraries/ml/src/functionEditors/dimensionality-reduction-editor';
+import {reduceDimensionality} from '@datagrok-libraries/ml/src/functionEditors/dimensionality-reducer';
 import {ActivityCliffsFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/activity-cliffs-editor';
 import {
   ISequenceSpaceParams, getActivityCliffs, SequenceSpaceFunc, CLIFFS_COL_ENCODE_FN
 } from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
 import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
-import {BitArrayMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
+import {BitArrayMetrics, KnownMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
 import {
   TAGS as bioTAGS, ALPHABET, NOTATION,
 } from '@datagrok-libraries/bio/src/utils/macromolecule';
@@ -27,6 +30,7 @@ import {SCORE, calculateScores} from '@datagrok-libraries/bio/src/utils/macromol
 import {
   createJsonMonomerLibFromSdf, IMonomerLibHelper
 } from '@datagrok-libraries/bio/src/monomer-works/monomer-utils';
+import {errInfo} from '@datagrok-libraries/bio/src/utils/err-info';
 import {getMacromoleculeColumns} from './utils/ui-utils';
 import {
@@ -74,7 +78,6 @@ import {getRegionDo} from './utils/get-region';
 import {GetRegionApp} from './apps/get-region-app';
 import {GetRegionFuncEditor} from './utils/get-region-func-editor';
 import {sequenceToMolfile} from './utils/sequence-to-mol';
-import {errInfo} from './utils/err-info';
 import {detectMacromoleculeProbeDo} from './utils/detect-macromolecule-probe';
 import {SHOW_SCATTERPLOT_PROGRESS} from '@datagrok-libraries/ml/src/functionEditors/seq-space-base-editor';
@@ -230,11 +233,21 @@ export function SplitToMonomersEditor(call: DG.FuncCall): void {
 //tags: editor
 //input: funccall call
 export function SequenceSpaceEditor(call: DG.FuncCall) {
-  const funcEditor = new SequenceSpaceFunctionEditor(DG.SEMTYPE.MACROMOLECULE);
+  const funcEditor = new DimReductionBaseEditor({semtype: DG.SEMTYPE.MACROMOLECULE});
   ui.dialog({title: 'Sequence Space'})
-    .add(funcEditor.paramsUI)
+    .add(funcEditor.getEditor())
     .onOK(async () => {
-      return call.func.prepare(funcEditor.funcParams).call();
+      const params = funcEditor.getParams();
+      return call.func.prepare({
+        molecules: params.col,
+        table: params.table,
+        methodName: params.methodName,
+        similarityMetric: params.similarityMetric,
+        plotEmbeddings: params.plotEmbeddings,
+        options: params.options,
+        preprocessingFunction: params.preprocessingFunction,
+        clusterEmbeddings: params.clusterEmbeddings,
+      }).call();
     })
     .show();
 }
@@ -477,6 +490,48 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
   }).finally(() => { pi.close(); });
 }
+//name: Encode Sequences
+//tags: dim-red-preprocessing-function
+//meta.supportedSemTypes: Macromolecule
+//meta.supportedTypes: string
+//meta.supportedUnits: fasta,separator,helm
+//meta.supportedDistanceFunctions: Hamming,Levenshtein,Monomer chemical distance,Needlemann-Wunsch
+//input: column col {semType: Macromolecule}
+//input: string metric
+//output: object result
+export async function macromoleculePreprocessingFunction(
+  col: DG.Column, metric: MmDistanceFunctionsNames): Promise<PreprocessFunctionReturnType> {
+  const {seqList, options} = await getEncodedSeqSpaceCol(col, metric);
+  return {entries: seqList, options};
+}
+//name: Helm Fingerprints
+//tags: dim-red-preprocessing-function
+//meta.supportedSemTypes: Macromolecule
+//meta.supportedTypes: string
+//meta.supportedUnits: helm
+//meta.supportedDistanceFunctions: Tanimoto,Asymmetric,Cosine,Sokal
+//input: column col {semType: Macromolecule}
+//input: string _metric
+//output: object result
+export async function helmPreprocessingFunction(
+  col: DG.Column<string>, _metric: BitArrayMetrics): Promise<PreprocessFunctionReturnType> {
+  if (col.version !== col.temp[MONOMERIC_COL_TAGS.LAST_INVALIDATED_VERSION])
+    await invalidateMols(col, false);
+  const molCol = col.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS];
+  const fingerPrints: DG.Column<DG.BitSet | null> =
+    await grok.functions.call('Chem:getMorganFingerprints', {molColumn: molCol});
+  const entries: Array<BitArray | null> = new Array(fingerPrints.length).fill(null);
+  for (let i = 0; i < fingerPrints.length; i++) {
+    if (fingerPrints.isNone(i) || !fingerPrints.get(i))
+      continue;
+    const fp = fingerPrints.get(i)!;
+    entries[i] = BitArray.fromUint32Array(fp.length, new Uint32Array(fp.getBuffer().buffer));
+  }
+  return {entries, options: {}};
+}
 //top-menu: Bio | Analyze | Sequence Space...
 //name: Sequence Space
 //description: Creates 2D sequence space with projected sequences by pairwise distance
@@ -485,182 +540,27 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
 //input: string methodName { choices:["UMAP", "t-SNE"] }
 //input: string similarityMetric { choices:["Hamming", "Levenshtein", "Monomer chemical distance"] }
 //input: bool plotEmbeddings = true
-//input: double sparseMatrixThreshold = 0 [Similarity Threshold for sparse matrix calculation]
+//input: func preprocessingFunction {optional: true}
 //input: object options {optional: true}
+//input: bool clusterEmbeddings = true { optional: true }
 //output: viewer result
 //editor: Bio:SequenceSpaceEditor
-export async function sequenceSpaceTopMenu(
-  table: DG.DataFrame, macroMolecule: DG.Column, methodName: DimReductionMethods,
-  similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames = MmDistanceFunctionsNames.LEVENSHTEIN,
-  plotEmbeddings: boolean, sparseMatrixThreshold?: number, options?: (IUMAPOptions | ITSNEOptions) & Options,
-): Promise<DG.Viewer | undefined> {
-  // Delay is required for initial function dialog to close before starting invalidating of molfiles.
-  // Otherwise, dialog is freezing
-  await delay(10);
-  if (!checkInputColumnUI(macroMolecule, 'Sequence space')) return;
-  let scatterPlot: DG.ScatterPlotViewer | undefined = undefined;
-  const pg = DG.TaskBarProgressIndicator.create('Initializing sequence space ...');
-  // function for progress of umap
-  try {
-    function progressFunc(_nEpoch: number, epochsLength: number, embeddings: number[][]) {
-      let embedXCol: DG.Column | null = null;
-      let embedYCol: DG.Column | null = null;
-      if (!table.columns.names().includes(embedColsNames[0])) {
-        embedXCol = table.columns.add(DG.Column.float(embedColsNames[0], table.rowCount));
-        embedYCol = table.columns.add(DG.Column.float(embedColsNames[1], table.rowCount));
-        if (plotEmbeddings) {
-          scatterPlot = grok.shell
-            .tableView(table.name)
-            .scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
-        }
-      } else {
-        embedXCol = table.columns.byName(embedColsNames[0]);
-        embedYCol = table.columns.byName(embedColsNames[1]);
-      }
-      if (options?.[SHOW_SCATTERPLOT_PROGRESS]) {
-        scatterPlot?.root && ui.setUpdateIndicator(scatterPlot!.root, false);
-        embedXCol.init((i) => embeddings[i] ? embeddings[i][0] : undefined);
-        embedYCol.init((i) => embeddings[i] ? embeddings[i][1] : undefined);
-      }
-      const progress = (_nEpoch / epochsLength * 100);
-      pg.update(progress, `Running sequence space ... ${progress.toFixed(0)}%`);
-    }
-    const embedColsNames = getEmbeddingColsNames(table);
-    const withoutEmptyValues = DG.DataFrame.fromColumns([macroMolecule]).clone();
-    const emptyValsIdxs = removeEmptyStringRows(withoutEmptyValues, macroMolecule);
-    const chemSpaceParams: ISequenceSpaceParams = {
-      seqCol: withoutEmptyValues.col(macroMolecule.name)!,
-      methodName: methodName,
-      similarityMetric: similarityMetric,
-      embedAxesNames: embedColsNames,
-      options: {...options, sparseMatrixThreshold: sparseMatrixThreshold ?? 0.5,
-        usingSparseMatrix: table.rowCount > 20000},
-    };
-    const allowedRowCount = methodName === DimReductionMethods.UMAP ? 500000 : 15000;
-    // number of rows which will be processed relatively fast
-    const fastRowCount = methodName === DimReductionMethods.UMAP ? 5000 : 2000;
-    if (table.rowCount > allowedRowCount) {
-      grok.shell.warning(`Too many rows, maximum for sequence space is ${allowedRowCount}`);
-      return;
-    }
-    async function getSeqSpace() {
-      table.columns.add(DG.Column.float(embedColsNames[0], table.rowCount));
-      table.columns.add(DG.Column.float(embedColsNames[1], table.rowCount));
-      if (plotEmbeddings) {
-        scatterPlot = grok.shell
-          .tableView(table.name)
-          .scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
-        ui.setUpdateIndicator(scatterPlot.root, true);
-      }
-      let resolveF: Function | null = null;
-      const sub = grok.events.onViewerClosed.subscribe((args) => {
-        const v = args.args.viewer as unknown as DG.Viewer<any>;
-        if (v?.getOptions()?.look?.title && scatterPlot?.getOptions()?.look?.title &&
-          v?.getOptions()?.look?.title === scatterPlot?.getOptions()?.look?.title) {
-          grok.events.fireCustomEvent(DIMENSIONALITY_REDUCER_TERMINATE_EVENT, {});
-          sub.unsubscribe();
-          resolveF?.();
-          pg.close();
-        }
-      });
-      const sequenceSpaceResPromise = new Promise<ISequenceSpaceResult | undefined>(async (resolve, reject) => {
-        try {
-          resolveF = resolve;
-          const res = await getSequenceSpace(chemSpaceParams,
-            options?.[BYPASS_LARGE_DATA_WARNING] ? undefined : progressFunc);
-          resolve(res);
-        } catch (e) {
-          reject(e);
-        }
-      });
-      const sequenceSpaceRes = await sequenceSpaceResPromise;
-      pg.close();
-      sub.unsubscribe();
-      return sequenceSpaceRes ? processResult(sequenceSpaceRes) : sequenceSpaceRes;
-    }
-    if (table.rowCount > fastRowCount && !options?.[BYPASS_LARGE_DATA_WARNING]) {
-      ui.dialog().add(ui.divText(`Sequence space analysis might take several minutes.
-    Do you want to continue?`))
-        .onOK(async () => {
-          await getSeqSpace().catch((err: any) => {
-            pg.close();
-            const [errMsg, errStack] = errInfo(err);
-            _package.logger.error(errMsg, undefined, errStack);
-            if (scatterPlot)
-              scatterPlot.close();
-          });
-        })
-        .onCancel(() => { pg.close(); })
-        .show();
-    } else {
-      return await getSeqSpace();
-    }
-    function processResult(sequenceSpaceRes: ISequenceSpaceResult): DG.ScatterPlotViewer | undefined {
-      const embeddings = sequenceSpaceRes.coordinates;
-      for (const col of embeddings) {
-        const listValues = col.toList();
-        emptyValsIdxs.forEach((ind: number) => listValues.splice(ind, 0, null));
-        let embedCol = table.columns.byName(col.name);
-        if (!embedCol) {
-          embedCol = DG.Column.float(col.name, listValues.length);
-          table.columns.add(embedCol);
-        }
-        embedCol.init((i) => listValues[i]);
-        //table.columns.add(DG.Column.float(col.name, table.rowCount).init((i) => listValues[i]));
-      }
-      if (plotEmbeddings) {
-        if (!scatterPlot) {
-          scatterPlot = grok.shell
-            .tableView(table.name)
-            .scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
-        }
-        ui.setUpdateIndicator(scatterPlot.root, false);
-        return scatterPlot;
-      }
-    }
-  } catch (e) {
-    console.error(e);
-    pg.close();
-    const [errMsg, errStack] = errInfo(e);
-    _package.logger.error(errMsg, undefined, errStack);
-    if (scatterPlot)
-      (scatterPlot as unknown as DG.Viewer).close();
-  }
-  /*   const encodedCol = encodeMonomers(macroMolecule);
-  if (!encodedCol)
+export async function sequenceSpaceTopMenu(table: DG.DataFrame, molecules: DG.Column,
+  methodName: DimReductionMethods, similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames,
+  plotEmbeddings: boolean, preprocessingFunction?: DG.Func, options?: (IUMAPOptions | ITSNEOptions) & Options,
+  clusterEmbeddings?: boolean): Promise<DG.ScatterPlotViewer | undefined> {
+  if (!checkInputColumnUI(molecules, 'Sequence Space'))
     return;
-  const embedColsNames = getEmbeddingColsNames(table);
-  const withoutEmptyValues = DG.DataFrame.fromColumns([encodedCol]).clone();
-  const emptyValsIdxs = removeEmptyStringRows(withoutEmptyValues, encodedCol);
-  const chemSpaceParams = {
-    seqCol: withoutEmptyValues.col(encodedCol.name)!,
-    methodName: methodName,
-    similarityMetric: similarityMetric,
-    embedAxesNames: embedColsNames
-  };
-  const sequenceSpaceRes = await sequenceSpace(chemSpaceParams);
-  const embeddings = sequenceSpaceRes.coordinates;
-  for (const col of embeddings) {
-    const listValues = col.toList();
-    emptyValsIdxs.forEach((ind: number) => listValues.splice(ind, 0, null));
-    table.columns.add(DG.Column.fromList('double', col.name, listValues));
-  }
-  let sp;
-  if (plotEmbeddings) {
-    for (const v of grok.shell.views) {
-      if (v.name === table.name)
-        sp = (v as DG.TableView).scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
-    }
-  } */
+  if (!preprocessingFunction)
+    preprocessingFunction = DG.Func.find({name: 'macromoleculePreprocessingFunction', package: 'Bio'})[0];
+  const res = await reduceDimensionality(table, molecules, methodName,
+      similarityMetric as KnownMetrics, preprocessingFunction, plotEmbeddings, clusterEmbeddings ?? false, options, {
+        fastRowCount: 10000,
+        scatterPlotName: 'Sequence space',
+        bypassLargeDataWarning: options?.[BYPASS_LARGE_DATA_WARNING],
+      });
+  return res;
 }
 //top-menu: Bio | Convert | To Atomic Level...
@@ -1066,6 +966,7 @@ export function addCopyMenu(cell: DG.Cell, menu: DG.Menu): void {
 //description: Sequence similarity tracking and evaluation dataset diversity
 //meta.path: /apps/Tutorials/Demo/Bioinformatics/Similarity,%20Diversity
 //meta.isDemoScript: True
+//meta.demoSkip: GROK-14320
 export async function demoBioSimilarityDiversity(): Promise<void> {
   await demoBio01UI();
 }
@@ -1076,6 +977,7 @@ export async function demoBioSimilarityDiversity(): Promise<void> {
 //description: Exploring sequence space of Macromolecules, comparison with hierarchical clustering results
 //meta.path: /apps/Tutorials/Demo/Bioinformatics/Sequence%20Space
 //meta.isDemoScript: True
+//meta.demoSkip: GROK-14320
 export async function demoBioSequenceSpace(): Promise<void> {
   await demoBio01aUI();
 }
@@ -1086,6 +988,7 @@ export async function demoBioSequenceSpace(): Promise<void> {
 //description: Activity Cliffs analysis on Macromolecules data
 //meta.path: /apps/Tutorials/Demo/Bioinformatics/Activity%20Cliffs
 //meta.isDemoScript: True
+//meta.demoSkip: GROK-14320
 export async function demoBioActivityCliffs(): Promise<void> {
   await demoBio01bUI();
 }
@@ -1096,6 +999,7 @@ export async function demoBioActivityCliffs(): Promise<void> {
 //description: Atomic level structure of Macromolecules
 //meta.path: /apps/Tutorials/Demo/Bioinformatics/Atomic%20Level
 //meta.isDemoScript: True
+//meta.demoSkip: GROK-14320
 export async function demoBioAtomicLevel(): Promise<void> {
   await demoBio03UI();
 }
@@ -1106,6 +1010,7 @@ export async function demoBioAtomicLevel(): Promise<void> {
 //description: MSA and composition analysis on Helm data
 //meta.path: /apps/Tutorials/Demo/Bioinformatics/Helm,%20MSA,%20Sequence%20Space
 //meta.isDemoScript: True
+//meta.demoSkip: GROK-14320
 export async function demoBioHelmMsaSequenceSpace(): Promise<void> {
   await demoBio05UI();
 }

package/src/tests/pepsea-tests.ts CHANGED Viewed

@@ -16,7 +16,7 @@ category('PepSeA', () => {
     await awaitContainerStart();
     const table = DG.DataFrame.fromCsv(testCsv);
     const alignedCol = await runPepsea(table.getCol('HELM'), 'msa(HELM)');
-    expect(alignedCol !== null, true, 'PepSeA conainter has not started');
+    expect(alignedCol !== null, true, 'PepSeA container has not started');
     const alignedTestCol = table.getCol('MSA');
     for (let i = 0; i < alignedCol!.length; ++i)
       expect(alignedCol!.get(i) == alignedTestCol.get(i), true);

package/src/tests/sequence-space-utils.ts CHANGED Viewed

@@ -1,9 +1,10 @@
 import * as DG from 'datagrok-api/dg';
 import * as grok from 'datagrok-api/grok';
 import {expect} from '@datagrok-libraries/utils/src/test';
-import {BYPASS_LARGE_DATA_WARNING, sequenceSpaceTopMenu} from '../package';
+import {sequenceSpaceTopMenu} from '../package';
 import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
 import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
+import {BYPASS_LARGE_DATA_WARNING} from '@datagrok-libraries/ml/src/functionEditors/consts';
 export async function _testSequenceSpaceReturnsResult(
   df: DG.DataFrame, algorithm: DimReductionMethods, colName: string,
@@ -14,7 +15,10 @@ export async function _testSequenceSpaceReturnsResult(
   if (semType)
     col.semType = semType;
+  const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
+  if (!preprocessingFunc)
+    throw new Error('Preprocessing function not found');
   const sp = await sequenceSpaceTopMenu(df, df.col(colName)!, algorithm, MmDistanceFunctionsNames.LEVENSHTEIN, true,
-    0.6, {[`${BYPASS_LARGE_DATA_WARNING}`]: true});
+    preprocessingFunc, {[BYPASS_LARGE_DATA_WARNING]: true});
   expect(sp != null, true);
 }