@datagrok/bio 2.11.13 → 2.11.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/dist/23.js +2 -0
  2. package/dist/23.js.map +1 -0
  3. package/dist/282.js +2 -0
  4. package/dist/282.js.map +1 -0
  5. package/dist/361.js +1 -1
  6. package/dist/361.js.map +1 -1
  7. package/dist/40.js +2 -0
  8. package/dist/40.js.map +1 -0
  9. package/dist/562.js +2 -0
  10. package/dist/562.js.map +1 -0
  11. package/dist/586.js +2 -0
  12. package/dist/586.js.map +1 -0
  13. package/dist/65.js +2 -0
  14. package/dist/65.js.map +1 -0
  15. package/dist/{931.js → 935.js} +3 -3
  16. package/dist/935.js.map +1 -0
  17. package/dist/package-test.js +1 -1
  18. package/dist/package-test.js.map +1 -1
  19. package/dist/package.js +1 -1
  20. package/dist/package.js.map +1 -1
  21. package/package.json +4 -4
  22. package/scripts/sequence_generator.py +24 -22
  23. package/src/demo/bio05-helm-msa-sequence-space.ts +53 -13
  24. package/src/demo/utils.ts +2 -1
  25. package/src/function-edtiors/split-to-monomers-editor.ts +2 -2
  26. package/src/package.ts +80 -177
  27. package/src/tests/pepsea-tests.ts +1 -1
  28. package/src/tests/sequence-space-utils.ts +6 -2
  29. package/src/tests/similarity-diversity-tests.ts +16 -4
  30. package/src/utils/cell-renderer.ts +1 -1
  31. package/src/utils/helm-to-molfile.ts +1 -1
  32. package/src/utils/monomer-lib.ts +1 -1
  33. package/src/utils/pepsea.ts +16 -1
  34. package/src/viewers/vd-regions-viewer.ts +42 -16
  35. package/src/viewers/web-logo-viewer.ts +40 -20
  36. package/dist/1.js +0 -2
  37. package/dist/1.js.map +0 -1
  38. package/dist/190.js +0 -2
  39. package/dist/190.js.map +0 -1
  40. package/dist/381.js +0 -2
  41. package/dist/381.js.map +0 -1
  42. package/dist/770.js +0 -2
  43. package/dist/770.js.map +0 -1
  44. package/dist/868.js +0 -2
  45. package/dist/868.js.map +0 -1
  46. package/dist/931.js.map +0 -1
  47. package/src/utils/err-info.ts +0 -28
  48. /package/dist/{931.js.LICENSE.txt → 935.js.LICENSE.txt} +0 -0
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "name": "Leonid Stolbov",
6
6
  "email": "lstolbov@datagrok.ai"
7
7
  },
8
- "version": "2.11.13",
8
+ "version": "2.11.14",
9
9
  "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
10
10
  "repository": {
11
11
  "type": "git",
@@ -34,11 +34,11 @@
34
34
  ],
35
35
  "dependencies": {
36
36
  "@biowasm/aioli": "^3.1.0",
37
- "@datagrok-libraries/bio": "^5.39.9",
37
+ "@datagrok-libraries/bio": "^5.39.10",
38
38
  "@datagrok-libraries/chem-meta": "^1.2.1",
39
- "@datagrok-libraries/ml": "^6.3.56",
39
+ "@datagrok-libraries/ml": "^6.3.61",
40
40
  "@datagrok-libraries/tutorials": "^1.3.11",
41
- "@datagrok-libraries/utils": "^4.1.28",
41
+ "@datagrok-libraries/utils": "^4.1.34",
42
42
  "cash-dom": "^8.0.0",
43
43
  "css-loader": "^6.7.3",
44
44
  "datagrok-api": "^1.16.0",
package/scripts/sequence_generator.py CHANGED
@@ -3,25 +3,28 @@
3
3
  # description: Create the model peptides/DNA sequences with peptides data
4
4
  # language: python
5
5
  # tags: template, demo
6
- # input: int clusters = 5 [Number of superclusters]
7
- # input: int num_sequences = 50 [Number of sequences in each supercluster]
8
- # input: int motif_length = 12 [Average length of motif]
9
- # input: int max_variants_position = 3 [Maximum number of different letters in conservative position in motif]
10
- # input: int random_length = 3 [Average length of random sequence parts before and after motif]
11
- # input: int dispersion = 2 [Variation of total sequence length]
12
- # input: string alphabet_key = 'PT' [Sequence alphabet: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma]
13
- # input: bool disable_cliffs = False [Disable generation of cliffs]
14
- # input: double cliff_probability = 0.01 [Probability to make activity cliff of a sequence]
15
- # input: double cliff_strength = 4.0 [Strength of cliff]
16
- # input: string fasta_separator = '' {nullable: true}
6
+ # input: int clusters = 5 { caption: Number of clusters; category: Clusters }
7
+ # input: int num_sequences = 50 { caption: Number of sequences in each cluster; category: Clusters }
8
+ # input: int motif_length = 12 { caption: Average length of motif; category: Motif }
9
+ # input: int max_variants_position = 3 { caption: Maximum number of different letters in conservative position in motif; category: Motif }
10
+ # input: int random_length = 3 { caption: Average length of random sequence parts before and after motif; category: Motif }
11
+ # input: int dispersion = 2 { caption: Variation of total sequence length; category: Motif }
12
+ # input: bool enable_cliffs = true { caption: Enable activity cliffs; category: Activity cliffs }
13
+ # input: double cliff_probability = 0.01 { caption: Probability to make activity cliff of a sequence; category: Activity cliffs; format: 0.000}
14
+ # input: double cliff_strength = 4.0 { caption: Strength of cliff; category: Activity cliffs }
15
+ # input: string alphabet_key = "PT" { caption: Sequence alphabet; category: Output format; hint: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma}
16
+ # input: string fasta_separator = "" { caption: Fasta format separator; nullable: true; category: Output format}
17
+ # input: file helm_library_file { caption: HELM library to produce HELM output; nullable: true; category: Output format}
18
+ # input: string helm_connection_mode = "linear" { choices: ["linear", "cyclic", "mixed"]; caption: Peptides connection mode (HELM only); category: Output format}
17
19
  # output: dataframe sequences
18
20
 
19
- """
20
- The most simple options set running from command line
21
- python sequence_generator.py -c 4 -s 50 > output_file.tsv
22
- Basic options:
23
- number of clusters
24
- -s cluster size (number of sequences per cluster)
21
+
22
+ description="""The utility generates clusters of macromolecule sequences to test SAR fucntionality.
23
+ Each cluster contains randomly generated sequence motif.
24
+ Each sequence has activity - a Gauss-distributed random value.
25
+ All sequences in the cluster has activities from the same distibution.
26
+ The utility can simulate activity cliffs - random changes in the conservative motif letters,
27
+ leading to drastical change in the activity.
25
28
  """
26
29
 
27
30
  import random
@@ -280,8 +283,7 @@ def alphabet_from_helm(helm_library_file: str) -> Alphabet:
280
283
  def parse_command_line_args() -> Any:
281
284
  parser = argparse.ArgumentParser(
282
285
  prog="MotifSequencesGenerator",
283
- description="The program generates set of sequences containing sequence motifs "
284
- "for SAR functionality testing",
286
+ description=description,
285
287
  epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
286
288
  )
287
289
 
@@ -386,14 +388,14 @@ if not grok:
386
388
  random_length = args.random_length
387
389
  dispersion = args.dispersion
388
390
  alphabet_key = args.alphabet
389
- disable_cliffs = args.disable_cliffs
391
+ enable_cliffs = not args.disable_cliffs
390
392
  cliff_probability = args.cliff_probability
391
393
  cliff_strength = args.cliff_strength
392
394
  fasta_separator = args.fasta_separator
393
395
  helm_library_file = args.helm_library_file
394
396
  helm_connection_mode = args.helm_connection_mode
395
397
 
396
- helm_init = "helm_library_file" in globals() and helm_library_file is not None
398
+ helm_init = "helm_library_file" in globals() and helm_library_file is not None and helm_library_file != ''
397
399
 
398
400
  if not helm_init:
399
401
  alphabet: Alphabet = (
@@ -413,7 +415,7 @@ header, data = generate_sequences(
413
415
  random_length,
414
416
  dispersion,
415
417
  alphabet,
416
- not disable_cliffs,
418
+ enable_cliffs,
417
419
  cliff_probability,
418
420
  cliff_strength,
419
421
  )
package/src/demo/bio05-helm-msa-sequence-space.ts CHANGED
@@ -2,15 +2,18 @@ import * as grok from 'datagrok-api/grok';
2
2
  import * as ui from 'datagrok-api/ui';
3
3
  import * as DG from 'datagrok-api/dg';
4
4
 
5
- import {_package, sequenceSpaceTopMenu} from '../package';
6
- import {handleError} from './utils';
7
-
8
5
  import {IWebLogoViewer} from '@datagrok-libraries/bio/src/viewers/web-logo';
9
- import {pepseaMethods, runPepsea} from '../utils/pepsea';
6
+ import {awaitStatus, DockerContainerStatus} from '@datagrok-libraries/bio/src/utils/docker';
10
7
  import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
11
8
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
12
9
  import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
13
10
 
11
+ import {Pepsea, pepseaMethods, runPepsea} from '../utils/pepsea';
12
+ import {sequenceSpaceTopMenu} from '../package';
13
+ import {handleError} from './utils';
14
+
15
+ import {_package} from '../package';
16
+
14
17
  const helmFn: string = 'samples/HELM.csv';
15
18
 
16
19
  export async function demoBio05UI(): Promise<void> {
@@ -25,36 +28,73 @@ export async function demoBio05UI(): Promise<void> {
25
28
  const msaHelmColName: string = 'msa(HELM)';
26
29
  const dimRedMethod: DimReductionMethods = DimReductionMethods.UMAP;
27
30
 
31
+ const pepseaDcId = (await Pepsea.getDockerContainer()).id;
32
+ // // region For test: Stop container to test auto-start
33
+ // await grok.dapi.docker.dockerContainers.stop(pepseaDcId);
34
+ // await Pepsea.awaitStatus(pepseaDcId, 'stopped', 15000);
35
+ // // endregion
36
+ const pepseaDcPromise: Promise<DG.DockerContainer> = Pepsea.getDockerContainer();
37
+ let pepseaDcStatus: DockerContainerStatus;
38
+ let pepseaDcStartPromise: Promise<void>;
39
+
28
40
  try {
29
41
  const demoScript = new DemoScript(
30
42
  'Helm, MSA, Sequence Space',
31
43
  'MSA and composition analysis on Helm data');
32
44
  await demoScript
33
45
  .step(`Load peptides with non-natural aminoacids in 'HELM' notation`, async () => {
34
- view = grok.shell.addTableView(df = await _package.files.readCsv(helmFn));
46
+ [pepseaDcStatus, df] = await Promise.all([
47
+ (async () => { return (await pepseaDcPromise).status; })(),
48
+ _package.files.readCsv(helmFn)
49
+ ]);
50
+ view = grok.shell.addTableView(df);
35
51
 
36
52
  grok.shell.windows.showContextPanel = false;
37
53
  grok.shell.windows.showProperties = false;
54
+
55
+ if (pepseaDcStatus === 'started' || pepseaDcStatus === 'checking') {
56
+ _package.logger.debug(
57
+ `demoBio05UI(), PepSeA ('${Pepsea.dcName}') docker container status = '${pepseaDcStatus}'.`);
58
+ pepseaDcStartPromise = Promise.resolve();
59
+ } else {
60
+ _package.logger.warning(
61
+ `demoBio05UI(), PepSeA ('${Pepsea.dcName}') docker container is trying to start...`);
62
+
63
+ await grok.dapi.docker.dockerContainers.run(pepseaDcId);
64
+ pepseaDcStartPromise = awaitStatus(pepseaDcId, 'started', 30000, _package.logger);
65
+ }
38
66
  }, {
39
67
  description: 'Load dataset with macromolecules of \'Helm\' notation.',
40
68
  delay: 2000,
41
69
  })
42
70
  .step('Align peptides with non-natural aminoacids with PepSeA', async () => {
43
- helmCol = df.getCol(helmColName);
44
- const method: string = pepseaMethods[0];
45
- const gapOpen: number = 1.53;
46
- const gapExtend: number = 0;
47
- msaHelmCol = (await runPepsea(helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
48
- df.columns.add(msaHelmCol);
49
- await grok.data.detectSemanticTypes(df);
71
+ const pi = DG.TaskBarProgressIndicator.create('MSA by PepSeA ...');
72
+ try {
73
+ // TODO: Show splash if pepseaDcStartPromise is not resolved still
74
+ await pepseaDcStartPromise; // throws timeout
75
+ // Hide splash
76
+
77
+ helmCol = df.getCol(helmColName);
78
+ const method: string = pepseaMethods[0];
79
+ const gapOpen: number = 1.53;
80
+ const gapExtend: number = 0;
81
+ msaHelmCol = (await runPepsea(helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
82
+ if (!msaHelmCol)
83
+ throw new Error(`Empty MSA result.`);
84
+ df.columns.add(msaHelmCol);
85
+ await grok.data.detectSemanticTypes(df);
86
+ } finally {
87
+ pi.close();
88
+ }
50
89
  }, {
51
90
  // eslint-disable-next-line max-len
52
91
  description: 'Multiple sequence alignment (MSA) performed with PepSeA tool operating on non-natural aminoacids as well.',
53
92
  delay: 2000,
54
93
  })
55
94
  .step('Build sequence space', async () => {
95
+ const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
56
96
  ssViewer = (await sequenceSpaceTopMenu(df, msaHelmCol,
57
- dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN, true)) as DG.ScatterPlotViewer;
97
+ dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN, true, preprocessingFunc)) as DG.ScatterPlotViewer;
58
98
  view.dockManager.dock(ssViewer, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
59
99
  }, {
60
100
  description: 'Reduce sequence space dimensionality to display on 2D representation.',
package/src/demo/utils.ts CHANGED
@@ -63,8 +63,9 @@ export async function demoSequenceSpace(
63
63
  'lassoTool': true,
64
64
  })) as DG.ScatterPlotViewer;
65
65
  } else {
66
+ const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
66
67
  resSpaceViewer = (await sequenceSpaceTopMenu(df, df.getCol(colName),
67
- DimReductionMethods.UMAP, MmDistanceFunctionsNames.LEVENSHTEIN, true)) as DG.ScatterPlotViewer;
68
+ DimReductionMethods.UMAP, MmDistanceFunctionsNames.LEVENSHTEIN, true, preprocessingFunc)) as DG.ScatterPlotViewer;
68
69
  }
69
70
  view.dockManager.dock(resSpaceViewer!, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
70
71
  return resSpaceViewer;
package/src/function-edtiors/split-to-monomers-editor.ts CHANGED
@@ -6,7 +6,7 @@ export class SplitToMonomersFunctionEditor {
6
6
  tableInput: DG.InputBase;
7
7
  seqColInput: DG.InputBase;
8
8
 
9
- funcParamsDiv: HTMLDivElement;
9
+ funcParamsDiv: HTMLElement;
10
10
 
11
11
  get funcParams(): {} {
12
12
  return {
@@ -15,7 +15,7 @@ export class SplitToMonomersFunctionEditor {
15
15
  };
16
16
  }
17
17
 
18
- get paramsUI(): HTMLDivElement {
18
+ get paramsUI(): HTMLElement {
19
19
  return this.funcParamsDiv;
20
20
  }
21
21
 
package/src/package.ts CHANGED
@@ -9,12 +9,15 @@ import {removeEmptyStringRows} from '@datagrok-libraries/utils/src/dataframe-uti
9
9
  import {Options} from '@datagrok-libraries/utils/src/type-declarations';
10
10
  import {DimReductionMethods, ITSNEOptions, IUMAPOptions} from '@datagrok-libraries/ml/src/reduce-dimensionality';
11
11
  import {SequenceSpaceFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/seq-space-editor';
12
+ import {DimReductionBaseEditor, PreprocessFunctionReturnType}
13
+ from '@datagrok-libraries/ml/src/functionEditors/dimensionality-reduction-editor';
14
+ import {reduceDimensionality} from '@datagrok-libraries/ml/src/functionEditors/dimensionality-reducer';
12
15
  import {ActivityCliffsFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/activity-cliffs-editor';
13
16
  import {
14
17
  ISequenceSpaceParams, getActivityCliffs, SequenceSpaceFunc, CLIFFS_COL_ENCODE_FN
15
18
  } from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
16
19
  import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
17
- import {BitArrayMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
20
+ import {BitArrayMetrics, KnownMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
18
21
  import {
19
22
  TAGS as bioTAGS, ALPHABET, NOTATION,
20
23
  } from '@datagrok-libraries/bio/src/utils/macromolecule';
@@ -27,6 +30,7 @@ import {SCORE, calculateScores} from '@datagrok-libraries/bio/src/utils/macromol
27
30
  import {
28
31
  createJsonMonomerLibFromSdf, IMonomerLibHelper
29
32
  } from '@datagrok-libraries/bio/src/monomer-works/monomer-utils';
33
+ import {errInfo} from '@datagrok-libraries/bio/src/utils/err-info';
30
34
 
31
35
  import {getMacromoleculeColumns} from './utils/ui-utils';
32
36
  import {
@@ -74,7 +78,6 @@ import {getRegionDo} from './utils/get-region';
74
78
  import {GetRegionApp} from './apps/get-region-app';
75
79
  import {GetRegionFuncEditor} from './utils/get-region-func-editor';
76
80
  import {sequenceToMolfile} from './utils/sequence-to-mol';
77
- import {errInfo} from './utils/err-info';
78
81
  import {detectMacromoleculeProbeDo} from './utils/detect-macromolecule-probe';
79
82
 
80
83
  import {SHOW_SCATTERPLOT_PROGRESS} from '@datagrok-libraries/ml/src/functionEditors/seq-space-base-editor';
@@ -230,11 +233,20 @@ export function SplitToMonomersEditor(call: DG.FuncCall): void {
230
233
  //tags: editor
231
234
  //input: funccall call
232
235
  export function SequenceSpaceEditor(call: DG.FuncCall) {
233
- const funcEditor = new SequenceSpaceFunctionEditor(DG.SEMTYPE.MACROMOLECULE);
236
+ const funcEditor = new DimReductionBaseEditor({semtype: DG.SEMTYPE.MACROMOLECULE});
234
237
  ui.dialog({title: 'Sequence Space'})
235
- .add(funcEditor.paramsUI)
238
+ .add(funcEditor.getEditor())
236
239
  .onOK(async () => {
237
- return call.func.prepare(funcEditor.funcParams).call();
240
+ const params = funcEditor.getParams();
241
+ return call.func.prepare({
242
+ molecules: params.col,
243
+ table: params.table,
244
+ methodName: params.methodName,
245
+ similarityMetric: params.similarityMetric,
246
+ plotEmbeddings: params.plotEmbeddings,
247
+ options: params.options,
248
+ preprocessingFunction: params.preprocessingFunction,
249
+ }).call();
238
250
  })
239
251
  .show();
240
252
  }
@@ -477,6 +489,48 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
477
489
  }).finally(() => { pi.close(); });
478
490
  }
479
491
 
492
+ //name: Encode Sequences
493
+ //tags: dim-red-preprocessing-function
494
+ //meta.supportedSemTypes: Macromolecule
495
+ //meta.supportedTypes: string
496
+ //meta.supportedUnits: fasta,separator
497
+ //meta.supportedDistanceFunctions: Hamming,Levenshtein,Monomer chemical distance,Needlemann-Wunsch
498
+ //input: column col {semType: Macromolecule}
499
+ //input: string metric
500
+ //output: object result
501
+ export async function macromoleculePreprocessingFunction(
502
+ col: DG.Column, metric: MmDistanceFunctionsNames): Promise<PreprocessFunctionReturnType> {
503
+ const {seqList, options} = await getEncodedSeqSpaceCol(col, metric);
504
+ return {entries: seqList, options};
505
+ }
506
+
507
+ //name: Helm Fingerprints
508
+ //tags: dim-red-preprocessing-function
509
+ //meta.supportedSemTypes: Macromolecule
510
+ //meta.supportedTypes: string
511
+ //meta.supportedUnits: helm
512
+ //meta.supportedDistanceFunctions: Tanimoto,Asymmetric,Cosine,Sokal
513
+ //input: column col {semType: Macromolecule}
514
+ //input: string _metric
515
+ //output: object result
516
+ export async function helmPreprocessingFunction(
517
+ col: DG.Column<string>, _metric: BitArrayMetrics): Promise<PreprocessFunctionReturnType> {
518
+ if (col.version !== col.temp[MONOMERIC_COL_TAGS.LAST_INVALIDATED_VERSION])
519
+ await invalidateMols(col, false);
520
+ const molCol = col.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS];
521
+ const fingerPrints: DG.Column<DG.BitSet | null> =
522
+ await grok.functions.call('Chem:getMorganFingerprints', {molColumn: molCol});
523
+
524
+ const entries: Array<BitArray | null> = new Array(fingerPrints.length).fill(null);
525
+ for (let i = 0; i < fingerPrints.length; i++) {
526
+ if (fingerPrints.isNone(i) || !fingerPrints.get(i))
527
+ continue;
528
+ const fp = fingerPrints.get(i)!;
529
+ entries[i] = BitArray.fromUint32Array(fp.length, new Uint32Array(fp.getBuffer().buffer));
530
+ }
531
+ return {entries, options: {}};
532
+ }
533
+
480
534
  //top-menu: Bio | Analyze | Sequence Space...
481
535
  //name: Sequence Space
482
536
  //description: Creates 2D sequence space with projected sequences by pairwise distance
@@ -485,182 +539,26 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
485
539
  //input: string methodName { choices:["UMAP", "t-SNE"] }
486
540
  //input: string similarityMetric { choices:["Hamming", "Levenshtein", "Monomer chemical distance"] }
487
541
  //input: bool plotEmbeddings = true
488
- //input: double sparseMatrixThreshold = 0 [Similarity Threshold for sparse matrix calculation]
542
+ //input: func preprocessingFunction {optional: true}
489
543
  //input: object options {optional: true}
490
544
  //output: viewer result
491
545
  //editor: Bio:SequenceSpaceEditor
492
- export async function sequenceSpaceTopMenu(
493
- table: DG.DataFrame, macroMolecule: DG.Column, methodName: DimReductionMethods,
494
- similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames = MmDistanceFunctionsNames.LEVENSHTEIN,
495
- plotEmbeddings: boolean, sparseMatrixThreshold?: number, options?: (IUMAPOptions | ITSNEOptions) & Options,
496
- ): Promise<DG.Viewer | undefined> {
497
- // Delay is required for initial function dialog to close before starting invalidating of molfiles.
498
- // Otherwise, dialog is freezing
499
- await delay(10);
500
- if (!checkInputColumnUI(macroMolecule, 'Sequence space')) return;
501
- let scatterPlot: DG.ScatterPlotViewer | undefined = undefined;
502
- const pg = DG.TaskBarProgressIndicator.create('Initializing sequence space ...');
503
- // function for progress of umap
504
- try {
505
- function progressFunc(_nEpoch: number, epochsLength: number, embeddings: number[][]) {
506
- let embedXCol: DG.Column | null = null;
507
- let embedYCol: DG.Column | null = null;
508
- if (!table.columns.names().includes(embedColsNames[0])) {
509
- embedXCol = table.columns.add(DG.Column.float(embedColsNames[0], table.rowCount));
510
- embedYCol = table.columns.add(DG.Column.float(embedColsNames[1], table.rowCount));
511
- if (plotEmbeddings) {
512
- scatterPlot = grok.shell
513
- .tableView(table.name)
514
- .scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
515
- }
516
- } else {
517
- embedXCol = table.columns.byName(embedColsNames[0]);
518
- embedYCol = table.columns.byName(embedColsNames[1]);
519
- }
520
-
521
- if (options?.[SHOW_SCATTERPLOT_PROGRESS]) {
522
- scatterPlot?.root && ui.setUpdateIndicator(scatterPlot!.root, false);
523
- embedXCol.init((i) => embeddings[i] ? embeddings[i][0] : undefined);
524
- embedYCol.init((i) => embeddings[i] ? embeddings[i][1] : undefined);
525
- }
526
- const progress = (_nEpoch / epochsLength * 100);
527
- pg.update(progress, `Running sequence space ... ${progress.toFixed(0)}%`);
528
- }
529
-
530
- const embedColsNames = getEmbeddingColsNames(table);
531
- const withoutEmptyValues = DG.DataFrame.fromColumns([macroMolecule]).clone();
532
- const emptyValsIdxs = removeEmptyStringRows(withoutEmptyValues, macroMolecule);
533
-
534
- const chemSpaceParams: ISequenceSpaceParams = {
535
- seqCol: withoutEmptyValues.col(macroMolecule.name)!,
536
- methodName: methodName,
537
- similarityMetric: similarityMetric,
538
- embedAxesNames: embedColsNames,
539
- options: {...options, sparseMatrixThreshold: sparseMatrixThreshold ?? 0.5,
540
- usingSparseMatrix: table.rowCount > 20000},
541
- };
542
-
543
- const allowedRowCount = methodName === DimReductionMethods.UMAP ? 500000 : 15000;
544
- // number of rows which will be processed relatively fast
545
- const fastRowCount = methodName === DimReductionMethods.UMAP ? 5000 : 2000;
546
- if (table.rowCount > allowedRowCount) {
547
- grok.shell.warning(`Too many rows, maximum for sequence space is ${allowedRowCount}`);
548
- return;
549
- }
550
-
551
- async function getSeqSpace() {
552
- table.columns.add(DG.Column.float(embedColsNames[0], table.rowCount));
553
- table.columns.add(DG.Column.float(embedColsNames[1], table.rowCount));
554
- if (plotEmbeddings) {
555
- scatterPlot = grok.shell
556
- .tableView(table.name)
557
- .scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
558
- ui.setUpdateIndicator(scatterPlot.root, true);
559
- }
560
- let resolveF: Function | null = null;
561
-
562
- const sub = grok.events.onViewerClosed.subscribe((args) => {
563
- const v = args.args.viewer as unknown as DG.Viewer<any>;
564
- if (v?.getOptions()?.look?.title && scatterPlot?.getOptions()?.look?.title &&
565
- v?.getOptions()?.look?.title === scatterPlot?.getOptions()?.look?.title) {
566
- grok.events.fireCustomEvent(DIMENSIONALITY_REDUCER_TERMINATE_EVENT, {});
567
- sub.unsubscribe();
568
- resolveF?.();
569
- pg.close();
570
- }
571
- });
572
- const sequenceSpaceResPromise = new Promise<ISequenceSpaceResult | undefined>(async (resolve, reject) => {
573
- try {
574
- resolveF = resolve;
575
- const res = await getSequenceSpace(chemSpaceParams,
576
- options?.[BYPASS_LARGE_DATA_WARNING] ? undefined : progressFunc);
577
- resolve(res);
578
- } catch (e) {
579
- reject(e);
580
- }
581
- });
582
- const sequenceSpaceRes = await sequenceSpaceResPromise;
583
- pg.close();
584
- sub.unsubscribe();
585
- return sequenceSpaceRes ? processResult(sequenceSpaceRes) : sequenceSpaceRes;
586
- }
587
-
588
- if (table.rowCount > fastRowCount && !options?.[BYPASS_LARGE_DATA_WARNING]) {
589
- ui.dialog().add(ui.divText(`Sequence space analysis might take several minutes.
590
- Do you want to continue?`))
591
- .onOK(async () => {
592
- await getSeqSpace().catch((err: any) => {
593
- pg.close();
594
- const [errMsg, errStack] = errInfo(err);
595
- _package.logger.error(errMsg, undefined, errStack);
596
- if (scatterPlot)
597
- scatterPlot.close();
598
- });
599
- })
600
- .onCancel(() => { pg.close(); })
601
- .show();
602
- } else {
603
- return await getSeqSpace();
604
- }
605
-
606
- function processResult(sequenceSpaceRes: ISequenceSpaceResult): DG.ScatterPlotViewer | undefined {
607
- const embeddings = sequenceSpaceRes.coordinates;
608
- for (const col of embeddings) {
609
- const listValues = col.toList();
610
- emptyValsIdxs.forEach((ind: number) => listValues.splice(ind, 0, null));
611
- let embedCol = table.columns.byName(col.name);
612
- if (!embedCol) {
613
- embedCol = DG.Column.float(col.name, listValues.length);
614
- table.columns.add(embedCol);
615
- }
616
- embedCol.init((i) => listValues[i]);
617
- //table.columns.add(DG.Column.float(col.name, table.rowCount).init((i) => listValues[i]));
618
- }
619
- if (plotEmbeddings) {
620
- if (!scatterPlot) {
621
- scatterPlot = grok.shell
622
- .tableView(table.name)
623
- .scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
624
- }
625
- ui.setUpdateIndicator(scatterPlot.root, false);
626
- return scatterPlot;
627
- }
628
- }
629
- } catch (e) {
630
- console.error(e);
631
- pg.close();
632
- const [errMsg, errStack] = errInfo(e);
633
- _package.logger.error(errMsg, undefined, errStack);
634
- if (scatterPlot)
635
- (scatterPlot as unknown as DG.Viewer).close();
636
- }
637
- /* const encodedCol = encodeMonomers(macroMolecule);
638
- if (!encodedCol)
546
+ export async function sequenceSpaceTopMenu(table: DG.DataFrame, molecules: DG.Column,
547
+ methodName: DimReductionMethods, similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames,
548
+ plotEmbeddings: boolean, preprocessingFunction?: DG.Func, options?: (IUMAPOptions | ITSNEOptions) & Options
549
+ ): Promise<DG.ScatterPlotViewer | undefined> {
550
+ if (!checkInputColumnUI(molecules, 'Sequence Space'))
639
551
  return;
640
- const embedColsNames = getEmbeddingColsNames(table);
641
- const withoutEmptyValues = DG.DataFrame.fromColumns([encodedCol]).clone();
642
- const emptyValsIdxs = removeEmptyStringRows(withoutEmptyValues, encodedCol);
643
-
644
- const chemSpaceParams = {
645
- seqCol: withoutEmptyValues.col(encodedCol.name)!,
646
- methodName: methodName,
647
- similarityMetric: similarityMetric,
648
- embedAxesNames: embedColsNames
649
- };
650
- const sequenceSpaceRes = await sequenceSpace(chemSpaceParams);
651
- const embeddings = sequenceSpaceRes.coordinates;
652
- for (const col of embeddings) {
653
- const listValues = col.toList();
654
- emptyValsIdxs.forEach((ind: number) => listValues.splice(ind, 0, null));
655
- table.columns.add(DG.Column.fromList('double', col.name, listValues));
656
- }
657
- let sp;
658
- if (plotEmbeddings) {
659
- for (const v of grok.shell.views) {
660
- if (v.name === table.name)
661
- sp = (v as DG.TableView).scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
662
- }
663
- } */
552
+ if (!preprocessingFunction)
553
+ preprocessingFunction = DG.Func.find({name: 'macromoleculePreprocessingFunction', package: 'Bio'})[0];
554
+
555
+ const res = await reduceDimensionality(table, molecules, methodName,
556
+ similarityMetric as KnownMetrics, preprocessingFunction, plotEmbeddings, options, {
557
+ fastRowCount: 10000,
558
+ scatterPlotName: 'Sequence space',
559
+ bypassLargeDataWarning: options?.[BYPASS_LARGE_DATA_WARNING],
560
+ });
561
+ return res;
664
562
  }
665
563
 
666
564
  //top-menu: Bio | Convert | To Atomic Level...
@@ -1066,6 +964,7 @@ export function addCopyMenu(cell: DG.Cell, menu: DG.Menu): void {
1066
964
  //description: Sequence similarity tracking and evaluation dataset diversity
1067
965
  //meta.path: /apps/Tutorials/Demo/Bioinformatics/Similarity,%20Diversity
1068
966
  //meta.isDemoScript: True
967
+ //meta.demoSkip: GROK-14320
1069
968
  export async function demoBioSimilarityDiversity(): Promise<void> {
1070
969
  await demoBio01UI();
1071
970
  }
@@ -1076,6 +975,7 @@ export async function demoBioSimilarityDiversity(): Promise<void> {
1076
975
  //description: Exploring sequence space of Macromolecules, comparison with hierarchical clustering results
1077
976
  //meta.path: /apps/Tutorials/Demo/Bioinformatics/Sequence%20Space
1078
977
  //meta.isDemoScript: True
978
+ //meta.demoSkip: GROK-14320
1079
979
  export async function demoBioSequenceSpace(): Promise<void> {
1080
980
  await demoBio01aUI();
1081
981
  }
@@ -1086,6 +986,7 @@ export async function demoBioSequenceSpace(): Promise<void> {
1086
986
  //description: Activity Cliffs analysis on Macromolecules data
1087
987
  //meta.path: /apps/Tutorials/Demo/Bioinformatics/Activity%20Cliffs
1088
988
  //meta.isDemoScript: True
989
+ //meta.demoSkip: GROK-14320
1089
990
  export async function demoBioActivityCliffs(): Promise<void> {
1090
991
  await demoBio01bUI();
1091
992
  }
@@ -1096,6 +997,7 @@ export async function demoBioActivityCliffs(): Promise<void> {
1096
997
  //description: Atomic level structure of Macromolecules
1097
998
  //meta.path: /apps/Tutorials/Demo/Bioinformatics/Atomic%20Level
1098
999
  //meta.isDemoScript: True
1000
+ //meta.demoSkip: GROK-14320
1099
1001
  export async function demoBioAtomicLevel(): Promise<void> {
1100
1002
  await demoBio03UI();
1101
1003
  }
@@ -1106,6 +1008,7 @@ export async function demoBioAtomicLevel(): Promise<void> {
1106
1008
  //description: MSA and composition analysis on Helm data
1107
1009
  //meta.path: /apps/Tutorials/Demo/Bioinformatics/Helm,%20MSA,%20Sequence%20Space
1108
1010
  //meta.isDemoScript: True
1011
+ //meta.demoSkip: GROK-14320
1109
1012
  export async function demoBioHelmMsaSequenceSpace(): Promise<void> {
1110
1013
  await demoBio05UI();
1111
1014
  }
package/src/tests/pepsea-tests.ts CHANGED
@@ -16,7 +16,7 @@ category('PepSeA', () => {
16
16
  await awaitContainerStart();
17
17
  const table = DG.DataFrame.fromCsv(testCsv);
18
18
  const alignedCol = await runPepsea(table.getCol('HELM'), 'msa(HELM)');
19
- expect(alignedCol !== null, true, 'PepSeA conainter has not started');
19
+ expect(alignedCol !== null, true, 'PepSeA container has not started');
20
20
  const alignedTestCol = table.getCol('MSA');
21
21
  for (let i = 0; i < alignedCol!.length; ++i)
22
22
  expect(alignedCol!.get(i) == alignedTestCol.get(i), true);
package/src/tests/sequence-space-utils.ts CHANGED
@@ -1,9 +1,10 @@
1
1
  import * as DG from 'datagrok-api/dg';
2
2
  import * as grok from 'datagrok-api/grok';
3
3
  import {expect} from '@datagrok-libraries/utils/src/test';
4
- import {BYPASS_LARGE_DATA_WARNING, sequenceSpaceTopMenu} from '../package';
4
+ import {sequenceSpaceTopMenu} from '../package';
5
5
  import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
6
6
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
7
+ import {BYPASS_LARGE_DATA_WARNING} from '@datagrok-libraries/ml/src/functionEditors/consts';
7
8
 
8
9
  export async function _testSequenceSpaceReturnsResult(
9
10
  df: DG.DataFrame, algorithm: DimReductionMethods, colName: string,
@@ -14,7 +15,10 @@ export async function _testSequenceSpaceReturnsResult(
14
15
  if (semType)
15
16
  col.semType = semType;
16
17
 
18
+ const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
19
+ if (!preprocessingFunc)
20
+ throw new Error('Preprocessing function not found');
17
21
  const sp = await sequenceSpaceTopMenu(df, df.col(colName)!, algorithm, MmDistanceFunctionsNames.LEVENSHTEIN, true,
18
- 0.6, {[`${BYPASS_LARGE_DATA_WARNING}`]: true});
22
+ preprocessingFunc, {[BYPASS_LARGE_DATA_WARNING]: true});
19
23
  expect(sp != null, true);
20
24
  }