@datagrok/bio 2.11.13 → 2.11.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/dist/23.js +2 -0
  2. package/dist/23.js.map +1 -0
  3. package/dist/282.js +2 -0
  4. package/dist/282.js.map +1 -0
  5. package/dist/356.js +2 -0
  6. package/dist/356.js.map +1 -0
  7. package/dist/361.js +1 -1
  8. package/dist/361.js.map +1 -1
  9. package/dist/40.js +2 -0
  10. package/dist/40.js.map +1 -0
  11. package/dist/562.js +2 -0
  12. package/dist/562.js.map +1 -0
  13. package/dist/586.js +2 -0
  14. package/dist/586.js.map +1 -0
  15. package/dist/65.js +2 -0
  16. package/dist/65.js.map +1 -0
  17. package/dist/796.js +2 -0
  18. package/dist/796.js.map +1 -0
  19. package/dist/8473fcbfb6e85ca6c852.wasm +0 -0
  20. package/dist/{931.js → 935.js} +3 -3
  21. package/dist/935.js.map +1 -0
  22. package/dist/9a8fbf37666e32487835.wasm +0 -0
  23. package/dist/package-test.js +1 -1
  24. package/dist/package-test.js.map +1 -1
  25. package/dist/package.js +1 -1
  26. package/dist/package.js.map +1 -1
  27. package/package.json +4 -4
  28. package/scripts/sequence_generator.py +24 -22
  29. package/src/demo/bio05-helm-msa-sequence-space.ts +53 -13
  30. package/src/demo/utils.ts +2 -1
  31. package/src/function-edtiors/split-to-monomers-editor.ts +2 -2
  32. package/src/package.ts +82 -177
  33. package/src/tests/pepsea-tests.ts +1 -1
  34. package/src/tests/sequence-space-utils.ts +6 -2
  35. package/src/tests/similarity-diversity-tests.ts +16 -4
  36. package/src/utils/cell-renderer.ts +1 -1
  37. package/src/utils/helm-to-molfile.ts +1 -1
  38. package/src/utils/monomer-lib.ts +1 -1
  39. package/src/utils/pepsea.ts +16 -1
  40. package/src/viewers/vd-regions-viewer.ts +42 -16
  41. package/src/viewers/web-logo-viewer.ts +40 -20
  42. package/dist/1.js +0 -2
  43. package/dist/1.js.map +0 -1
  44. package/dist/190.js +0 -2
  45. package/dist/190.js.map +0 -1
  46. package/dist/381.js +0 -2
  47. package/dist/381.js.map +0 -1
  48. package/dist/770.js +0 -2
  49. package/dist/770.js.map +0 -1
  50. package/dist/868.js +0 -2
  51. package/dist/868.js.map +0 -1
  52. package/dist/931.js.map +0 -1
  53. package/src/utils/err-info.ts +0 -28
  54. /package/dist/{931.js.LICENSE.txt → 935.js.LICENSE.txt} +0 -0
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "name": "Leonid Stolbov",
6
6
  "email": "lstolbov@datagrok.ai"
7
7
  },
8
- "version": "2.11.13",
8
+ "version": "2.11.15",
9
9
  "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
10
10
  "repository": {
11
11
  "type": "git",
@@ -34,11 +34,11 @@
34
34
  ],
35
35
  "dependencies": {
36
36
  "@biowasm/aioli": "^3.1.0",
37
- "@datagrok-libraries/bio": "^5.39.9",
37
+ "@datagrok-libraries/bio": "^5.39.10",
38
38
  "@datagrok-libraries/chem-meta": "^1.2.1",
39
- "@datagrok-libraries/ml": "^6.3.56",
39
+ "@datagrok-libraries/ml": "^6.3.62",
40
40
  "@datagrok-libraries/tutorials": "^1.3.11",
41
- "@datagrok-libraries/utils": "^4.1.28",
41
+ "@datagrok-libraries/utils": "^4.1.34",
42
42
  "cash-dom": "^8.0.0",
43
43
  "css-loader": "^6.7.3",
44
44
  "datagrok-api": "^1.16.0",
package/scripts/sequence_generator.py CHANGED
@@ -3,25 +3,28 @@
3
3
  # description: Create the model peptides/DNA sequences with peptides data
4
4
  # language: python
5
5
  # tags: template, demo
6
- # input: int clusters = 5 [Number of superclusters]
7
- # input: int num_sequences = 50 [Number of sequences in each supercluster]
8
- # input: int motif_length = 12 [Average length of motif]
9
- # input: int max_variants_position = 3 [Maximum number of different letters in conservative position in motif]
10
- # input: int random_length = 3 [Average length of random sequence parts before and after motif]
11
- # input: int dispersion = 2 [Variation of total sequence length]
12
- # input: string alphabet_key = 'PT' [Sequence alphabet: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma]
13
- # input: bool disable_cliffs = False [Disable generation of cliffs]
14
- # input: double cliff_probability = 0.01 [Probability to make activity cliff of a sequence]
15
- # input: double cliff_strength = 4.0 [Strength of cliff]
16
- # input: string fasta_separator = '' {nullable: true}
6
+ # input: int clusters = 5 { caption: Number of clusters; category: Clusters }
7
+ # input: int num_sequences = 50 { caption: Number of sequences in each cluster; category: Clusters }
8
+ # input: int motif_length = 12 { caption: Average length of motif; category: Motif }
9
+ # input: int max_variants_position = 3 { caption: Maximum number of different letters in conservative position in motif; category: Motif }
10
+ # input: int random_length = 3 { caption: Average length of random sequence parts before and after motif; category: Motif }
11
+ # input: int dispersion = 2 { caption: Variation of total sequence length; category: Motif }
12
+ # input: bool enable_cliffs = true { caption: Enable activity cliffs; category: Activity cliffs }
13
+ # input: double cliff_probability = 0.01 { caption: Probability to make activity cliff of a sequence; category: Activity cliffs; format: 0.000}
14
+ # input: double cliff_strength = 4.0 { caption: Strength of cliff; category: Activity cliffs }
15
+ # input: string alphabet_key = "PT" { caption: Sequence alphabet; category: Output format; hint: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma}
16
+ # input: string fasta_separator = "" { caption: Fasta format separator; nullable: true; category: Output format}
17
+ # input: file helm_library_file { caption: HELM library to produce HELM output; nullable: true; category: Output format}
18
+ # input: string helm_connection_mode = "linear" { choices: ["linear", "cyclic", "mixed"]; caption: Peptides connection mode (HELM only); category: Output format}
17
19
  # output: dataframe sequences
18
20
 
19
- """
20
- The most simple options set running from command line
21
- python sequence_generator.py -c 4 -s 50 > output_file.tsv
22
- Basic options:
23
- number of clusters
24
- -s cluster size (number of sequences per cluster)
21
+
22
+ description="""The utility generates clusters of macromolecule sequences to test SAR fucntionality.
23
+ Each cluster contains randomly generated sequence motif.
24
+ Each sequence has activity - a Gauss-distributed random value.
25
+ All sequences in the cluster has activities from the same distibution.
26
+ The utility can simulate activity cliffs - random changes in the conservative motif letters,
27
+ leading to drastical change in the activity.
25
28
  """
26
29
 
27
30
  import random
@@ -280,8 +283,7 @@ def alphabet_from_helm(helm_library_file: str) -> Alphabet:
280
283
  def parse_command_line_args() -> Any:
281
284
  parser = argparse.ArgumentParser(
282
285
  prog="MotifSequencesGenerator",
283
- description="The program generates set of sequences containing sequence motifs "
284
- "for SAR functionality testing",
286
+ description=description,
285
287
  epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
286
288
  )
287
289
 
@@ -386,14 +388,14 @@ if not grok:
386
388
  random_length = args.random_length
387
389
  dispersion = args.dispersion
388
390
  alphabet_key = args.alphabet
389
- disable_cliffs = args.disable_cliffs
391
+ enable_cliffs = not args.disable_cliffs
390
392
  cliff_probability = args.cliff_probability
391
393
  cliff_strength = args.cliff_strength
392
394
  fasta_separator = args.fasta_separator
393
395
  helm_library_file = args.helm_library_file
394
396
  helm_connection_mode = args.helm_connection_mode
395
397
 
396
- helm_init = "helm_library_file" in globals() and helm_library_file is not None
398
+ helm_init = "helm_library_file" in globals() and helm_library_file is not None and helm_library_file != ''
397
399
 
398
400
  if not helm_init:
399
401
  alphabet: Alphabet = (
@@ -413,7 +415,7 @@ header, data = generate_sequences(
413
415
  random_length,
414
416
  dispersion,
415
417
  alphabet,
416
- not disable_cliffs,
418
+ enable_cliffs,
417
419
  cliff_probability,
418
420
  cliff_strength,
419
421
  )
package/src/demo/bio05-helm-msa-sequence-space.ts CHANGED
@@ -2,15 +2,18 @@ import * as grok from 'datagrok-api/grok';
2
2
  import * as ui from 'datagrok-api/ui';
3
3
  import * as DG from 'datagrok-api/dg';
4
4
 
5
- import {_package, sequenceSpaceTopMenu} from '../package';
6
- import {handleError} from './utils';
7
-
8
5
  import {IWebLogoViewer} from '@datagrok-libraries/bio/src/viewers/web-logo';
9
- import {pepseaMethods, runPepsea} from '../utils/pepsea';
6
+ import {awaitStatus, DockerContainerStatus} from '@datagrok-libraries/bio/src/utils/docker';
10
7
  import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
11
8
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
12
9
  import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
13
10
 
11
+ import {Pepsea, pepseaMethods, runPepsea} from '../utils/pepsea';
12
+ import {sequenceSpaceTopMenu} from '../package';
13
+ import {handleError} from './utils';
14
+
15
+ import {_package} from '../package';
16
+
14
17
  const helmFn: string = 'samples/HELM.csv';
15
18
 
16
19
  export async function demoBio05UI(): Promise<void> {
@@ -25,36 +28,73 @@ export async function demoBio05UI(): Promise<void> {
25
28
  const msaHelmColName: string = 'msa(HELM)';
26
29
  const dimRedMethod: DimReductionMethods = DimReductionMethods.UMAP;
27
30
 
31
+ const pepseaDcId = (await Pepsea.getDockerContainer()).id;
32
+ // // region For test: Stop container to test auto-start
33
+ // await grok.dapi.docker.dockerContainers.stop(pepseaDcId);
34
+ // await Pepsea.awaitStatus(pepseaDcId, 'stopped', 15000);
35
+ // // endregion
36
+ const pepseaDcPromise: Promise<DG.DockerContainer> = Pepsea.getDockerContainer();
37
+ let pepseaDcStatus: DockerContainerStatus;
38
+ let pepseaDcStartPromise: Promise<void>;
39
+
28
40
  try {
29
41
  const demoScript = new DemoScript(
30
42
  'Helm, MSA, Sequence Space',
31
43
  'MSA and composition analysis on Helm data');
32
44
  await demoScript
33
45
  .step(`Load peptides with non-natural aminoacids in 'HELM' notation`, async () => {
34
- view = grok.shell.addTableView(df = await _package.files.readCsv(helmFn));
46
+ [pepseaDcStatus, df] = await Promise.all([
47
+ (async () => { return (await pepseaDcPromise).status; })(),
48
+ _package.files.readCsv(helmFn)
49
+ ]);
50
+ view = grok.shell.addTableView(df);
35
51
 
36
52
  grok.shell.windows.showContextPanel = false;
37
53
  grok.shell.windows.showProperties = false;
54
+
55
+ if (pepseaDcStatus === 'started' || pepseaDcStatus === 'checking') {
56
+ _package.logger.debug(
57
+ `demoBio05UI(), PepSeA ('${Pepsea.dcName}') docker container status = '${pepseaDcStatus}'.`);
58
+ pepseaDcStartPromise = Promise.resolve();
59
+ } else {
60
+ _package.logger.warning(
61
+ `demoBio05UI(), PepSeA ('${Pepsea.dcName}') docker container is trying to start...`);
62
+
63
+ await grok.dapi.docker.dockerContainers.run(pepseaDcId);
64
+ pepseaDcStartPromise = awaitStatus(pepseaDcId, 'started', 30000, _package.logger);
65
+ }
38
66
  }, {
39
67
  description: 'Load dataset with macromolecules of \'Helm\' notation.',
40
68
  delay: 2000,
41
69
  })
42
70
  .step('Align peptides with non-natural aminoacids with PepSeA', async () => {
43
- helmCol = df.getCol(helmColName);
44
- const method: string = pepseaMethods[0];
45
- const gapOpen: number = 1.53;
46
- const gapExtend: number = 0;
47
- msaHelmCol = (await runPepsea(helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
48
- df.columns.add(msaHelmCol);
49
- await grok.data.detectSemanticTypes(df);
71
+ const pi = DG.TaskBarProgressIndicator.create('MSA by PepSeA ...');
72
+ try {
73
+ // TODO: Show splash if pepseaDcStartPromise is not resolved still
74
+ await pepseaDcStartPromise; // throws timeout
75
+ // Hide splash
76
+
77
+ helmCol = df.getCol(helmColName);
78
+ const method: string = pepseaMethods[0];
79
+ const gapOpen: number = 1.53;
80
+ const gapExtend: number = 0;
81
+ msaHelmCol = (await runPepsea(helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
82
+ if (!msaHelmCol)
83
+ throw new Error(`Empty MSA result.`);
84
+ df.columns.add(msaHelmCol);
85
+ await grok.data.detectSemanticTypes(df);
86
+ } finally {
87
+ pi.close();
88
+ }
50
89
  }, {
51
90
  // eslint-disable-next-line max-len
52
91
  description: 'Multiple sequence alignment (MSA) performed with PepSeA tool operating on non-natural aminoacids as well.',
53
92
  delay: 2000,
54
93
  })
55
94
  .step('Build sequence space', async () => {
95
+ const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
56
96
  ssViewer = (await sequenceSpaceTopMenu(df, msaHelmCol,
57
- dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN, true)) as DG.ScatterPlotViewer;
97
+ dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN, true, preprocessingFunc)) as DG.ScatterPlotViewer;
58
98
  view.dockManager.dock(ssViewer, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
59
99
  }, {
60
100
  description: 'Reduce sequence space dimensionality to display on 2D representation.',
package/src/demo/utils.ts CHANGED
@@ -63,8 +63,9 @@ export async function demoSequenceSpace(
63
63
  'lassoTool': true,
64
64
  })) as DG.ScatterPlotViewer;
65
65
  } else {
66
+ const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
66
67
  resSpaceViewer = (await sequenceSpaceTopMenu(df, df.getCol(colName),
67
- DimReductionMethods.UMAP, MmDistanceFunctionsNames.LEVENSHTEIN, true)) as DG.ScatterPlotViewer;
68
+ DimReductionMethods.UMAP, MmDistanceFunctionsNames.LEVENSHTEIN, true, preprocessingFunc)) as DG.ScatterPlotViewer;
68
69
  }
69
70
  view.dockManager.dock(resSpaceViewer!, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
70
71
  return resSpaceViewer;
package/src/function-edtiors/split-to-monomers-editor.ts CHANGED
@@ -6,7 +6,7 @@ export class SplitToMonomersFunctionEditor {
6
6
  tableInput: DG.InputBase;
7
7
  seqColInput: DG.InputBase;
8
8
 
9
- funcParamsDiv: HTMLDivElement;
9
+ funcParamsDiv: HTMLElement;
10
10
 
11
11
  get funcParams(): {} {
12
12
  return {
@@ -15,7 +15,7 @@ export class SplitToMonomersFunctionEditor {
15
15
  };
16
16
  }
17
17
 
18
- get paramsUI(): HTMLDivElement {
18
+ get paramsUI(): HTMLElement {
19
19
  return this.funcParamsDiv;
20
20
  }
21
21
 
package/src/package.ts CHANGED
@@ -9,12 +9,15 @@ import {removeEmptyStringRows} from '@datagrok-libraries/utils/src/dataframe-uti
9
9
  import {Options} from '@datagrok-libraries/utils/src/type-declarations';
10
10
  import {DimReductionMethods, ITSNEOptions, IUMAPOptions} from '@datagrok-libraries/ml/src/reduce-dimensionality';
11
11
  import {SequenceSpaceFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/seq-space-editor';
12
+ import {DimReductionBaseEditor, PreprocessFunctionReturnType}
13
+ from '@datagrok-libraries/ml/src/functionEditors/dimensionality-reduction-editor';
14
+ import {reduceDimensionality} from '@datagrok-libraries/ml/src/functionEditors/dimensionality-reducer';
12
15
  import {ActivityCliffsFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/activity-cliffs-editor';
13
16
  import {
14
17
  ISequenceSpaceParams, getActivityCliffs, SequenceSpaceFunc, CLIFFS_COL_ENCODE_FN
15
18
  } from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
16
19
  import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
17
- import {BitArrayMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
20
+ import {BitArrayMetrics, KnownMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
18
21
  import {
19
22
  TAGS as bioTAGS, ALPHABET, NOTATION,
20
23
  } from '@datagrok-libraries/bio/src/utils/macromolecule';
@@ -27,6 +30,7 @@ import {SCORE, calculateScores} from '@datagrok-libraries/bio/src/utils/macromol
27
30
  import {
28
31
  createJsonMonomerLibFromSdf, IMonomerLibHelper
29
32
  } from '@datagrok-libraries/bio/src/monomer-works/monomer-utils';
33
+ import {errInfo} from '@datagrok-libraries/bio/src/utils/err-info';
30
34
 
31
35
  import {getMacromoleculeColumns} from './utils/ui-utils';
32
36
  import {
@@ -74,7 +78,6 @@ import {getRegionDo} from './utils/get-region';
74
78
  import {GetRegionApp} from './apps/get-region-app';
75
79
  import {GetRegionFuncEditor} from './utils/get-region-func-editor';
76
80
  import {sequenceToMolfile} from './utils/sequence-to-mol';
77
- import {errInfo} from './utils/err-info';
78
81
  import {detectMacromoleculeProbeDo} from './utils/detect-macromolecule-probe';
79
82
 
80
83
  import {SHOW_SCATTERPLOT_PROGRESS} from '@datagrok-libraries/ml/src/functionEditors/seq-space-base-editor';
@@ -230,11 +233,21 @@ export function SplitToMonomersEditor(call: DG.FuncCall): void {
230
233
  //tags: editor
231
234
  //input: funccall call
232
235
  export function SequenceSpaceEditor(call: DG.FuncCall) {
233
- const funcEditor = new SequenceSpaceFunctionEditor(DG.SEMTYPE.MACROMOLECULE);
236
+ const funcEditor = new DimReductionBaseEditor({semtype: DG.SEMTYPE.MACROMOLECULE});
234
237
  ui.dialog({title: 'Sequence Space'})
235
- .add(funcEditor.paramsUI)
238
+ .add(funcEditor.getEditor())
236
239
  .onOK(async () => {
237
- return call.func.prepare(funcEditor.funcParams).call();
240
+ const params = funcEditor.getParams();
241
+ return call.func.prepare({
242
+ molecules: params.col,
243
+ table: params.table,
244
+ methodName: params.methodName,
245
+ similarityMetric: params.similarityMetric,
246
+ plotEmbeddings: params.plotEmbeddings,
247
+ options: params.options,
248
+ preprocessingFunction: params.preprocessingFunction,
249
+ clusterEmbeddings: params.clusterEmbeddings,
250
+ }).call();
238
251
  })
239
252
  .show();
240
253
  }
@@ -477,6 +490,48 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
477
490
  }).finally(() => { pi.close(); });
478
491
  }
479
492
 
493
+ //name: Encode Sequences
494
+ //tags: dim-red-preprocessing-function
495
+ //meta.supportedSemTypes: Macromolecule
496
+ //meta.supportedTypes: string
497
+ //meta.supportedUnits: fasta,separator,helm
498
+ //meta.supportedDistanceFunctions: Hamming,Levenshtein,Monomer chemical distance,Needlemann-Wunsch
499
+ //input: column col {semType: Macromolecule}
500
+ //input: string metric
501
+ //output: object result
502
+ export async function macromoleculePreprocessingFunction(
503
+ col: DG.Column, metric: MmDistanceFunctionsNames): Promise<PreprocessFunctionReturnType> {
504
+ const {seqList, options} = await getEncodedSeqSpaceCol(col, metric);
505
+ return {entries: seqList, options};
506
+ }
507
+
508
+ //name: Helm Fingerprints
509
+ //tags: dim-red-preprocessing-function
510
+ //meta.supportedSemTypes: Macromolecule
511
+ //meta.supportedTypes: string
512
+ //meta.supportedUnits: helm
513
+ //meta.supportedDistanceFunctions: Tanimoto,Asymmetric,Cosine,Sokal
514
+ //input: column col {semType: Macromolecule}
515
+ //input: string _metric
516
+ //output: object result
517
+ export async function helmPreprocessingFunction(
518
+ col: DG.Column<string>, _metric: BitArrayMetrics): Promise<PreprocessFunctionReturnType> {
519
+ if (col.version !== col.temp[MONOMERIC_COL_TAGS.LAST_INVALIDATED_VERSION])
520
+ await invalidateMols(col, false);
521
+ const molCol = col.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS];
522
+ const fingerPrints: DG.Column<DG.BitSet | null> =
523
+ await grok.functions.call('Chem:getMorganFingerprints', {molColumn: molCol});
524
+
525
+ const entries: Array<BitArray | null> = new Array(fingerPrints.length).fill(null);
526
+ for (let i = 0; i < fingerPrints.length; i++) {
527
+ if (fingerPrints.isNone(i) || !fingerPrints.get(i))
528
+ continue;
529
+ const fp = fingerPrints.get(i)!;
530
+ entries[i] = BitArray.fromUint32Array(fp.length, new Uint32Array(fp.getBuffer().buffer));
531
+ }
532
+ return {entries, options: {}};
533
+ }
534
+
480
535
  //top-menu: Bio | Analyze | Sequence Space...
481
536
  //name: Sequence Space
482
537
  //description: Creates 2D sequence space with projected sequences by pairwise distance
@@ -485,182 +540,27 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
485
540
  //input: string methodName { choices:["UMAP", "t-SNE"] }
486
541
  //input: string similarityMetric { choices:["Hamming", "Levenshtein", "Monomer chemical distance"] }
487
542
  //input: bool plotEmbeddings = true
488
- //input: double sparseMatrixThreshold = 0 [Similarity Threshold for sparse matrix calculation]
543
+ //input: func preprocessingFunction {optional: true}
489
544
  //input: object options {optional: true}
545
+ //input: bool clusterEmbeddings = true { optional: true }
490
546
  //output: viewer result
491
547
  //editor: Bio:SequenceSpaceEditor
492
- export async function sequenceSpaceTopMenu(
493
- table: DG.DataFrame, macroMolecule: DG.Column, methodName: DimReductionMethods,
494
- similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames = MmDistanceFunctionsNames.LEVENSHTEIN,
495
- plotEmbeddings: boolean, sparseMatrixThreshold?: number, options?: (IUMAPOptions | ITSNEOptions) & Options,
496
- ): Promise<DG.Viewer | undefined> {
497
- // Delay is required for initial function dialog to close before starting invalidating of molfiles.
498
- // Otherwise, dialog is freezing
499
- await delay(10);
500
- if (!checkInputColumnUI(macroMolecule, 'Sequence space')) return;
501
- let scatterPlot: DG.ScatterPlotViewer | undefined = undefined;
502
- const pg = DG.TaskBarProgressIndicator.create('Initializing sequence space ...');
503
- // function for progress of umap
504
- try {
505
- function progressFunc(_nEpoch: number, epochsLength: number, embeddings: number[][]) {
506
- let embedXCol: DG.Column | null = null;
507
- let embedYCol: DG.Column | null = null;
508
- if (!table.columns.names().includes(embedColsNames[0])) {
509
- embedXCol = table.columns.add(DG.Column.float(embedColsNames[0], table.rowCount));
510
- embedYCol = table.columns.add(DG.Column.float(embedColsNames[1], table.rowCount));
511
- if (plotEmbeddings) {
512
- scatterPlot = grok.shell
513
- .tableView(table.name)
514
- .scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
515
- }
516
- } else {
517
- embedXCol = table.columns.byName(embedColsNames[0]);
518
- embedYCol = table.columns.byName(embedColsNames[1]);
519
- }
520
-
521
- if (options?.[SHOW_SCATTERPLOT_PROGRESS]) {
522
- scatterPlot?.root && ui.setUpdateIndicator(scatterPlot!.root, false);
523
- embedXCol.init((i) => embeddings[i] ? embeddings[i][0] : undefined);
524
- embedYCol.init((i) => embeddings[i] ? embeddings[i][1] : undefined);
525
- }
526
- const progress = (_nEpoch / epochsLength * 100);
527
- pg.update(progress, `Running sequence space ... ${progress.toFixed(0)}%`);
528
- }
529
-
530
- const embedColsNames = getEmbeddingColsNames(table);
531
- const withoutEmptyValues = DG.DataFrame.fromColumns([macroMolecule]).clone();
532
- const emptyValsIdxs = removeEmptyStringRows(withoutEmptyValues, macroMolecule);
533
-
534
- const chemSpaceParams: ISequenceSpaceParams = {
535
- seqCol: withoutEmptyValues.col(macroMolecule.name)!,
536
- methodName: methodName,
537
- similarityMetric: similarityMetric,
538
- embedAxesNames: embedColsNames,
539
- options: {...options, sparseMatrixThreshold: sparseMatrixThreshold ?? 0.5,
540
- usingSparseMatrix: table.rowCount > 20000},
541
- };
542
-
543
- const allowedRowCount = methodName === DimReductionMethods.UMAP ? 500000 : 15000;
544
- // number of rows which will be processed relatively fast
545
- const fastRowCount = methodName === DimReductionMethods.UMAP ? 5000 : 2000;
546
- if (table.rowCount > allowedRowCount) {
547
- grok.shell.warning(`Too many rows, maximum for sequence space is ${allowedRowCount}`);
548
- return;
549
- }
550
-
551
- async function getSeqSpace() {
552
- table.columns.add(DG.Column.float(embedColsNames[0], table.rowCount));
553
- table.columns.add(DG.Column.float(embedColsNames[1], table.rowCount));
554
- if (plotEmbeddings) {
555
- scatterPlot = grok.shell
556
- .tableView(table.name)
557
- .scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
558
- ui.setUpdateIndicator(scatterPlot.root, true);
559
- }
560
- let resolveF: Function | null = null;
561
-
562
- const sub = grok.events.onViewerClosed.subscribe((args) => {
563
- const v = args.args.viewer as unknown as DG.Viewer<any>;
564
- if (v?.getOptions()?.look?.title && scatterPlot?.getOptions()?.look?.title &&
565
- v?.getOptions()?.look?.title === scatterPlot?.getOptions()?.look?.title) {
566
- grok.events.fireCustomEvent(DIMENSIONALITY_REDUCER_TERMINATE_EVENT, {});
567
- sub.unsubscribe();
568
- resolveF?.();
569
- pg.close();
570
- }
571
- });
572
- const sequenceSpaceResPromise = new Promise<ISequenceSpaceResult | undefined>(async (resolve, reject) => {
573
- try {
574
- resolveF = resolve;
575
- const res = await getSequenceSpace(chemSpaceParams,
576
- options?.[BYPASS_LARGE_DATA_WARNING] ? undefined : progressFunc);
577
- resolve(res);
578
- } catch (e) {
579
- reject(e);
580
- }
581
- });
582
- const sequenceSpaceRes = await sequenceSpaceResPromise;
583
- pg.close();
584
- sub.unsubscribe();
585
- return sequenceSpaceRes ? processResult(sequenceSpaceRes) : sequenceSpaceRes;
586
- }
587
-
588
- if (table.rowCount > fastRowCount && !options?.[BYPASS_LARGE_DATA_WARNING]) {
589
- ui.dialog().add(ui.divText(`Sequence space analysis might take several minutes.
590
- Do you want to continue?`))
591
- .onOK(async () => {
592
- await getSeqSpace().catch((err: any) => {
593
- pg.close();
594
- const [errMsg, errStack] = errInfo(err);
595
- _package.logger.error(errMsg, undefined, errStack);
596
- if (scatterPlot)
597
- scatterPlot.close();
598
- });
599
- })
600
- .onCancel(() => { pg.close(); })
601
- .show();
602
- } else {
603
- return await getSeqSpace();
604
- }
605
-
606
- function processResult(sequenceSpaceRes: ISequenceSpaceResult): DG.ScatterPlotViewer | undefined {
607
- const embeddings = sequenceSpaceRes.coordinates;
608
- for (const col of embeddings) {
609
- const listValues = col.toList();
610
- emptyValsIdxs.forEach((ind: number) => listValues.splice(ind, 0, null));
611
- let embedCol = table.columns.byName(col.name);
612
- if (!embedCol) {
613
- embedCol = DG.Column.float(col.name, listValues.length);
614
- table.columns.add(embedCol);
615
- }
616
- embedCol.init((i) => listValues[i]);
617
- //table.columns.add(DG.Column.float(col.name, table.rowCount).init((i) => listValues[i]));
618
- }
619
- if (plotEmbeddings) {
620
- if (!scatterPlot) {
621
- scatterPlot = grok.shell
622
- .tableView(table.name)
623
- .scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
624
- }
625
- ui.setUpdateIndicator(scatterPlot.root, false);
626
- return scatterPlot;
627
- }
628
- }
629
- } catch (e) {
630
- console.error(e);
631
- pg.close();
632
- const [errMsg, errStack] = errInfo(e);
633
- _package.logger.error(errMsg, undefined, errStack);
634
- if (scatterPlot)
635
- (scatterPlot as unknown as DG.Viewer).close();
636
- }
637
- /* const encodedCol = encodeMonomers(macroMolecule);
638
- if (!encodedCol)
548
+ export async function sequenceSpaceTopMenu(table: DG.DataFrame, molecules: DG.Column,
549
+ methodName: DimReductionMethods, similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames,
550
+ plotEmbeddings: boolean, preprocessingFunction?: DG.Func, options?: (IUMAPOptions | ITSNEOptions) & Options,
551
+ clusterEmbeddings?: boolean): Promise<DG.ScatterPlotViewer | undefined> {
552
+ if (!checkInputColumnUI(molecules, 'Sequence Space'))
639
553
  return;
640
- const embedColsNames = getEmbeddingColsNames(table);
641
- const withoutEmptyValues = DG.DataFrame.fromColumns([encodedCol]).clone();
642
- const emptyValsIdxs = removeEmptyStringRows(withoutEmptyValues, encodedCol);
643
-
644
- const chemSpaceParams = {
645
- seqCol: withoutEmptyValues.col(encodedCol.name)!,
646
- methodName: methodName,
647
- similarityMetric: similarityMetric,
648
- embedAxesNames: embedColsNames
649
- };
650
- const sequenceSpaceRes = await sequenceSpace(chemSpaceParams);
651
- const embeddings = sequenceSpaceRes.coordinates;
652
- for (const col of embeddings) {
653
- const listValues = col.toList();
654
- emptyValsIdxs.forEach((ind: number) => listValues.splice(ind, 0, null));
655
- table.columns.add(DG.Column.fromList('double', col.name, listValues));
656
- }
657
- let sp;
658
- if (plotEmbeddings) {
659
- for (const v of grok.shell.views) {
660
- if (v.name === table.name)
661
- sp = (v as DG.TableView).scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
662
- }
663
- } */
554
+ if (!preprocessingFunction)
555
+ preprocessingFunction = DG.Func.find({name: 'macromoleculePreprocessingFunction', package: 'Bio'})[0];
556
+
557
+ const res = await reduceDimensionality(table, molecules, methodName,
558
+ similarityMetric as KnownMetrics, preprocessingFunction, plotEmbeddings, clusterEmbeddings ?? false, options, {
559
+ fastRowCount: 10000,
560
+ scatterPlotName: 'Sequence space',
561
+ bypassLargeDataWarning: options?.[BYPASS_LARGE_DATA_WARNING],
562
+ });
563
+ return res;
664
564
  }
665
565
 
666
566
  //top-menu: Bio | Convert | To Atomic Level...
@@ -1066,6 +966,7 @@ export function addCopyMenu(cell: DG.Cell, menu: DG.Menu): void {
1066
966
  //description: Sequence similarity tracking and evaluation dataset diversity
1067
967
  //meta.path: /apps/Tutorials/Demo/Bioinformatics/Similarity,%20Diversity
1068
968
  //meta.isDemoScript: True
969
+ //meta.demoSkip: GROK-14320
1069
970
  export async function demoBioSimilarityDiversity(): Promise<void> {
1070
971
  await demoBio01UI();
1071
972
  }
@@ -1076,6 +977,7 @@ export async function demoBioSimilarityDiversity(): Promise<void> {
1076
977
  //description: Exploring sequence space of Macromolecules, comparison with hierarchical clustering results
1077
978
  //meta.path: /apps/Tutorials/Demo/Bioinformatics/Sequence%20Space
1078
979
  //meta.isDemoScript: True
980
+ //meta.demoSkip: GROK-14320
1079
981
  export async function demoBioSequenceSpace(): Promise<void> {
1080
982
  await demoBio01aUI();
1081
983
  }
@@ -1086,6 +988,7 @@ export async function demoBioSequenceSpace(): Promise<void> {
1086
988
  //description: Activity Cliffs analysis on Macromolecules data
1087
989
  //meta.path: /apps/Tutorials/Demo/Bioinformatics/Activity%20Cliffs
1088
990
  //meta.isDemoScript: True
991
+ //meta.demoSkip: GROK-14320
1089
992
  export async function demoBioActivityCliffs(): Promise<void> {
1090
993
  await demoBio01bUI();
1091
994
  }
@@ -1096,6 +999,7 @@ export async function demoBioActivityCliffs(): Promise<void> {
1096
999
  //description: Atomic level structure of Macromolecules
1097
1000
  //meta.path: /apps/Tutorials/Demo/Bioinformatics/Atomic%20Level
1098
1001
  //meta.isDemoScript: True
1002
+ //meta.demoSkip: GROK-14320
1099
1003
  export async function demoBioAtomicLevel(): Promise<void> {
1100
1004
  await demoBio03UI();
1101
1005
  }
@@ -1106,6 +1010,7 @@ export async function demoBioAtomicLevel(): Promise<void> {
1106
1010
  //description: MSA and composition analysis on Helm data
1107
1011
  //meta.path: /apps/Tutorials/Demo/Bioinformatics/Helm,%20MSA,%20Sequence%20Space
1108
1012
  //meta.isDemoScript: True
1013
+ //meta.demoSkip: GROK-14320
1109
1014
  export async function demoBioHelmMsaSequenceSpace(): Promise<void> {
1110
1015
  await demoBio05UI();
1111
1016
  }
package/src/tests/pepsea-tests.ts CHANGED
@@ -16,7 +16,7 @@ category('PepSeA', () => {
16
16
  await awaitContainerStart();
17
17
  const table = DG.DataFrame.fromCsv(testCsv);
18
18
  const alignedCol = await runPepsea(table.getCol('HELM'), 'msa(HELM)');
19
- expect(alignedCol !== null, true, 'PepSeA conainter has not started');
19
+ expect(alignedCol !== null, true, 'PepSeA container has not started');
20
20
  const alignedTestCol = table.getCol('MSA');
21
21
  for (let i = 0; i < alignedCol!.length; ++i)
22
22
  expect(alignedCol!.get(i) == alignedTestCol.get(i), true);
package/src/tests/sequence-space-utils.ts CHANGED
@@ -1,9 +1,10 @@
1
1
  import * as DG from 'datagrok-api/dg';
2
2
  import * as grok from 'datagrok-api/grok';
3
3
  import {expect} from '@datagrok-libraries/utils/src/test';
4
- import {BYPASS_LARGE_DATA_WARNING, sequenceSpaceTopMenu} from '../package';
4
+ import {sequenceSpaceTopMenu} from '../package';
5
5
  import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
6
6
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
7
+ import {BYPASS_LARGE_DATA_WARNING} from '@datagrok-libraries/ml/src/functionEditors/consts';
7
8
 
8
9
  export async function _testSequenceSpaceReturnsResult(
9
10
  df: DG.DataFrame, algorithm: DimReductionMethods, colName: string,
@@ -14,7 +15,10 @@ export async function _testSequenceSpaceReturnsResult(
14
15
  if (semType)
15
16
  col.semType = semType;
16
17
 
18
+ const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
19
+ if (!preprocessingFunc)
20
+ throw new Error('Preprocessing function not found');
17
21
  const sp = await sequenceSpaceTopMenu(df, df.col(colName)!, algorithm, MmDistanceFunctionsNames.LEVENSHTEIN, true,
18
- 0.6, {[`${BYPASS_LARGE_DATA_WARNING}`]: true});
22
+ preprocessingFunc, {[BYPASS_LARGE_DATA_WARNING]: true});
19
23
  expect(sp != null, true);
20
24
  }