@datagrok/bio 2.11.13 → 2.11.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/23.js +2 -0
- package/dist/23.js.map +1 -0
- package/dist/282.js +2 -0
- package/dist/282.js.map +1 -0
- package/dist/356.js +2 -0
- package/dist/356.js.map +1 -0
- package/dist/361.js +1 -1
- package/dist/361.js.map +1 -1
- package/dist/40.js +2 -0
- package/dist/40.js.map +1 -0
- package/dist/562.js +2 -0
- package/dist/562.js.map +1 -0
- package/dist/586.js +2 -0
- package/dist/586.js.map +1 -0
- package/dist/65.js +2 -0
- package/dist/65.js.map +1 -0
- package/dist/796.js +2 -0
- package/dist/796.js.map +1 -0
- package/dist/8473fcbfb6e85ca6c852.wasm +0 -0
- package/dist/{931.js → 935.js} +3 -3
- package/dist/935.js.map +1 -0
- package/dist/9a8fbf37666e32487835.wasm +0 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +4 -4
- package/scripts/sequence_generator.py +24 -22
- package/src/demo/bio05-helm-msa-sequence-space.ts +53 -13
- package/src/demo/utils.ts +2 -1
- package/src/function-edtiors/split-to-monomers-editor.ts +2 -2
- package/src/package.ts +82 -177
- package/src/tests/pepsea-tests.ts +1 -1
- package/src/tests/sequence-space-utils.ts +6 -2
- package/src/tests/similarity-diversity-tests.ts +16 -4
- package/src/utils/cell-renderer.ts +1 -1
- package/src/utils/helm-to-molfile.ts +1 -1
- package/src/utils/monomer-lib.ts +1 -1
- package/src/utils/pepsea.ts +16 -1
- package/src/viewers/vd-regions-viewer.ts +42 -16
- package/src/viewers/web-logo-viewer.ts +40 -20
- package/dist/1.js +0 -2
- package/dist/1.js.map +0 -1
- package/dist/190.js +0 -2
- package/dist/190.js.map +0 -1
- package/dist/381.js +0 -2
- package/dist/381.js.map +0 -1
- package/dist/770.js +0 -2
- package/dist/770.js.map +0 -1
- package/dist/868.js +0 -2
- package/dist/868.js.map +0 -1
- package/dist/931.js.map +0 -1
- package/src/utils/err-info.ts +0 -28
- /package/dist/{931.js.LICENSE.txt → 935.js.LICENSE.txt} +0 -0
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"name": "Leonid Stolbov",
|
|
6
6
|
"email": "lstolbov@datagrok.ai"
|
|
7
7
|
},
|
|
8
|
-
"version": "2.11.13",
|
|
8
|
+
"version": "2.11.15",
|
|
9
9
|
"description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -34,11 +34,11 @@
|
|
|
34
34
|
],
|
|
35
35
|
"dependencies": {
|
|
36
36
|
"@biowasm/aioli": "^3.1.0",
|
|
37
|
-
"@datagrok-libraries/bio": "^5.39.
|
|
37
|
+
"@datagrok-libraries/bio": "^5.39.10",
|
|
38
38
|
"@datagrok-libraries/chem-meta": "^1.2.1",
|
|
39
|
-
"@datagrok-libraries/ml": "^6.3.
|
|
39
|
+
"@datagrok-libraries/ml": "^6.3.62",
|
|
40
40
|
"@datagrok-libraries/tutorials": "^1.3.11",
|
|
41
|
-
"@datagrok-libraries/utils": "^4.1.
|
|
41
|
+
"@datagrok-libraries/utils": "^4.1.34",
|
|
42
42
|
"cash-dom": "^8.0.0",
|
|
43
43
|
"css-loader": "^6.7.3",
|
|
44
44
|
"datagrok-api": "^1.16.0",
|
|
@@ -3,25 +3,28 @@
|
|
|
3
3
|
# description: Create the model peptides/DNA sequences with peptides data
|
|
4
4
|
# language: python
|
|
5
5
|
# tags: template, demo
|
|
6
|
-
# input: int clusters = 5
|
|
7
|
-
# input: int num_sequences = 50
|
|
8
|
-
# input: int motif_length = 12
|
|
9
|
-
# input: int max_variants_position = 3
|
|
10
|
-
# input: int random_length = 3
|
|
11
|
-
# input: int dispersion = 2
|
|
12
|
-
# input:
|
|
13
|
-
# input:
|
|
14
|
-
# input: double
|
|
15
|
-
# input:
|
|
16
|
-
# input: string fasta_separator =
|
|
6
|
+
# input: int clusters = 5 { caption: Number of clusters; category: Clusters }
|
|
7
|
+
# input: int num_sequences = 50 { caption: Number of sequences in each cluster; category: Clusters }
|
|
8
|
+
# input: int motif_length = 12 { caption: Average length of motif; category: Motif }
|
|
9
|
+
# input: int max_variants_position = 3 { caption: Maximum number of different letters in conservative position in motif; category: Motif }
|
|
10
|
+
# input: int random_length = 3 { caption: Average length of random sequence parts before and after motif; category: Motif }
|
|
11
|
+
# input: int dispersion = 2 { caption: Variation of total sequence length; category: Motif }
|
|
12
|
+
# input: bool enable_cliffs = true { caption: Enable activity cliffs; category: Activity cliffs }
|
|
13
|
+
# input: double cliff_probability = 0.01 { caption: Probability to make activity cliff of a sequence; category: Activity cliffs; format: 0.000}
|
|
14
|
+
# input: double cliff_strength = 4.0 { caption: Strength of cliff; category: Activity cliffs }
|
|
15
|
+
# input: string alphabet_key = "PT" { caption: Sequence alphabet; category: Output format; hint: PT/DNA/RNA/custom. Custom alphabet is a list of values separated by comma}
|
|
16
|
+
# input: string fasta_separator = "" { caption: Fasta format separator; nullable: true; category: Output format}
|
|
17
|
+
# input: file helm_library_file { caption: HELM library to produce HELM output; nullable: true; category: Output format}
|
|
18
|
+
# input: string helm_connection_mode = "linear" { choices: ["linear", "cyclic", "mixed"]; caption: Peptides connection mode (HELM only); category: Output format}
|
|
17
19
|
# output: dataframe sequences
|
|
18
20
|
|
|
19
|
-
|
|
20
|
-
The
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
21
|
+
|
|
22
|
+
description="""The utility generates clusters of macromolecule sequences to test SAR fucntionality.
|
|
23
|
+
Each cluster contains randomly generated sequence motif.
|
|
24
|
+
Each sequence has activity - a Gauss-distributed random value.
|
|
25
|
+
All sequences in the cluster has activities from the same distibution.
|
|
26
|
+
The utility can simulate activity cliffs - random changes in the conservative motif letters,
|
|
27
|
+
leading to drastical change in the activity.
|
|
25
28
|
"""
|
|
26
29
|
|
|
27
30
|
import random
|
|
@@ -280,8 +283,7 @@ def alphabet_from_helm(helm_library_file: str) -> Alphabet:
|
|
|
280
283
|
def parse_command_line_args() -> Any:
|
|
281
284
|
parser = argparse.ArgumentParser(
|
|
282
285
|
prog="MotifSequencesGenerator",
|
|
283
|
-
description=
|
|
284
|
-
"for SAR functionality testing",
|
|
286
|
+
description=description,
|
|
285
287
|
epilog="Utility author and support: Gennadii Zakharov <Gennadiy.Zakharov@gmail.com>",
|
|
286
288
|
)
|
|
287
289
|
|
|
@@ -386,14 +388,14 @@ if not grok:
|
|
|
386
388
|
random_length = args.random_length
|
|
387
389
|
dispersion = args.dispersion
|
|
388
390
|
alphabet_key = args.alphabet
|
|
389
|
-
|
|
391
|
+
enable_cliffs = not args.disable_cliffs
|
|
390
392
|
cliff_probability = args.cliff_probability
|
|
391
393
|
cliff_strength = args.cliff_strength
|
|
392
394
|
fasta_separator = args.fasta_separator
|
|
393
395
|
helm_library_file = args.helm_library_file
|
|
394
396
|
helm_connection_mode = args.helm_connection_mode
|
|
395
397
|
|
|
396
|
-
helm_init = "helm_library_file" in globals() and helm_library_file is not None
|
|
398
|
+
helm_init = "helm_library_file" in globals() and helm_library_file is not None and helm_library_file != ''
|
|
397
399
|
|
|
398
400
|
if not helm_init:
|
|
399
401
|
alphabet: Alphabet = (
|
|
@@ -413,7 +415,7 @@ header, data = generate_sequences(
|
|
|
413
415
|
random_length,
|
|
414
416
|
dispersion,
|
|
415
417
|
alphabet,
|
|
416
|
-
|
|
418
|
+
enable_cliffs,
|
|
417
419
|
cliff_probability,
|
|
418
420
|
cliff_strength,
|
|
419
421
|
)
|
|
@@ -2,15 +2,18 @@ import * as grok from 'datagrok-api/grok';
|
|
|
2
2
|
import * as ui from 'datagrok-api/ui';
|
|
3
3
|
import * as DG from 'datagrok-api/dg';
|
|
4
4
|
|
|
5
|
-
import {_package, sequenceSpaceTopMenu} from '../package';
|
|
6
|
-
import {handleError} from './utils';
|
|
7
|
-
|
|
8
5
|
import {IWebLogoViewer} from '@datagrok-libraries/bio/src/viewers/web-logo';
|
|
9
|
-
import {
|
|
6
|
+
import {awaitStatus, DockerContainerStatus} from '@datagrok-libraries/bio/src/utils/docker';
|
|
10
7
|
import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
11
8
|
import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
12
9
|
import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
13
10
|
|
|
11
|
+
import {Pepsea, pepseaMethods, runPepsea} from '../utils/pepsea';
|
|
12
|
+
import {sequenceSpaceTopMenu} from '../package';
|
|
13
|
+
import {handleError} from './utils';
|
|
14
|
+
|
|
15
|
+
import {_package} from '../package';
|
|
16
|
+
|
|
14
17
|
const helmFn: string = 'samples/HELM.csv';
|
|
15
18
|
|
|
16
19
|
export async function demoBio05UI(): Promise<void> {
|
|
@@ -25,36 +28,73 @@ export async function demoBio05UI(): Promise<void> {
|
|
|
25
28
|
const msaHelmColName: string = 'msa(HELM)';
|
|
26
29
|
const dimRedMethod: DimReductionMethods = DimReductionMethods.UMAP;
|
|
27
30
|
|
|
31
|
+
const pepseaDcId = (await Pepsea.getDockerContainer()).id;
|
|
32
|
+
// // region For test: Stop container to test auto-start
|
|
33
|
+
// await grok.dapi.docker.dockerContainers.stop(pepseaDcId);
|
|
34
|
+
// await Pepsea.awaitStatus(pepseaDcId, 'stopped', 15000);
|
|
35
|
+
// // endregion
|
|
36
|
+
const pepseaDcPromise: Promise<DG.DockerContainer> = Pepsea.getDockerContainer();
|
|
37
|
+
let pepseaDcStatus: DockerContainerStatus;
|
|
38
|
+
let pepseaDcStartPromise: Promise<void>;
|
|
39
|
+
|
|
28
40
|
try {
|
|
29
41
|
const demoScript = new DemoScript(
|
|
30
42
|
'Helm, MSA, Sequence Space',
|
|
31
43
|
'MSA and composition analysis on Helm data');
|
|
32
44
|
await demoScript
|
|
33
45
|
.step(`Load peptides with non-natural aminoacids in 'HELM' notation`, async () => {
|
|
34
|
-
|
|
46
|
+
[pepseaDcStatus, df] = await Promise.all([
|
|
47
|
+
(async () => { return (await pepseaDcPromise).status; })(),
|
|
48
|
+
_package.files.readCsv(helmFn)
|
|
49
|
+
]);
|
|
50
|
+
view = grok.shell.addTableView(df);
|
|
35
51
|
|
|
36
52
|
grok.shell.windows.showContextPanel = false;
|
|
37
53
|
grok.shell.windows.showProperties = false;
|
|
54
|
+
|
|
55
|
+
if (pepseaDcStatus === 'started' || pepseaDcStatus === 'checking') {
|
|
56
|
+
_package.logger.debug(
|
|
57
|
+
`demoBio05UI(), PepSeA ('${Pepsea.dcName}') docker container status = '${pepseaDcStatus}'.`);
|
|
58
|
+
pepseaDcStartPromise = Promise.resolve();
|
|
59
|
+
} else {
|
|
60
|
+
_package.logger.warning(
|
|
61
|
+
`demoBio05UI(), PepSeA ('${Pepsea.dcName}') docker container is trying to start...`);
|
|
62
|
+
|
|
63
|
+
await grok.dapi.docker.dockerContainers.run(pepseaDcId);
|
|
64
|
+
pepseaDcStartPromise = awaitStatus(pepseaDcId, 'started', 30000, _package.logger);
|
|
65
|
+
}
|
|
38
66
|
}, {
|
|
39
67
|
description: 'Load dataset with macromolecules of \'Helm\' notation.',
|
|
40
68
|
delay: 2000,
|
|
41
69
|
})
|
|
42
70
|
.step('Align peptides with non-natural aminoacids with PepSeA', async () => {
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
71
|
+
const pi = DG.TaskBarProgressIndicator.create('MSA by PepSeA ...');
|
|
72
|
+
try {
|
|
73
|
+
// TODO: Show splash if pepseaDcStartPromise is not resolved still
|
|
74
|
+
await pepseaDcStartPromise; // throws timeout
|
|
75
|
+
// Hide splash
|
|
76
|
+
|
|
77
|
+
helmCol = df.getCol(helmColName);
|
|
78
|
+
const method: string = pepseaMethods[0];
|
|
79
|
+
const gapOpen: number = 1.53;
|
|
80
|
+
const gapExtend: number = 0;
|
|
81
|
+
msaHelmCol = (await runPepsea(helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
|
|
82
|
+
if (!msaHelmCol)
|
|
83
|
+
throw new Error(`Empty MSA result.`);
|
|
84
|
+
df.columns.add(msaHelmCol);
|
|
85
|
+
await grok.data.detectSemanticTypes(df);
|
|
86
|
+
} finally {
|
|
87
|
+
pi.close();
|
|
88
|
+
}
|
|
50
89
|
}, {
|
|
51
90
|
// eslint-disable-next-line max-len
|
|
52
91
|
description: 'Multiple sequence alignment (MSA) performed with PepSeA tool operating on non-natural aminoacids as well.',
|
|
53
92
|
delay: 2000,
|
|
54
93
|
})
|
|
55
94
|
.step('Build sequence space', async () => {
|
|
95
|
+
const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
|
|
56
96
|
ssViewer = (await sequenceSpaceTopMenu(df, msaHelmCol,
|
|
57
|
-
dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN, true)) as DG.ScatterPlotViewer;
|
|
97
|
+
dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN, true, preprocessingFunc)) as DG.ScatterPlotViewer;
|
|
58
98
|
view.dockManager.dock(ssViewer, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
|
|
59
99
|
}, {
|
|
60
100
|
description: 'Reduce sequence space dimensionality to display on 2D representation.',
|
package/src/demo/utils.ts
CHANGED
|
@@ -63,8 +63,9 @@ export async function demoSequenceSpace(
|
|
|
63
63
|
'lassoTool': true,
|
|
64
64
|
})) as DG.ScatterPlotViewer;
|
|
65
65
|
} else {
|
|
66
|
+
const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
|
|
66
67
|
resSpaceViewer = (await sequenceSpaceTopMenu(df, df.getCol(colName),
|
|
67
|
-
DimReductionMethods.UMAP, MmDistanceFunctionsNames.LEVENSHTEIN, true)) as DG.ScatterPlotViewer;
|
|
68
|
+
DimReductionMethods.UMAP, MmDistanceFunctionsNames.LEVENSHTEIN, true, preprocessingFunc)) as DG.ScatterPlotViewer;
|
|
68
69
|
}
|
|
69
70
|
view.dockManager.dock(resSpaceViewer!, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
|
|
70
71
|
return resSpaceViewer;
|
|
@@ -6,7 +6,7 @@ export class SplitToMonomersFunctionEditor {
|
|
|
6
6
|
tableInput: DG.InputBase;
|
|
7
7
|
seqColInput: DG.InputBase;
|
|
8
8
|
|
|
9
|
-
funcParamsDiv:
|
|
9
|
+
funcParamsDiv: HTMLElement;
|
|
10
10
|
|
|
11
11
|
get funcParams(): {} {
|
|
12
12
|
return {
|
|
@@ -15,7 +15,7 @@ export class SplitToMonomersFunctionEditor {
|
|
|
15
15
|
};
|
|
16
16
|
}
|
|
17
17
|
|
|
18
|
-
get paramsUI():
|
|
18
|
+
get paramsUI(): HTMLElement {
|
|
19
19
|
return this.funcParamsDiv;
|
|
20
20
|
}
|
|
21
21
|
|
package/src/package.ts
CHANGED
|
@@ -9,12 +9,15 @@ import {removeEmptyStringRows} from '@datagrok-libraries/utils/src/dataframe-uti
|
|
|
9
9
|
import {Options} from '@datagrok-libraries/utils/src/type-declarations';
|
|
10
10
|
import {DimReductionMethods, ITSNEOptions, IUMAPOptions} from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
11
11
|
import {SequenceSpaceFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/seq-space-editor';
|
|
12
|
+
import {DimReductionBaseEditor, PreprocessFunctionReturnType}
|
|
13
|
+
from '@datagrok-libraries/ml/src/functionEditors/dimensionality-reduction-editor';
|
|
14
|
+
import {reduceDimensionality} from '@datagrok-libraries/ml/src/functionEditors/dimensionality-reducer';
|
|
12
15
|
import {ActivityCliffsFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/activity-cliffs-editor';
|
|
13
16
|
import {
|
|
14
17
|
ISequenceSpaceParams, getActivityCliffs, SequenceSpaceFunc, CLIFFS_COL_ENCODE_FN
|
|
15
18
|
} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
|
|
16
19
|
import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
17
|
-
import {BitArrayMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
20
|
+
import {BitArrayMetrics, KnownMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
18
21
|
import {
|
|
19
22
|
TAGS as bioTAGS, ALPHABET, NOTATION,
|
|
20
23
|
} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
@@ -27,6 +30,7 @@ import {SCORE, calculateScores} from '@datagrok-libraries/bio/src/utils/macromol
|
|
|
27
30
|
import {
|
|
28
31
|
createJsonMonomerLibFromSdf, IMonomerLibHelper
|
|
29
32
|
} from '@datagrok-libraries/bio/src/monomer-works/monomer-utils';
|
|
33
|
+
import {errInfo} from '@datagrok-libraries/bio/src/utils/err-info';
|
|
30
34
|
|
|
31
35
|
import {getMacromoleculeColumns} from './utils/ui-utils';
|
|
32
36
|
import {
|
|
@@ -74,7 +78,6 @@ import {getRegionDo} from './utils/get-region';
|
|
|
74
78
|
import {GetRegionApp} from './apps/get-region-app';
|
|
75
79
|
import {GetRegionFuncEditor} from './utils/get-region-func-editor';
|
|
76
80
|
import {sequenceToMolfile} from './utils/sequence-to-mol';
|
|
77
|
-
import {errInfo} from './utils/err-info';
|
|
78
81
|
import {detectMacromoleculeProbeDo} from './utils/detect-macromolecule-probe';
|
|
79
82
|
|
|
80
83
|
import {SHOW_SCATTERPLOT_PROGRESS} from '@datagrok-libraries/ml/src/functionEditors/seq-space-base-editor';
|
|
@@ -230,11 +233,21 @@ export function SplitToMonomersEditor(call: DG.FuncCall): void {
|
|
|
230
233
|
//tags: editor
|
|
231
234
|
//input: funccall call
|
|
232
235
|
export function SequenceSpaceEditor(call: DG.FuncCall) {
|
|
233
|
-
const funcEditor = new
|
|
236
|
+
const funcEditor = new DimReductionBaseEditor({semtype: DG.SEMTYPE.MACROMOLECULE});
|
|
234
237
|
ui.dialog({title: 'Sequence Space'})
|
|
235
|
-
.add(funcEditor.
|
|
238
|
+
.add(funcEditor.getEditor())
|
|
236
239
|
.onOK(async () => {
|
|
237
|
-
|
|
240
|
+
const params = funcEditor.getParams();
|
|
241
|
+
return call.func.prepare({
|
|
242
|
+
molecules: params.col,
|
|
243
|
+
table: params.table,
|
|
244
|
+
methodName: params.methodName,
|
|
245
|
+
similarityMetric: params.similarityMetric,
|
|
246
|
+
plotEmbeddings: params.plotEmbeddings,
|
|
247
|
+
options: params.options,
|
|
248
|
+
preprocessingFunction: params.preprocessingFunction,
|
|
249
|
+
clusterEmbeddings: params.clusterEmbeddings,
|
|
250
|
+
}).call();
|
|
238
251
|
})
|
|
239
252
|
.show();
|
|
240
253
|
}
|
|
@@ -477,6 +490,48 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
|
|
|
477
490
|
}).finally(() => { pi.close(); });
|
|
478
491
|
}
|
|
479
492
|
|
|
493
|
+
//name: Encode Sequences
|
|
494
|
+
//tags: dim-red-preprocessing-function
|
|
495
|
+
//meta.supportedSemTypes: Macromolecule
|
|
496
|
+
//meta.supportedTypes: string
|
|
497
|
+
//meta.supportedUnits: fasta,separator,helm
|
|
498
|
+
//meta.supportedDistanceFunctions: Hamming,Levenshtein,Monomer chemical distance,Needlemann-Wunsch
|
|
499
|
+
//input: column col {semType: Macromolecule}
|
|
500
|
+
//input: string metric
|
|
501
|
+
//output: object result
|
|
502
|
+
export async function macromoleculePreprocessingFunction(
|
|
503
|
+
col: DG.Column, metric: MmDistanceFunctionsNames): Promise<PreprocessFunctionReturnType> {
|
|
504
|
+
const {seqList, options} = await getEncodedSeqSpaceCol(col, metric);
|
|
505
|
+
return {entries: seqList, options};
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
//name: Helm Fingerprints
|
|
509
|
+
//tags: dim-red-preprocessing-function
|
|
510
|
+
//meta.supportedSemTypes: Macromolecule
|
|
511
|
+
//meta.supportedTypes: string
|
|
512
|
+
//meta.supportedUnits: helm
|
|
513
|
+
//meta.supportedDistanceFunctions: Tanimoto,Asymmetric,Cosine,Sokal
|
|
514
|
+
//input: column col {semType: Macromolecule}
|
|
515
|
+
//input: string _metric
|
|
516
|
+
//output: object result
|
|
517
|
+
export async function helmPreprocessingFunction(
|
|
518
|
+
col: DG.Column<string>, _metric: BitArrayMetrics): Promise<PreprocessFunctionReturnType> {
|
|
519
|
+
if (col.version !== col.temp[MONOMERIC_COL_TAGS.LAST_INVALIDATED_VERSION])
|
|
520
|
+
await invalidateMols(col, false);
|
|
521
|
+
const molCol = col.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS];
|
|
522
|
+
const fingerPrints: DG.Column<DG.BitSet | null> =
|
|
523
|
+
await grok.functions.call('Chem:getMorganFingerprints', {molColumn: molCol});
|
|
524
|
+
|
|
525
|
+
const entries: Array<BitArray | null> = new Array(fingerPrints.length).fill(null);
|
|
526
|
+
for (let i = 0; i < fingerPrints.length; i++) {
|
|
527
|
+
if (fingerPrints.isNone(i) || !fingerPrints.get(i))
|
|
528
|
+
continue;
|
|
529
|
+
const fp = fingerPrints.get(i)!;
|
|
530
|
+
entries[i] = BitArray.fromUint32Array(fp.length, new Uint32Array(fp.getBuffer().buffer));
|
|
531
|
+
}
|
|
532
|
+
return {entries, options: {}};
|
|
533
|
+
}
|
|
534
|
+
|
|
480
535
|
//top-menu: Bio | Analyze | Sequence Space...
|
|
481
536
|
//name: Sequence Space
|
|
482
537
|
//description: Creates 2D sequence space with projected sequences by pairwise distance
|
|
@@ -485,182 +540,27 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
|
|
|
485
540
|
//input: string methodName { choices:["UMAP", "t-SNE"] }
|
|
486
541
|
//input: string similarityMetric { choices:["Hamming", "Levenshtein", "Monomer chemical distance"] }
|
|
487
542
|
//input: bool plotEmbeddings = true
|
|
488
|
-
//input:
|
|
543
|
+
//input: func preprocessingFunction {optional: true}
|
|
489
544
|
//input: object options {optional: true}
|
|
545
|
+
//input: bool clusterEmbeddings = true { optional: true }
|
|
490
546
|
//output: viewer result
|
|
491
547
|
//editor: Bio:SequenceSpaceEditor
|
|
492
|
-
export async function sequenceSpaceTopMenu(
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
// Delay is required for initial function dialog to close before starting invalidating of molfiles.
|
|
498
|
-
// Otherwise, dialog is freezing
|
|
499
|
-
await delay(10);
|
|
500
|
-
if (!checkInputColumnUI(macroMolecule, 'Sequence space')) return;
|
|
501
|
-
let scatterPlot: DG.ScatterPlotViewer | undefined = undefined;
|
|
502
|
-
const pg = DG.TaskBarProgressIndicator.create('Initializing sequence space ...');
|
|
503
|
-
// function for progress of umap
|
|
504
|
-
try {
|
|
505
|
-
function progressFunc(_nEpoch: number, epochsLength: number, embeddings: number[][]) {
|
|
506
|
-
let embedXCol: DG.Column | null = null;
|
|
507
|
-
let embedYCol: DG.Column | null = null;
|
|
508
|
-
if (!table.columns.names().includes(embedColsNames[0])) {
|
|
509
|
-
embedXCol = table.columns.add(DG.Column.float(embedColsNames[0], table.rowCount));
|
|
510
|
-
embedYCol = table.columns.add(DG.Column.float(embedColsNames[1], table.rowCount));
|
|
511
|
-
if (plotEmbeddings) {
|
|
512
|
-
scatterPlot = grok.shell
|
|
513
|
-
.tableView(table.name)
|
|
514
|
-
.scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
|
|
515
|
-
}
|
|
516
|
-
} else {
|
|
517
|
-
embedXCol = table.columns.byName(embedColsNames[0]);
|
|
518
|
-
embedYCol = table.columns.byName(embedColsNames[1]);
|
|
519
|
-
}
|
|
520
|
-
|
|
521
|
-
if (options?.[SHOW_SCATTERPLOT_PROGRESS]) {
|
|
522
|
-
scatterPlot?.root && ui.setUpdateIndicator(scatterPlot!.root, false);
|
|
523
|
-
embedXCol.init((i) => embeddings[i] ? embeddings[i][0] : undefined);
|
|
524
|
-
embedYCol.init((i) => embeddings[i] ? embeddings[i][1] : undefined);
|
|
525
|
-
}
|
|
526
|
-
const progress = (_nEpoch / epochsLength * 100);
|
|
527
|
-
pg.update(progress, `Running sequence space ... ${progress.toFixed(0)}%`);
|
|
528
|
-
}
|
|
529
|
-
|
|
530
|
-
const embedColsNames = getEmbeddingColsNames(table);
|
|
531
|
-
const withoutEmptyValues = DG.DataFrame.fromColumns([macroMolecule]).clone();
|
|
532
|
-
const emptyValsIdxs = removeEmptyStringRows(withoutEmptyValues, macroMolecule);
|
|
533
|
-
|
|
534
|
-
const chemSpaceParams: ISequenceSpaceParams = {
|
|
535
|
-
seqCol: withoutEmptyValues.col(macroMolecule.name)!,
|
|
536
|
-
methodName: methodName,
|
|
537
|
-
similarityMetric: similarityMetric,
|
|
538
|
-
embedAxesNames: embedColsNames,
|
|
539
|
-
options: {...options, sparseMatrixThreshold: sparseMatrixThreshold ?? 0.5,
|
|
540
|
-
usingSparseMatrix: table.rowCount > 20000},
|
|
541
|
-
};
|
|
542
|
-
|
|
543
|
-
const allowedRowCount = methodName === DimReductionMethods.UMAP ? 500000 : 15000;
|
|
544
|
-
// number of rows which will be processed relatively fast
|
|
545
|
-
const fastRowCount = methodName === DimReductionMethods.UMAP ? 5000 : 2000;
|
|
546
|
-
if (table.rowCount > allowedRowCount) {
|
|
547
|
-
grok.shell.warning(`Too many rows, maximum for sequence space is ${allowedRowCount}`);
|
|
548
|
-
return;
|
|
549
|
-
}
|
|
550
|
-
|
|
551
|
-
async function getSeqSpace() {
|
|
552
|
-
table.columns.add(DG.Column.float(embedColsNames[0], table.rowCount));
|
|
553
|
-
table.columns.add(DG.Column.float(embedColsNames[1], table.rowCount));
|
|
554
|
-
if (plotEmbeddings) {
|
|
555
|
-
scatterPlot = grok.shell
|
|
556
|
-
.tableView(table.name)
|
|
557
|
-
.scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
|
|
558
|
-
ui.setUpdateIndicator(scatterPlot.root, true);
|
|
559
|
-
}
|
|
560
|
-
let resolveF: Function | null = null;
|
|
561
|
-
|
|
562
|
-
const sub = grok.events.onViewerClosed.subscribe((args) => {
|
|
563
|
-
const v = args.args.viewer as unknown as DG.Viewer<any>;
|
|
564
|
-
if (v?.getOptions()?.look?.title && scatterPlot?.getOptions()?.look?.title &&
|
|
565
|
-
v?.getOptions()?.look?.title === scatterPlot?.getOptions()?.look?.title) {
|
|
566
|
-
grok.events.fireCustomEvent(DIMENSIONALITY_REDUCER_TERMINATE_EVENT, {});
|
|
567
|
-
sub.unsubscribe();
|
|
568
|
-
resolveF?.();
|
|
569
|
-
pg.close();
|
|
570
|
-
}
|
|
571
|
-
});
|
|
572
|
-
const sequenceSpaceResPromise = new Promise<ISequenceSpaceResult | undefined>(async (resolve, reject) => {
|
|
573
|
-
try {
|
|
574
|
-
resolveF = resolve;
|
|
575
|
-
const res = await getSequenceSpace(chemSpaceParams,
|
|
576
|
-
options?.[BYPASS_LARGE_DATA_WARNING] ? undefined : progressFunc);
|
|
577
|
-
resolve(res);
|
|
578
|
-
} catch (e) {
|
|
579
|
-
reject(e);
|
|
580
|
-
}
|
|
581
|
-
});
|
|
582
|
-
const sequenceSpaceRes = await sequenceSpaceResPromise;
|
|
583
|
-
pg.close();
|
|
584
|
-
sub.unsubscribe();
|
|
585
|
-
return sequenceSpaceRes ? processResult(sequenceSpaceRes) : sequenceSpaceRes;
|
|
586
|
-
}
|
|
587
|
-
|
|
588
|
-
if (table.rowCount > fastRowCount && !options?.[BYPASS_LARGE_DATA_WARNING]) {
|
|
589
|
-
ui.dialog().add(ui.divText(`Sequence space analysis might take several minutes.
|
|
590
|
-
Do you want to continue?`))
|
|
591
|
-
.onOK(async () => {
|
|
592
|
-
await getSeqSpace().catch((err: any) => {
|
|
593
|
-
pg.close();
|
|
594
|
-
const [errMsg, errStack] = errInfo(err);
|
|
595
|
-
_package.logger.error(errMsg, undefined, errStack);
|
|
596
|
-
if (scatterPlot)
|
|
597
|
-
scatterPlot.close();
|
|
598
|
-
});
|
|
599
|
-
})
|
|
600
|
-
.onCancel(() => { pg.close(); })
|
|
601
|
-
.show();
|
|
602
|
-
} else {
|
|
603
|
-
return await getSeqSpace();
|
|
604
|
-
}
|
|
605
|
-
|
|
606
|
-
function processResult(sequenceSpaceRes: ISequenceSpaceResult): DG.ScatterPlotViewer | undefined {
|
|
607
|
-
const embeddings = sequenceSpaceRes.coordinates;
|
|
608
|
-
for (const col of embeddings) {
|
|
609
|
-
const listValues = col.toList();
|
|
610
|
-
emptyValsIdxs.forEach((ind: number) => listValues.splice(ind, 0, null));
|
|
611
|
-
let embedCol = table.columns.byName(col.name);
|
|
612
|
-
if (!embedCol) {
|
|
613
|
-
embedCol = DG.Column.float(col.name, listValues.length);
|
|
614
|
-
table.columns.add(embedCol);
|
|
615
|
-
}
|
|
616
|
-
embedCol.init((i) => listValues[i]);
|
|
617
|
-
//table.columns.add(DG.Column.float(col.name, table.rowCount).init((i) => listValues[i]));
|
|
618
|
-
}
|
|
619
|
-
if (plotEmbeddings) {
|
|
620
|
-
if (!scatterPlot) {
|
|
621
|
-
scatterPlot = grok.shell
|
|
622
|
-
.tableView(table.name)
|
|
623
|
-
.scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
|
|
624
|
-
}
|
|
625
|
-
ui.setUpdateIndicator(scatterPlot.root, false);
|
|
626
|
-
return scatterPlot;
|
|
627
|
-
}
|
|
628
|
-
}
|
|
629
|
-
} catch (e) {
|
|
630
|
-
console.error(e);
|
|
631
|
-
pg.close();
|
|
632
|
-
const [errMsg, errStack] = errInfo(e);
|
|
633
|
-
_package.logger.error(errMsg, undefined, errStack);
|
|
634
|
-
if (scatterPlot)
|
|
635
|
-
(scatterPlot as unknown as DG.Viewer).close();
|
|
636
|
-
}
|
|
637
|
-
/* const encodedCol = encodeMonomers(macroMolecule);
|
|
638
|
-
if (!encodedCol)
|
|
548
|
+
export async function sequenceSpaceTopMenu(table: DG.DataFrame, molecules: DG.Column,
|
|
549
|
+
methodName: DimReductionMethods, similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames,
|
|
550
|
+
plotEmbeddings: boolean, preprocessingFunction?: DG.Func, options?: (IUMAPOptions | ITSNEOptions) & Options,
|
|
551
|
+
clusterEmbeddings?: boolean): Promise<DG.ScatterPlotViewer | undefined> {
|
|
552
|
+
if (!checkInputColumnUI(molecules, 'Sequence Space'))
|
|
639
553
|
return;
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
const sequenceSpaceRes = await sequenceSpace(chemSpaceParams);
|
|
651
|
-
const embeddings = sequenceSpaceRes.coordinates;
|
|
652
|
-
for (const col of embeddings) {
|
|
653
|
-
const listValues = col.toList();
|
|
654
|
-
emptyValsIdxs.forEach((ind: number) => listValues.splice(ind, 0, null));
|
|
655
|
-
table.columns.add(DG.Column.fromList('double', col.name, listValues));
|
|
656
|
-
}
|
|
657
|
-
let sp;
|
|
658
|
-
if (plotEmbeddings) {
|
|
659
|
-
for (const v of grok.shell.views) {
|
|
660
|
-
if (v.name === table.name)
|
|
661
|
-
sp = (v as DG.TableView).scatterPlot({x: embedColsNames[0], y: embedColsNames[1], title: 'Sequence space'});
|
|
662
|
-
}
|
|
663
|
-
} */
|
|
554
|
+
if (!preprocessingFunction)
|
|
555
|
+
preprocessingFunction = DG.Func.find({name: 'macromoleculePreprocessingFunction', package: 'Bio'})[0];
|
|
556
|
+
|
|
557
|
+
const res = await reduceDimensionality(table, molecules, methodName,
|
|
558
|
+
similarityMetric as KnownMetrics, preprocessingFunction, plotEmbeddings, clusterEmbeddings ?? false, options, {
|
|
559
|
+
fastRowCount: 10000,
|
|
560
|
+
scatterPlotName: 'Sequence space',
|
|
561
|
+
bypassLargeDataWarning: options?.[BYPASS_LARGE_DATA_WARNING],
|
|
562
|
+
});
|
|
563
|
+
return res;
|
|
664
564
|
}
|
|
665
565
|
|
|
666
566
|
//top-menu: Bio | Convert | To Atomic Level...
|
|
@@ -1066,6 +966,7 @@ export function addCopyMenu(cell: DG.Cell, menu: DG.Menu): void {
|
|
|
1066
966
|
//description: Sequence similarity tracking and evaluation dataset diversity
|
|
1067
967
|
//meta.path: /apps/Tutorials/Demo/Bioinformatics/Similarity,%20Diversity
|
|
1068
968
|
//meta.isDemoScript: True
|
|
969
|
+
//meta.demoSkip: GROK-14320
|
|
1069
970
|
export async function demoBioSimilarityDiversity(): Promise<void> {
|
|
1070
971
|
await demoBio01UI();
|
|
1071
972
|
}
|
|
@@ -1076,6 +977,7 @@ export async function demoBioSimilarityDiversity(): Promise<void> {
|
|
|
1076
977
|
//description: Exploring sequence space of Macromolecules, comparison with hierarchical clustering results
|
|
1077
978
|
//meta.path: /apps/Tutorials/Demo/Bioinformatics/Sequence%20Space
|
|
1078
979
|
//meta.isDemoScript: True
|
|
980
|
+
//meta.demoSkip: GROK-14320
|
|
1079
981
|
export async function demoBioSequenceSpace(): Promise<void> {
|
|
1080
982
|
await demoBio01aUI();
|
|
1081
983
|
}
|
|
@@ -1086,6 +988,7 @@ export async function demoBioSequenceSpace(): Promise<void> {
|
|
|
1086
988
|
//description: Activity Cliffs analysis on Macromolecules data
|
|
1087
989
|
//meta.path: /apps/Tutorials/Demo/Bioinformatics/Activity%20Cliffs
|
|
1088
990
|
//meta.isDemoScript: True
|
|
991
|
+
//meta.demoSkip: GROK-14320
|
|
1089
992
|
export async function demoBioActivityCliffs(): Promise<void> {
|
|
1090
993
|
await demoBio01bUI();
|
|
1091
994
|
}
|
|
@@ -1096,6 +999,7 @@ export async function demoBioActivityCliffs(): Promise<void> {
|
|
|
1096
999
|
//description: Atomic level structure of Macromolecules
|
|
1097
1000
|
//meta.path: /apps/Tutorials/Demo/Bioinformatics/Atomic%20Level
|
|
1098
1001
|
//meta.isDemoScript: True
|
|
1002
|
+
//meta.demoSkip: GROK-14320
|
|
1099
1003
|
export async function demoBioAtomicLevel(): Promise<void> {
|
|
1100
1004
|
await demoBio03UI();
|
|
1101
1005
|
}
|
|
@@ -1106,6 +1010,7 @@ export async function demoBioAtomicLevel(): Promise<void> {
|
|
|
1106
1010
|
//description: MSA and composition analysis on Helm data
|
|
1107
1011
|
//meta.path: /apps/Tutorials/Demo/Bioinformatics/Helm,%20MSA,%20Sequence%20Space
|
|
1108
1012
|
//meta.isDemoScript: True
|
|
1013
|
+
//meta.demoSkip: GROK-14320
|
|
1109
1014
|
export async function demoBioHelmMsaSequenceSpace(): Promise<void> {
|
|
1110
1015
|
await demoBio05UI();
|
|
1111
1016
|
}
|
|
@@ -16,7 +16,7 @@ category('PepSeA', () => {
|
|
|
16
16
|
await awaitContainerStart();
|
|
17
17
|
const table = DG.DataFrame.fromCsv(testCsv);
|
|
18
18
|
const alignedCol = await runPepsea(table.getCol('HELM'), 'msa(HELM)');
|
|
19
|
-
expect(alignedCol !== null, true, 'PepSeA
|
|
19
|
+
expect(alignedCol !== null, true, 'PepSeA container has not started');
|
|
20
20
|
const alignedTestCol = table.getCol('MSA');
|
|
21
21
|
for (let i = 0; i < alignedCol!.length; ++i)
|
|
22
22
|
expect(alignedCol!.get(i) == alignedTestCol.get(i), true);
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import * as DG from 'datagrok-api/dg';
|
|
2
2
|
import * as grok from 'datagrok-api/grok';
|
|
3
3
|
import {expect} from '@datagrok-libraries/utils/src/test';
|
|
4
|
-
import {
|
|
4
|
+
import {sequenceSpaceTopMenu} from '../package';
|
|
5
5
|
import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
6
6
|
import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
7
|
+
import {BYPASS_LARGE_DATA_WARNING} from '@datagrok-libraries/ml/src/functionEditors/consts';
|
|
7
8
|
|
|
8
9
|
export async function _testSequenceSpaceReturnsResult(
|
|
9
10
|
df: DG.DataFrame, algorithm: DimReductionMethods, colName: string,
|
|
@@ -14,7 +15,10 @@ export async function _testSequenceSpaceReturnsResult(
|
|
|
14
15
|
if (semType)
|
|
15
16
|
col.semType = semType;
|
|
16
17
|
|
|
18
|
+
const preprocessingFunc = DG.Func.find({package: 'Bio', name: 'macromoleculePreprocessingFunction'})[0];
|
|
19
|
+
if (!preprocessingFunc)
|
|
20
|
+
throw new Error('Preprocessing function not found');
|
|
17
21
|
const sp = await sequenceSpaceTopMenu(df, df.col(colName)!, algorithm, MmDistanceFunctionsNames.LEVENSHTEIN, true,
|
|
18
|
-
|
|
22
|
+
preprocessingFunc, {[BYPASS_LARGE_DATA_WARNING]: true});
|
|
19
23
|
expect(sp != null, true);
|
|
20
24
|
}
|