@datagrok/bio 2.4.12 → 2.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/package.json +6 -5
- package/scripts/motif_generator.py +119 -0
- package/src/demo/bio01-similarity-diversity.ts +40 -29
- package/src/demo/bio01a-hierarchical-clustering-and-sequence-space.ts +51 -40
- package/src/demo/bio01b-hierarchical-clustering-and-activity-cliffs.ts +71 -62
- package/src/demo/bio05-helm-msa-sequence-space.ts +43 -34
- package/src/demo/utils.ts +7 -13
- package/src/package.ts +8 -4
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"name": "Leonid Stolbov",
|
|
6
6
|
"email": "lstolbov@datagrok.ai"
|
|
7
7
|
},
|
|
8
|
-
"version": "2.4.12",
|
|
8
|
+
"version": "2.4.13",
|
|
9
9
|
"description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
"@datagrok-libraries/bio": "^5.28.4",
|
|
18
18
|
"@datagrok-libraries/chem-meta": "^1.0.1",
|
|
19
19
|
"@datagrok-libraries/ml": "^6.3.16",
|
|
20
|
+
"@datagrok-libraries/tutorials": "^1.2.1",
|
|
20
21
|
"@datagrok-libraries/utils": "^2.1.3",
|
|
21
22
|
"cash-dom": "^8.0.0",
|
|
22
23
|
"css-loader": "^6.7.3",
|
|
@@ -32,12 +33,12 @@
|
|
|
32
33
|
"devDependencies": {
|
|
33
34
|
"@types/node": "^17.0.24",
|
|
34
35
|
"@types/wu": "latest",
|
|
35
|
-
"@typescript-eslint/eslint-plugin": "
|
|
36
|
-
"@typescript-eslint/parser": "
|
|
37
|
-
"eslint": "
|
|
36
|
+
"@typescript-eslint/eslint-plugin": "latest",
|
|
37
|
+
"@typescript-eslint/parser": "latest",
|
|
38
|
+
"eslint": "latest",
|
|
38
39
|
"eslint-config-google": "latest",
|
|
39
40
|
"ts-loader": "^9.2.5",
|
|
40
|
-
"typescript": "^
|
|
41
|
+
"typescript": "^5.0.4",
|
|
41
42
|
"webpack": "^5.76.0",
|
|
42
43
|
"webpack-bundle-analyzer": "latest",
|
|
43
44
|
"webpack-cli": "^4.6.0",
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
from math import sqrt
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from typing import List, Tuple
|
|
9
|
+
|
|
10
|
+
letter_choice_type = List[str]
|
|
11
|
+
motif_template_type = List[letter_choice_type]
|
|
12
|
+
|
|
13
|
+
default_alphabet = 'A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y'
|
|
14
|
+
|
|
15
|
+
def meanrange(mean: int, disp: int) -> int:
    """Draw a random integer from the closed interval [mean - disp, mean + disp].

    Args:
        mean: centre of the interval.
        disp: half-width of the interval; both ends are inclusive.

    Returns:
        The sampled integer.
    """
    lower_bound = mean - disp
    upper_bound = mean + disp
    return random.randint(lower_bound, upper_bound)
|
|
17
|
+
|
|
18
|
+
def generate_modif_template(motif_length: int, alphabet: List[str], max_variants_cluster: int, prob_any: float = 0.2) -> motif_template_type:
    """Build a random motif template from which concrete motifs are generated.

    The template is a list with one entry per motif position. Each entry is
    either ``['?']`` (any letter of the alphabet is allowed there) or a list of
    distinct letters allowed at that position.

    Args:
        motif_length: number of positions in the template; a value <= 0 yields
            an empty template.
        alphabet: letters to draw from.
        max_variants_cluster: upper bound on the number of letters drawn for a
            single position (at least one letter is always drawn).
        prob_any: probability that an inner position becomes the ``'?'``
            wildcard. The first and the last position are never wildcards.

    Returns:
        The generated template.
    """
    motif_template = []
    for position in range(motif_length):
        # Only inner positions may become a wildcard, so a motif always
        # starts and ends with a conservative position.
        is_inner = 0 < position < motif_length - 1
        if is_inner and random.random() <= prob_any:
            letters = ['?']  # this stands for any symbol
        else:
            n_variants = random.randrange(max_variants_cluster) + 1
            drawn = [random.choice(alphabet) for _ in range(n_variants)]
            # Letters are drawn with replacement; drop repeats (keeping the
            # draw order) so a position never reads like "[AA]" and every
            # allowed letter is equally likely when motifs are generated.
            letters = list(dict.fromkeys(drawn))
        motif_template.append(letters)
    return motif_template
|
|
29
|
+
|
|
30
|
+
def generate_motif(template: motif_template_type, alphabet: List[str]) -> str:
    """Produce one concrete motif string from a motif template.

    Args:
        template: per-position lists of allowed letters; a list containing
            ``'?'`` means that any letter of ``alphabet`` may be used.
        alphabet: letters substituted for wildcard positions.

    Returns:
        A string with one randomly chosen letter per template position.
    """
    # Resolve wildcards first, then sample, so random numbers are consumed
    # strictly in position order.
    pools = []
    for letters in template:
        if '?' in letters:
            pools.append(alphabet)
        else:
            pools.append(letters)

    picked = []
    for pool in pools:
        picked.append(random.choice(pool))
    return ''.join(picked)
|
|
34
|
+
|
|
35
|
+
def motif_notation(motif_template: motif_template_type) -> str:
    """Render a motif template as a compact, regex-like string.

    A position with exactly one entry is written as that entry itself (so a
    wildcard position is written as ``?``); any other position is written as
    its letters enclosed in square brackets, e.g. ``[CD]``.

    Args:
        motif_template: per-position lists of allowed letters.

    Returns:
        The concatenated notation of all positions.
    """
    codes = []
    for letter_choice in motif_template:
        if len(letter_choice) == 1:
            codes.append(letter_choice[0])
        else:
            codes.append(f"[{''.join(letter_choice)}]")
    return ''.join(codes)
|
|
43
|
+
|
|
44
|
+
def generate_random(n: int, alphabet: List[str]) -> str:
    """Return a string of ``n`` letters, each picked independently from ``alphabet``.

    Args:
        n: number of letters to generate; zero or a negative value yields ''.
        alphabet: letters to pick from.

    Returns:
        The generated string.
    """
    letters = []
    for _ in range(n):
        letters.append(random.choice(alphabet))
    return ''.join(letters)
|
|
46
|
+
|
|
47
|
+
def make_cliff(motif_template: motif_template_type, alphabet: List[str], motif: str) -> str:
    """Return a copy of ``motif`` with one conservative position mutated.

    A random start position is drawn, then positions are scanned forward
    (wrapping around) until one is found that is not a wildcard and has at
    least one alphabet letter outside its allowed set. The letter of ``motif``
    at that position is replaced with one of those outside letters.

    Args:
        motif_template: per-position lists of allowed letters; a list
            containing ``'?'`` marks a wildcard position.
        alphabet: all letters that may appear in a sequence.
        motif: a motif string generated from ``motif_template``.

    Returns:
        The mutated motif, same length as ``motif``.

    Raises:
        ValueError: if the template is empty or no position can be mutated
            (every position is a wildcard or already allows the whole alphabet).
    """
    n_positions = len(motif_template)
    if n_positions == 0:
        raise ValueError("Cannot make a cliff: the motif template is empty")

    start = random.randrange(n_positions)
    # Visit every position at most once; an unbounded scan would never end
    # for a template that consists of wildcards only.
    for offset in range(n_positions):
        pos = (start + offset) % n_positions
        allowed = motif_template[pos]
        if '?' in allowed:
            continue
        # Sorted so the candidate order does not depend on string hashing,
        # which keeps the result reproducible for a seeded generator.
        outlier_letters = sorted(set(alphabet) - set(allowed))
        if not outlier_letters:
            # This position already allows every letter; try the next one.
            continue
        return motif[:pos] + random.choice(outlier_letters) + motif[pos + 1:]

    raise ValueError("Cannot make a cliff: no conservative position with a letter outside its allowed set")
|
|
54
|
+
|
|
55
|
+
# ====================================================================================
|
|
56
|
+
|
|
57
|
+
# Command-line interface of the generator. Every option has a default, so the
# script can be run without arguments.
parser = argparse.ArgumentParser(
    prog='MotifSequencesGenerator',
    description='The program generates set of sequences containing sequence motifs for SAR functionality testing',
    epilog='Utility support: Gennadii Zakharov ')

# What to generate: alphabet, number of clusters and sequences per cluster.
parser.add_argument("-a", "--alphabet", type=str, default=default_alphabet, help="Alphabet to generate sequences, separated by comma")
parser.add_argument("-c", "--clusters", type=int, default=1, help="Number of clusters")
parser.add_argument("-s", "--sequences", type=int, default=500, help="Number of sequences in each cluster")

# Sequence geometry. The short flags are plain "-m", "-r" and "-d": a comma
# inside the option string would become part of the flag itself.
parser.add_argument("-m", "--motif", type=int, default=12, help="Average length of motif")
parser.add_argument("-r", "--random", type=int, default=4, help="Average length of random sequence parts before and after motif")
parser.add_argument("-d", "--dispersion", type=int, default=2, help="Variation of total sequence lengths")

# Motif variability and activity-cliff tuning.
parser.add_argument("--max-variants-position", type=int, default=3, help="maximum number of different letters in motif position")
parser.add_argument("--cliff-probability", type=float, default=0.01, help="Probability to make activity cliff of a sequence")
parser.add_argument("--cliff-strength", type=float, default=4.0, help="Strength of cliff")

args = parser.parse_args()
|
|
73
|
+
|
|
74
|
+
# The alphabet arrives as one comma-separated string.
alphabet: List[str] = args.alphabet.split(',')

# Tab-separated table on stdout; diagnostics (motif templates, cliffs) go to stderr.
print('cluster\tsequence_id\tsequence\tactivity\tis_cliff')

# Running counter used to build sequence ids; it continues across clusters.
line_number = 0

for n_cluster in range(args.clusters):
    # Every cluster gets its own activity level and spread.
    activity_average = random.random() * 10
    activity_dispersion = random.random()

    # Generating motif template for cluster.
    # A motif needs at least one position: with a dispersion larger than the
    # mean the sampled length can be <= 0, which would leave nothing to mutate
    # when a cliff is made.
    motif_length = max(1, meanrange(args.motif, args.dispersion))
    motif_template = generate_modif_template(motif_length, alphabet, args.max_variants_position)
    sys.stderr.write(f"Cluster {n_cluster:2} motif template: {motif_notation(motif_template)}\n")

    # All sequences of a cluster share the same layout: prefix + motif + suffix.
    total_length = meanrange(args.random * 2, args.dispersion) + motif_length
    prefix_length = meanrange(args.random, args.dispersion // 2)
    suffix_length = total_length - motif_length - prefix_length

    for n_seq in range(args.sequences):
        line_number += 1
        activity = random.gauss(activity_average, activity_dispersion)

        motif = generate_motif(motif_template, alphabet)
        prefix = generate_random(prefix_length, alphabet)
        suffix = generate_random(suffix_length, alphabet)
        seq = prefix + motif + suffix

        is_cliff = random.random() <= args.cliff_probability
        if is_cliff:
            # Making activity cliff: a twin sequence that differs from `seq`
            # in one conservative motif letter.
            cliff_motif = make_cliff(motif_template, alphabet, motif)
            cliff_seq = prefix + cliff_motif + suffix
            # Recalculating activity: the pair is placed symmetrically around
            # the cluster average, `cliff_disp` away on each side.
            cliff_disp = activity_dispersion * args.cliff_strength * (0.5 + random.random())
            activity = activity_average - cliff_disp
            cliff_activity = activity_average + cliff_disp

            sys.stderr.write(f"Cliff for sequence #{line_number:4}, cluster {n_cluster} \n")
            sys.stderr.write(f"{activity_average}\t{motif}\t{activity}\n")
            sys.stderr.write(f"{activity_average}\t{cliff_motif}\t{cliff_activity}\n")
            # The mutated twin is written first and takes the current id;
            # the original sequence below takes the next one.
            print(f"{n_cluster}\tc{n_cluster}_seq{line_number}\t{cliff_seq}\t{cliff_activity:5.2f}\t{is_cliff}")
            line_number += 1
        print(f"{n_cluster}\tc{n_cluster}_seq{line_number}\t{seq}\t{activity:5.2f}\t{is_cliff}")
|
|
119
|
+
|
|
@@ -3,43 +3,54 @@ import * as ui from 'datagrok-api/ui';
|
|
|
3
3
|
import * as DG from 'datagrok-api/dg';
|
|
4
4
|
|
|
5
5
|
import {_package} from '../package';
|
|
6
|
+
import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
6
7
|
import {delay} from '@datagrok-libraries/utils/src/test';
|
|
7
|
-
import {
|
|
8
|
+
import {handleError} from './utils';
|
|
8
9
|
|
|
9
10
|
const dataFn = 'data/sample_FASTA_DNA.csv';
|
|
10
11
|
|
|
11
|
-
export async function demoBio01UI(
|
|
12
|
+
/**
 * Runs the 'Sequence similarity / diversity search' demo as a DemoScript.
 *
 * The script loads the sample DNA table, adds it as a table view, docks a
 * 'Sequence Similarity Search' viewer to the right and a 'Sequence Diversity
 * Search' viewer at the bottom, and then moves the current row to 3 and to 7.
 * Any error thrown while the script is awaited is passed to {@link handleError}.
 */
export async function demoBio01UI(): Promise<void> {
  // Assigned by the first step and shared with the following ones.
  let view: DG.TableView;
  let df: DG.DataFrame;

  try {
    const demoScript = new DemoScript('Demo', 'Sequence similarity / diversity search');
    await demoScript
      .step(`Loading DNA notation 'fasta'`, async () => {
        df = await _package.files.readCsv(dataFn);
        view = grok.shell.addTableView(df);
      }, {
        description: `Load dataset with macromolecules of 'fasta' notation, 'DNA' alphabet.`,
        delay: 1600
      })
      .step('Sequence similarity search', async () => {
        const simViewer = await df.plot.fromType('Sequence Similarity Search') as DG.Viewer;
        view.dockManager.dock(simViewer, DG.DOCK_TYPE.RIGHT, null, 'Similarity search', 0.35);
      }, {
        description: `Add 'Sequence Similarity Search' viewer.`,
        delay: 1600
      })
      .step('Sequence diversity search', async () => {
        const divViewer = await df.plot.fromType('Sequence Diversity Search') as DG.Viewer;
        view.dockManager.dock(divViewer, DG.DOCK_TYPE.DOWN, null, 'Diversity search', 0.27);
      }, {
        description: `Add 'Sequence Diversity Search' viewer.`,
        delay: 1600
      })
      .step('Set current row 3', async () => {
        df.currentRowIdx = 3;
      }, {
        description: 'Handling current row changed of data frame showing update of similar sequences.',
        delay: 1600,
      })
      .step('Set current row 7', async () => {
        df.currentRowIdx = 7;
      }, {
        description: 'Changing current row to another.',
        delay: 1600,
      })
      .start();
  } catch (err: any) {
    handleError(err);
  }
}
|
|
@@ -8,12 +8,13 @@ import * as lev from 'fastest-levenshtein';
|
|
|
8
8
|
import {DistanceMatrix} from '@datagrok-libraries/bio/src/trees/distance-matrix';
|
|
9
9
|
import {getTreeHelper, ITreeHelper} from '@datagrok-libraries/bio/src/trees/tree-helper';
|
|
10
10
|
import {getDendrogramService, IDendrogramService} from '@datagrok-libraries/bio/src/trees/dendrogram';
|
|
11
|
-
import {demoSequenceSpace,
|
|
11
|
+
import {demoSequenceSpace, handleError} from './utils';
|
|
12
|
+
import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
12
13
|
|
|
13
14
|
const dataFn = 'data/sample_FASTA_DNA.csv';
|
|
14
15
|
const seqColName = 'sequence';
|
|
15
16
|
|
|
16
|
-
export async function demoBio01aUI(
|
|
17
|
+
export async function demoBio01aUI() {
|
|
17
18
|
let treeHelper: ITreeHelper;
|
|
18
19
|
let dendrogramSvc: IDendrogramService;
|
|
19
20
|
let view: DG.TableView;
|
|
@@ -25,44 +26,54 @@ export async function demoBio01aUI(funcPath: string) {
|
|
|
25
26
|
const embedCols: { [colName: string]: DG.Column<number> } = {};
|
|
26
27
|
|
|
27
28
|
try {
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
29
|
+
const demoScript = new DemoScript('Demo', 'Exploring sequence space');
|
|
30
|
+
await demoScript
|
|
31
|
+
.step(`Loading DNA notation 'fasta'`, async () => {
|
|
32
|
+
[df, treeHelper, dendrogramSvc] = await Promise.all([
|
|
33
|
+
_package.files.readCsv(dataFn),
|
|
34
|
+
getTreeHelper(),
|
|
35
|
+
getDendrogramService()
|
|
36
|
+
]);
|
|
37
|
+
view = grok.shell.addTableView(df);
|
|
38
|
+
view.grid.props.rowHeight = 22;
|
|
39
|
+
}, {
|
|
40
|
+
description: `Load dataset with macromolecules of 'fasta' notation, 'DNA' alphabet.`,
|
|
41
|
+
delay: 1600,
|
|
42
|
+
})
|
|
43
|
+
.step('Building sequence space', async () => {
|
|
44
|
+
spViewer = await demoSequenceSpace(view, df, seqColName, method);
|
|
45
|
+
}, {
|
|
46
|
+
description: `Reduce sequence space dimensionality to display on 2D representation.`,
|
|
47
|
+
delay: 1600
|
|
48
|
+
})
|
|
49
|
+
.step('Hierarchical clustering', async () => {
|
|
50
|
+
const seqCol: DG.Column<string> = df.getCol(seqColName);
|
|
51
|
+
const seqList = seqCol.toList();
|
|
52
|
+
const distance: DistanceMatrix = DistanceMatrix.calc(seqList, (aSeq: string, bSeq: string) => {
|
|
53
|
+
const levDistance = lev.distance(aSeq, bSeq);
|
|
54
|
+
return levDistance / ((aSeq.length + bSeq.length) / 2);
|
|
55
|
+
});
|
|
56
|
+
const treeRoot = await treeHelper.hierarchicalClusteringByDistance(distance, 'ward');
|
|
57
|
+
dendrogramSvc.injectTreeForGrid(view.grid, treeRoot, undefined, 150, undefined);
|
|
58
|
+
}, {
|
|
59
|
+
description: `Perform hierarchical clustering to reveal relationships between sequences.`,
|
|
60
|
+
delay: 1600,
|
|
61
|
+
})
|
|
62
|
+
.step('Selection', async () => {
|
|
63
|
+
df.selection.init((idx: number) => [15].includes(idx));
|
|
64
|
+
}, {
|
|
65
|
+
description: `Handling selection of data frame row reflecting on linked viewers.`,
|
|
66
|
+
delay: 1600,
|
|
67
|
+
})
|
|
68
|
+
.step('Select a bunch of sequences', async () => {
|
|
69
|
+
df.selection.init((idx: number) => [21, 9, 58].includes(idx));
|
|
70
|
+
df.currentRowIdx = 27;
|
|
71
|
+
}, {
|
|
72
|
+
description: 'Selecting a group of rows from a data frame to show their similarity and proximity to each other on a viewer..',
|
|
73
|
+
delay: 1600,
|
|
74
|
+
})
|
|
75
|
+
.start();
|
|
62
76
|
} catch (err: any) {
|
|
63
|
-
|
|
64
|
-
_package.logger.error(err.message, undefined, err.stack);
|
|
65
|
-
else
|
|
66
|
-
_package.logger.error(err.toString());
|
|
77
|
+
handleError(err);
|
|
67
78
|
}
|
|
68
79
|
}
|
|
@@ -10,11 +10,12 @@ import * as lev from 'fastest-levenshtein';
|
|
|
10
10
|
import {DistanceMatrix} from '@datagrok-libraries/bio/src/trees/distance-matrix';
|
|
11
11
|
import {getTreeHelper, ITreeHelper} from '@datagrok-libraries/bio/src/trees/tree-helper';
|
|
12
12
|
import {getDendrogramService, IDendrogramService} from '@datagrok-libraries/bio/src/trees/dendrogram';
|
|
13
|
-
import {
|
|
13
|
+
import {handleError} from './utils';
|
|
14
|
+
import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
14
15
|
|
|
15
16
|
const dataFn = 'samples/sample_FASTA.csv';
|
|
16
17
|
|
|
17
|
-
export async function demoBio01bUI(
|
|
18
|
+
export async function demoBio01bUI() {
|
|
18
19
|
let treeHelper: ITreeHelper;
|
|
19
20
|
let dendrogramSvc: IDendrogramService;
|
|
20
21
|
let view: DG.TableView;
|
|
@@ -25,70 +26,78 @@ export async function demoBio01bUI(funcPath: string) {
|
|
|
25
26
|
const idRows: { [id: number]: number } = {};
|
|
26
27
|
|
|
27
28
|
try {
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
29
|
+
const demoScript = new DemoScript('Demo', '');
|
|
30
|
+
await demoScript
|
|
31
|
+
.step(`Loading DNA notation \'fasta\'`, async () => {
|
|
32
|
+
[df, treeHelper, dendrogramSvc] = await Promise.all([
|
|
33
|
+
_package.files.readCsv(dataFn),
|
|
34
|
+
getTreeHelper(),
|
|
35
|
+
getDendrogramService()
|
|
36
|
+
]);
|
|
34
37
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
38
|
+
view = grok.shell.addTableView(df);
|
|
39
|
+
view.grid.props.rowHeight = 22;
|
|
40
|
+
const uniProtKbGCol = view.grid.columns.byName('UniProtKB')!;
|
|
41
|
+
uniProtKbGCol.width = 75;
|
|
42
|
+
const lengthGCol = view.grid.columns.byName('Length')!;
|
|
43
|
+
lengthGCol.width = 0;
|
|
44
|
+
}, {
|
|
45
|
+
description: 'Load dataset with macromolecules of \'fasta\' notation, \'DNA\' alphabet.',
|
|
46
|
+
delay: 1600,
|
|
47
|
+
})
|
|
48
|
+
.step('Analyze for activity cliffs', async () => {
|
|
49
|
+
activityCliffsViewer = (await activityCliffs(
|
|
50
|
+
df, df.getCol('Sequence'), df.getCol('Activity'),
|
|
51
|
+
80, method)) as DG.ScatterPlotViewer;
|
|
52
|
+
view.dockManager.dock(activityCliffsViewer, DG.DOCK_TYPE.RIGHT, null, 'Activity Cliffs', 0.35);
|
|
43
53
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
54
|
+
// Show grid viewer with the cliffs
|
|
55
|
+
const cliffsLink: HTMLButtonElement = $(activityCliffsViewer.root)
|
|
56
|
+
.find('button.scatter_plot_link,cliffs_grid').get()[0] as HTMLButtonElement;
|
|
57
|
+
cliffsLink.click();
|
|
58
|
+
}, {
|
|
59
|
+
description: 'Reveal similar sequences with a cliff of activity.',
|
|
60
|
+
delay: 1600
|
|
61
|
+
})
|
|
62
|
+
.step('Hierarchical clustering', async () => {
|
|
63
|
+
const seqCol: DG.Column<string> = df.getCol('sequence');
|
|
64
|
+
const seqList = seqCol.toList();
|
|
65
|
+
const distance: DistanceMatrix = DistanceMatrix.calc(seqList, (aSeq: string, bSeq: string) => {
|
|
66
|
+
const levDistance = lev.distance(aSeq, bSeq);
|
|
67
|
+
return levDistance / ((aSeq.length + bSeq.length) / 2);
|
|
68
|
+
});
|
|
69
|
+
const treeRoot = await treeHelper.hierarchicalClusteringByDistance(distance, 'ward');
|
|
70
|
+
dendrogramSvc.injectTreeForGrid(view.grid, treeRoot, undefined, 150, undefined);
|
|
49
71
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
.
|
|
53
|
-
|
|
54
|
-
|
|
72
|
+
// adjust for visual
|
|
73
|
+
const activityGCol = view.grid.columns.byName('Activity')!;
|
|
74
|
+
activityGCol.scrollIntoView();
|
|
75
|
+
}, {
|
|
76
|
+
description: 'Perform hierarchical clustering to reveal relationships between sequences.',
|
|
77
|
+
delay: 1600
|
|
78
|
+
})
|
|
79
|
+
.step('Browse the cliff', async () => {
|
|
80
|
+
//cliffsDfGrid.dataFrame.currentRowIdx = -1; // reset
|
|
81
|
+
const cliffsDfGrid: DG.Grid = activityCliffsViewer.dataFrame.temp[acTEMPS.cliffsDfGrid];
|
|
82
|
+
//cliffsDfGrid.dataFrame.selection.init((i) => i == currentCliffIdx);
|
|
83
|
+
cliffsDfGrid.dataFrame.currentRowIdx = 0;
|
|
84
|
+
//cliffsDfGrid.dataFrame.selection.set(currentCliffIdx, true, true);
|
|
55
85
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
const
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
await step('Browse the cliff.', async () => {
|
|
72
|
-
//cliffsDfGrid.dataFrame.currentRowIdx = -1; // reset
|
|
73
|
-
const cliffsDfGrid: DG.Grid = activityCliffsViewer.dataFrame.temp[acTEMPS.cliffsDfGrid];
|
|
74
|
-
//cliffsDfGrid.dataFrame.selection.init((i) => i == currentCliffIdx);
|
|
75
|
-
cliffsDfGrid.dataFrame.currentRowIdx = 0;
|
|
76
|
-
//cliffsDfGrid.dataFrame.selection.set(currentCliffIdx, true, true);
|
|
77
|
-
|
|
78
|
-
// /* workaround to select rows of the cliff */
|
|
79
|
-
// const entryCol: DG.Column = df.getCol('Entry');
|
|
80
|
-
// df.selection.init((rowIdx) => ['UPI00000BFE1D', 'UPI00000BFE17'].includes(entryCol.get(rowIdx)));
|
|
81
|
-
//
|
|
82
|
-
// const selectionIdxList: Int32Array = df.selection.getSelectedIndexes();
|
|
83
|
-
// if (selectionIdxList.length > 0) {
|
|
84
|
-
// df.currentRowIdx = selectionIdxList[0];
|
|
85
|
-
// view.grid.scrollToCell('UniProtKB', view.grid.tableRowToGrid(selectionIdxList[0]));
|
|
86
|
-
// }
|
|
87
|
-
})();
|
|
86
|
+
// /* workaround to select rows of the cliff */
|
|
87
|
+
// const entryCol: DG.Column = df.getCol('Entry');
|
|
88
|
+
// df.selection.init((rowIdx) => ['UPI00000BFE1D', 'UPI00000BFE17'].includes(entryCol.get(rowIdx)));
|
|
89
|
+
//
|
|
90
|
+
// const selectionIdxList: Int32Array = df.selection.getSelectedIndexes();
|
|
91
|
+
// if (selectionIdxList.length > 0) {
|
|
92
|
+
// df.currentRowIdx = selectionIdxList[0];
|
|
93
|
+
// view.grid.scrollToCell('UniProtKB', view.grid.tableRowToGrid(selectionIdxList[0]));
|
|
94
|
+
// }
|
|
95
|
+
}, {
|
|
96
|
+
description: 'Zoom in to explore selected activity cliff details.',
|
|
97
|
+
delay: 1600
|
|
98
|
+
})
|
|
99
|
+
.start();
|
|
88
100
|
} catch (err: any) {
|
|
89
|
-
|
|
90
|
-
_package.logger.error(err.message, undefined, err.stack);
|
|
91
|
-
else
|
|
92
|
-
_package.logger.error(err.toString());
|
|
101
|
+
handleError(err);
|
|
93
102
|
}
|
|
94
103
|
}
|
|
@@ -3,15 +3,16 @@ import * as ui from 'datagrok-api/ui';
|
|
|
3
3
|
import * as DG from 'datagrok-api/dg';
|
|
4
4
|
|
|
5
5
|
import {_package, sequenceSpaceTopMenu} from '../package';
|
|
6
|
-
import {
|
|
6
|
+
import {handleError} from './utils';
|
|
7
7
|
|
|
8
8
|
import {IWebLogoViewer} from '@datagrok-libraries/bio/src/viewers/web-logo';
|
|
9
9
|
import {pepseaMethods, runPepsea} from '../utils/pepsea';
|
|
10
10
|
import {StringMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
11
|
+
import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
11
12
|
|
|
12
13
|
const helmFn: string = 'samples/sample_HELM.csv';
|
|
13
14
|
|
|
14
|
-
export async function demoBio05UI(
|
|
15
|
+
export async function demoBio05UI(): Promise<void> {
|
|
15
16
|
let view: DG.TableView;
|
|
16
17
|
let df: DG.DataFrame;
|
|
17
18
|
let helmCol: DG.Column<string>;
|
|
@@ -23,38 +24,46 @@ export async function demoBio05UI(funcPath: string): Promise<void> {
|
|
|
23
24
|
const msaHelmColName: string = 'msa(HELM)';
|
|
24
25
|
|
|
25
26
|
try {
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
27
|
+
const demoScript = new DemoScript('Demo', 'MSA and composition analysis on Helm data.');
|
|
28
|
+
await demoScript
|
|
29
|
+
.step(`Loading peptides notation 'HELM'`, async () => {
|
|
30
|
+
view = grok.shell.addTableView(df = await _package.files.readCsv(helmFn));
|
|
31
|
+
}, {
|
|
32
|
+
description: 'Load dataset with macromolecules of \'Helm\' notation.',
|
|
33
|
+
delay: 1600,
|
|
34
|
+
})
|
|
35
|
+
.step('MSA on non-natural aminoacids with PepSeA', async () => {
|
|
36
|
+
helmCol = df.getCol(helmColName);
|
|
37
|
+
const method: string = pepseaMethods[0];
|
|
38
|
+
const gapOpen: number = 1.53;
|
|
39
|
+
const gapExtend: number = 0;
|
|
40
|
+
msaHelmCol = await runPepsea(helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined);
|
|
41
|
+
df.columns.add(msaHelmCol);
|
|
42
|
+
await grok.data.detectSemanticTypes(df);
|
|
43
|
+
}, {
|
|
44
|
+
description: 'Multiple sequence alignment (MSA) performed with PepSeA tool operating on non-natural aminoacids as well.',
|
|
45
|
+
delay: 1600,
|
|
46
|
+
})
|
|
47
|
+
.step('Composition analysis on MSA results', async () => {
|
|
48
|
+
wlViewer = await df.plot.fromType('WebLogo', {
|
|
49
|
+
sequenceColumnName: msaHelmColName
|
|
50
|
+
}) as DG.Viewer & IWebLogoViewer;
|
|
51
|
+
view.dockManager.dock(wlViewer, DG.DOCK_TYPE.DOWN, null, 'Composition analysis', 0.2);
|
|
52
|
+
}, {
|
|
53
|
+
description: 'Composition analysis allows to reveal functional features of sequences like motifs, or variable loops.',
|
|
54
|
+
delay: 1600,
|
|
55
|
+
})
|
|
56
|
+
.step('Building sequence space', async () => {
|
|
57
|
+
const method: string = 'UMAP';
|
|
58
|
+
ssViewer = (await sequenceSpaceTopMenu(df, msaHelmCol,
|
|
59
|
+
'UMAP', StringMetricsNames.Levenshtein, true)) as DG.ScatterPlotViewer;
|
|
60
|
+
view.dockManager.dock(ssViewer, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
|
|
61
|
+
}, {
|
|
62
|
+
description: 'Reduce sequence space dimensionality to display on 2D representation.',
|
|
63
|
+
delay: 1600
|
|
64
|
+
})
|
|
65
|
+
.start();
|
|
54
66
|
} catch (err: any) {
|
|
55
|
-
|
|
56
|
-
_package.logger.error(err.message, undefined, err.stack);
|
|
57
|
-
else
|
|
58
|
-
_package.logger.error(err.toString());
|
|
67
|
+
handleError(err);
|
|
59
68
|
}
|
|
60
69
|
}
|
package/src/demo/utils.ts
CHANGED
|
@@ -7,19 +7,6 @@ import {reduceDimensinalityWithNormalization} from '@datagrok-libraries/ml/src/s
|
|
|
7
7
|
import {StringMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
8
8
|
import {delay} from '@datagrok-libraries/utils/src/test';
|
|
9
9
|
|
|
10
|
-
export function step(message: string, action: () => Promise<void>, delayMs: number = 1600): () => Promise<void> {
|
|
11
|
-
return async function() {
|
|
12
|
-
grok.shell.info(message);
|
|
13
|
-
const pi = DG.TaskBarProgressIndicator.create(message);
|
|
14
|
-
try {
|
|
15
|
-
await action();
|
|
16
|
-
} finally {
|
|
17
|
-
pi.close();
|
|
18
|
-
await delay(delayMs);
|
|
19
|
-
}
|
|
20
|
-
};
|
|
21
|
-
}
|
|
22
|
-
|
|
23
10
|
enum EMBED_COL_NAMES {
|
|
24
11
|
X = 'Embed_X',
|
|
25
12
|
Y = 'Embed_Y'
|
|
@@ -93,3 +80,10 @@ export async function demoSequenceSpace(
|
|
|
93
80
|
view.dockManager.dock(resSpaceViewer!, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
|
|
94
81
|
return resSpaceViewer;
|
|
95
82
|
}
|
|
83
|
+
|
|
84
|
+
/**
 * Reports a demo failure both to the user and to the package log.
 *
 * The same message is shown through `grok.shell.error` and written through
 * `_package.logger.error`; the stack trace is logged only when it is available.
 *
 * @param err - The caught value. For an `Error` its `message` and `stack` are
 *   used; any other value (including `null` and `undefined`) is converted to
 *   a string.
 */
export function handleError(err: any): void {
  const isError = err instanceof Error;
  // String() is used instead of err.toString() so that a thrown null or
  // undefined is still reported rather than raising a second error here.
  const errMsg: string = isError ? err.message : String(err);
  const stack: string | undefined = isError ? err.stack : undefined;
  grok.shell.error(errMsg);
  // Log the computed message: `err.message` is undefined for non-Error values.
  _package.logger.error(errMsg, undefined, stack);
}
|