@datagrok/bio 2.4.29 → 2.4.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +1 -1
- package/dist/258.js +2 -0
- package/dist/258.js.map +1 -0
- package/dist/457.js +2 -0
- package/dist/457.js.map +1 -0
- package/dist/562.js +2 -0
- package/dist/562.js.map +1 -0
- package/dist/925.js +2 -0
- package/dist/925.js.map +1 -0
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +1 -1
- package/dist/package.js.map +1 -1
- package/dockerfiles/Dockerfile +1 -1
- package/package.json +3 -3
- package/scripts/sequence_generator.py +34 -13
- package/src/analysis/sequence-activity-cliffs.ts +2 -2
- package/src/analysis/sequence-diversity-viewer.ts +25 -12
- package/src/analysis/sequence-similarity-viewer.ts +46 -17
- package/src/analysis/sequence-space.ts +1 -1
- package/src/analysis/workers/mm-distance-worker-creator.ts +31 -0
- package/src/analysis/workers/mm-distance-worker.ts +16 -0
- package/src/demo/bio01b-hierarchical-clustering-and-activity-cliffs.ts +2 -1
- package/src/demo/bio05-helm-msa-sequence-space.ts +4 -3
- package/src/demo/utils.ts +3 -1
- package/src/package.ts +9 -7
- package/src/tests/activity-cliffs-tests.ts +3 -2
- package/src/tests/activity-cliffs-utils.ts +2 -1
- package/src/tests/sequence-space-test.ts +3 -2
- package/src/tests/sequence-space-utils.ts +4 -2
- package/src/tests/viewers.ts +8 -3
- package/dist/105.js +0 -2
- package/dist/105.js.map +0 -1
- package/dist/367.js +0 -2
- package/dist/367.js.map +0 -1
- package/dist/864.js +0 -2
- package/dist/864.js.map +0 -1
package/dockerfiles/Dockerfile
CHANGED
|
@@ -24,7 +24,7 @@ RUN savedAptMark="$(apt-mark showmanual)" ; \
|
|
|
24
24
|
; \
|
|
25
25
|
apt-mark auto '.*' > /dev/null ; \
|
|
26
26
|
[ -z "$savedAptMark" ] || apt-mark manual $savedAptMark ; \
|
|
27
|
-
wget https://mafft.cbrc.jp/alignment/software/mafft_7.
|
|
27
|
+
wget https://mafft.cbrc.jp/alignment/software/mafft_7.520-1_amd64.deb -O mafft.deb; \
|
|
28
28
|
apt install -y ./mafft.deb; \
|
|
29
29
|
rm -rf mafft.deb; \
|
|
30
30
|
wget https://github.com/Merck/PepSeA/archive/refs/heads/main.zip -O PepSeA.zip; \
|
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"name": "Leonid Stolbov",
|
|
6
6
|
"email": "lstolbov@datagrok.ai"
|
|
7
7
|
},
|
|
8
|
-
"version": "2.4.
|
|
8
|
+
"version": "2.4.31",
|
|
9
9
|
"description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -16,9 +16,9 @@
|
|
|
16
16
|
"@biowasm/aioli": "^3.1.0",
|
|
17
17
|
"@datagrok-libraries/bio": "^5.30.0",
|
|
18
18
|
"@datagrok-libraries/chem-meta": "^1.0.1",
|
|
19
|
-
"@datagrok-libraries/ml": "^6.3.
|
|
19
|
+
"@datagrok-libraries/ml": "^6.3.27",
|
|
20
20
|
"@datagrok-libraries/tutorials": "^1.3.2",
|
|
21
|
-
"@datagrok-libraries/utils": "^
|
|
21
|
+
"@datagrok-libraries/utils": "^4.0.8",
|
|
22
22
|
"cash-dom": "^8.0.0",
|
|
23
23
|
"css-loader": "^6.7.3",
|
|
24
24
|
"datagrok-api": "^1.13.3",
|
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
# description: Create the model peptides/DNA sequences with peptides data
|
|
4
4
|
# language: python
|
|
5
5
|
# tags: template, demo
|
|
6
|
-
# input: int clusters =
|
|
7
|
-
# input: int num_sequences =
|
|
6
|
+
# input: int clusters = 5 [Number of superclusters]
|
|
7
|
+
# input: int num_sequences = 50 [Number of sequences in each supercluster]
|
|
8
8
|
# input: int motif_length = 12 [Average length of motif]
|
|
9
9
|
# input: int max_variants_position = 3 [Maximum number of different letters in conservative position in motif]
|
|
10
10
|
# input: int random_length = 3 [Average length of random sequence parts before and after motif]
|
|
@@ -59,7 +59,9 @@ def generate_motif_template(
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
def generate_motif(template: motif_template_type, alphabet: alphabet_type) -> str:
|
|
62
|
-
template_with_any = [
|
|
62
|
+
template_with_any = [
|
|
63
|
+
(letters if not "?" in letters else alphabet) for letters in template
|
|
64
|
+
]
|
|
63
65
|
return "".join([random.choice(letters) for letters in template_with_any])
|
|
64
66
|
|
|
65
67
|
|
|
@@ -70,18 +72,24 @@ def motif_notation(motif_template: motif_template_type) -> str:
|
|
|
70
72
|
else:
|
|
71
73
|
return f"[{''.join(letter_choice)}]"
|
|
72
74
|
|
|
73
|
-
return "".join(
|
|
75
|
+
return "".join(
|
|
76
|
+
[motif_notation_code(letter_choice) for letter_choice in motif_template]
|
|
77
|
+
)
|
|
74
78
|
|
|
75
79
|
|
|
76
80
|
def generate_random(n: int, alphabet: alphabet_type) -> str:
|
|
77
81
|
return "".join([random.choice(alphabet) for i in range(n)])
|
|
78
82
|
|
|
79
83
|
|
|
80
|
-
def make_cliff(
|
|
84
|
+
def make_cliff(
|
|
85
|
+
motif_template: motif_template_type, alphabet: alphabet_type, motif: str
|
|
86
|
+
) -> str:
|
|
81
87
|
# Mutate conservative letter in motif
|
|
82
88
|
pos = random.randrange(len(motif_template))
|
|
83
89
|
while "?" in motif_template[pos]:
|
|
84
|
-
pos = (pos + 1) % len(
|
|
90
|
+
pos = (pos + 1) % len(
|
|
91
|
+
motif_template
|
|
92
|
+
) # always will find letters since ends of motif can't be any symbol
|
|
85
93
|
outlier_letters = list(set(alphabet) - set(motif_template[pos]))
|
|
86
94
|
return motif[:pos] + random.choice(outlier_letters) + motif[pos + 1 :]
|
|
87
95
|
|
|
@@ -97,7 +105,9 @@ def generate_cluster(
|
|
|
97
105
|
cliff_probability: float,
|
|
98
106
|
cliff_strength: float,
|
|
99
107
|
) -> Iterator[sequence_record_type]:
|
|
100
|
-
motif_template = generate_motif_template(
|
|
108
|
+
motif_template = generate_motif_template(
|
|
109
|
+
motif_length, alphabet, max_variants_position
|
|
110
|
+
)
|
|
101
111
|
|
|
102
112
|
activity_average = random.random() * 10
|
|
103
113
|
activity_dispersion = random.random()
|
|
@@ -166,7 +176,9 @@ def generate_sequences(
|
|
|
166
176
|
cliff_probability,
|
|
167
177
|
cliff_strength,
|
|
168
178
|
):
|
|
169
|
-
sequences.append(
|
|
179
|
+
sequences.append(
|
|
180
|
+
(n_cluster, f"c{n_cluster}_s{n_seq}", seq, activity, is_cliff)
|
|
181
|
+
)
|
|
170
182
|
return headers, sequences
|
|
171
183
|
|
|
172
184
|
|
|
@@ -178,15 +190,19 @@ def parse_command_line_args() -> Any:
|
|
|
178
190
|
epilog="Utility support: Gennadii Zakharov",
|
|
179
191
|
)
|
|
180
192
|
|
|
181
|
-
parser.add_argument(
|
|
193
|
+
parser.add_argument(
|
|
194
|
+
"-c", "--clusters", type=int, default=5, help="Number of superclusters"
|
|
195
|
+
)
|
|
182
196
|
parser.add_argument(
|
|
183
197
|
"-s",
|
|
184
198
|
"--sequences",
|
|
185
199
|
type=int,
|
|
186
|
-
default=
|
|
200
|
+
default=50,
|
|
187
201
|
help="Number of sequences in each supercluster",
|
|
188
202
|
)
|
|
189
|
-
parser.add_argument(
|
|
203
|
+
parser.add_argument(
|
|
204
|
+
"-m,", "--motif-length", type=int, default=12, help="Average length of motif"
|
|
205
|
+
)
|
|
190
206
|
|
|
191
207
|
parser.add_argument(
|
|
192
208
|
"-r,",
|
|
@@ -208,7 +224,8 @@ def parse_command_line_args() -> Any:
|
|
|
208
224
|
"--alphabet",
|
|
209
225
|
type=str,
|
|
210
226
|
default=list(alphabets.keys())[0],
|
|
211
|
-
help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated "
|
|
227
|
+
help=f"Sequence alphabet: {available_alphabets}. Custom alphabet is a list of values separated "
|
|
228
|
+
f"by comma",
|
|
212
229
|
)
|
|
213
230
|
parser.add_argument(
|
|
214
231
|
"--max-variants-position",
|
|
@@ -258,7 +275,11 @@ if not grok:
|
|
|
258
275
|
cliff_probability = args.cliff_probability
|
|
259
276
|
cliff_strength = args.cliff_strength
|
|
260
277
|
|
|
261
|
-
alphabet: alphabet_type =
|
|
278
|
+
alphabet: alphabet_type = (
|
|
279
|
+
alphabets[alphabet_key].split(",")
|
|
280
|
+
if alphabet_key in alphabets
|
|
281
|
+
else alphabet_key.split(",")
|
|
282
|
+
)
|
|
262
283
|
|
|
263
284
|
# Running sequence generator
|
|
264
285
|
header, data = generate_sequences(
|
|
@@ -4,7 +4,7 @@ import * as DG from 'datagrok-api/dg';
|
|
|
4
4
|
|
|
5
5
|
import {ITooltipAndPanelParams} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
|
|
6
6
|
import {getSimilarityFromDistance} from '@datagrok-libraries/ml/src/distance-metrics-methods';
|
|
7
|
-
import {AvailableMetrics,
|
|
7
|
+
import {AvailableMetrics, DistanceMetricsSubjects, StringMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
8
8
|
import {drawMoleculeDifferenceOnCanvas} from '../utils/cell-renderer';
|
|
9
9
|
import * as C from '../utils/constants';
|
|
10
10
|
import {GridColumn} from 'datagrok-api/dg';
|
|
@@ -15,7 +15,7 @@ export async function getDistances(col: DG.Column, seq: string): Promise<Array<n
|
|
|
15
15
|
const stringArray = col.toList();
|
|
16
16
|
const distances = new Array(stringArray.length).fill(0);
|
|
17
17
|
const distanceMethod: (x: string, y: string) => number =
|
|
18
|
-
AvailableMetrics[
|
|
18
|
+
AvailableMetrics[DistanceMetricsSubjects.String][StringMetricsNames.Levenshtein];
|
|
19
19
|
for (let i = 0; i < stringArray.length; ++i) {
|
|
20
20
|
const distance = stringArray[i] ? distanceMethod(stringArray[i], seq) : null;
|
|
21
21
|
distances[i] = distance ? distance / Math.max((stringArray[i] as string).length, seq.length) : null;
|
|
@@ -2,14 +2,13 @@ import * as ui from 'datagrok-api/ui';
|
|
|
2
2
|
import * as DG from 'datagrok-api/dg';
|
|
3
3
|
import * as grok from 'datagrok-api/grok';
|
|
4
4
|
|
|
5
|
-
import BitArray from '@datagrok-libraries/utils/src/bit-array';
|
|
6
5
|
import {getDiverseSubset} from '@datagrok-libraries/utils/src/similarity-metrics';
|
|
7
|
-
import $ from 'cash-dom';
|
|
8
|
-
import {ArrayUtils} from '@datagrok-libraries/utils/src/array-utils';
|
|
9
6
|
import {SequenceSearchBaseViewer} from './sequence-search-base-viewer';
|
|
10
7
|
import {getMonomericMols} from '../calculations/monomerLevelMols';
|
|
11
8
|
import {updateDivInnerHTML} from '../utils/ui-utils';
|
|
12
9
|
import {Subject} from 'rxjs';
|
|
10
|
+
import {calcMmDistanceMatrix, dmLinearIndex} from './workers/mm-distance-worker-creator';
|
|
11
|
+
import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
|
|
13
12
|
|
|
14
13
|
export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
15
14
|
diverseColumnLabel: string | null; // Use postfix Label to prevent activating table column selection editor
|
|
@@ -28,15 +27,9 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
28
27
|
return;
|
|
29
28
|
if (this.dataFrame) {
|
|
30
29
|
if (computeData && this.moleculeColumn) {
|
|
31
|
-
const
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
this.renderMolIds = await grok.functions.call('Chem:callChemDiversitySearch', {
|
|
35
|
-
col: monomericMols,
|
|
36
|
-
metricName: this.distanceMetric,
|
|
37
|
-
limit: this.limit,
|
|
38
|
-
fingerprint: this.fingerprint
|
|
39
|
-
});
|
|
30
|
+
const uh = new UnitsHandler(this.moleculeColumn);
|
|
31
|
+
await (uh.isFasta() ? this.computeByMM() : this.computeByChem());
|
|
32
|
+
|
|
40
33
|
const diverseColumnName: string = this.diverseColumnLabel != null ? this.diverseColumnLabel :
|
|
41
34
|
`diverse (${this.moleculeColumnName})`;
|
|
42
35
|
const resCol = DG.Column.string(diverseColumnName, this.renderMolIds!.length)
|
|
@@ -49,4 +42,24 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
49
42
|
}
|
|
50
43
|
}
|
|
51
44
|
}
|
|
45
|
+
|
|
46
|
+
/**
 * Fills `renderMolIds` with the result of `Chem:callChemDiversitySearch`, run on the
 * monomer-level molecules that `getMonomericMols` builds for the sequence column.
 */
private async computeByChem() {
  const molsCol = await getMonomericMols(this.moleculeColumn!);
  // The column has to be placed into a data frame for fingerprints to be calculated.
  const _molsDf = DG.DataFrame.fromColumns([molsCol]);
  const searchArgs = {
    col: molsCol,
    metricName: this.distanceMetric,
    limit: this.limit,
    fingerprint: this.fingerprint
  };
  this.renderMolIds = await grok.functions.call('Chem:callChemDiversitySearch', searchArgs);
}
|
|
57
|
+
|
|
58
|
+
/**
 * Fills `renderMolIds` with a diverse subset of row indexes, chosen by `getDiverseSubset`
 * from the macromolecule distance matrix of the sequence column.
 * At most `limit` rows are requested (never more than the column length).
 */
private async computeByMM() {
  const matrix = await calcMmDistanceMatrix(this.moleculeColumn!);
  const rowCount = this.moleculeColumn!.length;
  // The matrix is stored in linear form; translate (row, row) pairs into positions in it.
  const toLinear = dmLinearIndex(rowCount);
  const subsetSize = Math.min(rowCount, this.limit);
  const distance = (i1: number, i2: number) => matrix[toLinear(i1, i2)];
  this.renderMolIds = getDiverseSubset(rowCount, subsetSize, distance);
}
|
|
52
65
|
}
|
|
@@ -9,6 +9,8 @@ import {createDifferenceCanvas, createDifferencesWithPositions} from './sequence
|
|
|
9
9
|
import {updateDivInnerHTML} from '../utils/ui-utils';
|
|
10
10
|
import {Subject} from 'rxjs';
|
|
11
11
|
import {TAGS as bioTAGS, getSplitter} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
12
|
+
import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
|
|
13
|
+
import {calcMmDistanceMatrix, dmLinearIndex} from './workers/mm-distance-worker-creator';
|
|
12
14
|
|
|
13
15
|
export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
14
16
|
cutoff: number;
|
|
@@ -23,6 +25,8 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
23
25
|
gridSelect: boolean = false;
|
|
24
26
|
targetMoleculeIdx: number = 0;
|
|
25
27
|
computeCompleted = new Subject<boolean>();
|
|
28
|
+
distanceMatrixComputed: boolean = false;
|
|
29
|
+
mmDistanceMatrix: Float32Array;
|
|
26
30
|
|
|
27
31
|
constructor() {
|
|
28
32
|
super('similarity');
|
|
@@ -43,20 +47,9 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
43
47
|
this.curIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
|
|
44
48
|
if (computeData && !this.gridSelect) {
|
|
45
49
|
this.targetMoleculeIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
|
|
46
|
-
const
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
const df = await grok.functions.call('Chem:callChemSimilaritySearch', {
|
|
50
|
-
df: this.dataFrame,
|
|
51
|
-
col: monomericMols,
|
|
52
|
-
molecule: monomericMols.get(this.targetMoleculeIdx),
|
|
53
|
-
metricName: this.distanceMetric,
|
|
54
|
-
limit: this.limit,
|
|
55
|
-
minScore: this.cutoff,
|
|
56
|
-
fingerprint: this.fingerprint
|
|
57
|
-
});
|
|
58
|
-
this.idxs = df.getCol('indexes');
|
|
59
|
-
this.scores = df.getCol('score');
|
|
50
|
+
const uh = new UnitsHandler(this.moleculeColumn!);
|
|
51
|
+
|
|
52
|
+
await (uh.isFasta() ? this.computeByMM() : this.computeByChem());
|
|
60
53
|
const similarColumnName: string = this.similarColumnLabel != null ? this.similarColumnLabel :
|
|
61
54
|
`similar (${this.moleculeColumnName})`;
|
|
62
55
|
this.molCol = DG.Column.string(similarColumnName,
|
|
@@ -83,15 +76,51 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
83
76
|
}
|
|
84
77
|
}
|
|
85
78
|
|
|
79
|
+
/**
 * Fills `idxs` and `scores` from the data frame returned by `Chem:callChemSimilaritySearch`,
 * run on the monomer-level molecules that `getMonomericMols` builds for the sequence column.
 */
private async computeByChem() {
  const molsCol = await getMonomericMols(this.moleculeColumn!);
  // The column has to be placed into a data frame for fingerprints to be calculated.
  const _molsDf = DG.DataFrame.fromColumns([molsCol]);
  const searchArgs = {
    df: this.dataFrame,
    col: molsCol,
    molecule: molsCol.get(this.targetMoleculeIdx),
    metricName: this.distanceMetric,
    limit: this.limit,
    minScore: this.cutoff,
    fingerprint: this.fingerprint
  };
  const resultDf = await grok.functions.call('Chem:callChemSimilaritySearch', searchArgs);
  this.idxs = resultDf.getCol('indexes');
  this.scores = resultDf.getCol('score');
}
|
|
95
|
+
|
|
96
|
+
/**
 * Fills `idxs` and `scores` with the rows most similar to the target sequence, using the
 * macromolecule distance matrix instead of the Chem similarity search.
 *
 * Similarity is `1 - distance`; the target row itself always gets a score of 1.
 * The matrix is computed on the first call and reused from `mmDistanceMatrix` afterwards.
 */
private async computeByMM() {
  if (!this.distanceMatrixComputed) {
    // NOTE(review): this method never resets the flag — confirm it is cleared when the column changes.
    this.mmDistanceMatrix = await calcMmDistanceMatrix(this.moleculeColumn!);
    this.distanceMatrixComputed = true;
  }
  const rowCount = this.moleculeColumn!.length;
  const toLinear = dmLinearIndex(rowCount);
  const targetIdx = this.targetMoleculeIdx;

  // Keep each row index next to its score so that both are reordered together.
  const ranked: { idx: number, score: number }[] = [];
  for (let i = 0; i < rowCount; ++i) {
    const score = i === targetIdx ? 1 : 1 - this.mmDistanceMatrix[toLinear(targetIdx, i)];
    ranked.push({idx: i, score: score});
  }
  ranked.sort((a, b) => b.score - a.score);

  // Take the best-scoring rows.
  // NOTE(review): unlike computeByChem, `cutoff` is not applied on this path — confirm this is intended.
  const resultCount = Math.min(this.limit, rowCount);
  const top = ranked.slice(0, resultCount);
  this.idxs = DG.Column.int('indexes', resultCount).init((i) => top[i].idx);
  this.scores = DG.Column.float('score', resultCount).init((i) => top[i].score);
}
|
|
86
114
|
|
|
87
115
|
createPropertyPanel(resDf: DG.DataFrame) {
|
|
88
116
|
const propPanel = ui.div();
|
|
89
117
|
const molDifferences: { [key: number]: HTMLCanvasElement } = {};
|
|
90
|
-
const
|
|
91
|
-
const
|
|
118
|
+
const molColName = this.molCol?.name!;
|
|
119
|
+
const units = resDf.col(molColName)!.getTag(DG.TAGS.UNITS);
|
|
120
|
+
const separator = resDf.col(molColName)!.getTag(bioTAGS.separator);
|
|
92
121
|
const splitter = getSplitter(units, separator);
|
|
93
122
|
const subParts1 = splitter(this.moleculeColumn!.get(this.targetMoleculeIdx));
|
|
94
|
-
const subParts2 = splitter(resDf.get(
|
|
123
|
+
const subParts2 = splitter(resDf.get(molColName, resDf.currentRowIdx));
|
|
95
124
|
const canvas = createDifferenceCanvas(subParts1, subParts2, units, molDifferences);
|
|
96
125
|
propPanel.append(ui.div(canvas, {style: {width: '300px', overflow: 'scroll'}}));
|
|
97
126
|
if (subParts1.length !== subParts2.length) {
|
|
@@ -44,7 +44,7 @@ export async function sequenceSpace(spaceParams: ISequenceSpaceParams): Promise<
|
|
|
44
44
|
|
|
45
45
|
export async function sequenceSpaceByFingerprints(spaceParams: ISequenceSpaceParams): Promise<ISequenceSpaceResult> {
|
|
46
46
|
if (spaceParams.seqCol.version !== spaceParams.seqCol.temp[MONOMERIC_COL_TAGS.LAST_INVALIDATED_VERSION])
|
|
47
|
-
await invalidateMols(spaceParams.seqCol
|
|
47
|
+
await invalidateMols(spaceParams.seqCol as unknown as DG.Column<string>, false); //we expect only string columns here
|
|
48
48
|
|
|
49
49
|
const result = await grok.functions.call('Chem:getChemSpaceEmbeddings', {
|
|
50
50
|
col: spaceParams.seqCol.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS],
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import * as grok from 'datagrok-api/grok';
|
|
2
|
+
import * as ui from 'datagrok-api/ui';
|
|
3
|
+
import * as DG from 'datagrok-api/dg';
|
|
4
|
+
import {UnitsHandler} from '@datagrok-libraries/bio/src/utils/units-handler';
|
|
5
|
+
|
|
6
|
+
/**
 * Computes the pairwise distance matrix of a macromolecule column in a dedicated web worker.
 *
 * The distance function is selected by name via `UnitsHandler.getDistanceFunctionName()` and the
 * column values are posted to `mm-distance-worker.ts`, which answers with either
 * `distanceMatrixData` or `error`.
 *
 * @param column Column whose `semType` is `DG.SEMTYPE.MACROMOLECULE`.
 * @returns Promise resolving to the `distanceMatrixData` array posted back by the worker.
 *   It rejects if the column is not of macromolecule type, if the worker reports an error,
 *   or if the worker itself fails (script error, unreadable message, non-cloneable input).
 */
export async function calcMmDistanceMatrix(column: DG.Column<any>): Promise<Float32Array> {
  // Validate and gather inputs first, so a failure here never leaves an orphaned worker running.
  if (column.semType !== DG.SEMTYPE.MACROMOLECULE)
    throw new Error('Column has to be of macromolecule type');
  const uh = new UnitsHandler(column);
  const fnName = uh.getDistanceFunctionName();
  const values = column.toList();

  const worker = new Worker(new URL('./mm-distance-worker.ts', import.meta.url));
  return new Promise<Float32Array>((resolve, reject) => {
    // Every failure path must terminate the worker, otherwise it stays alive indefinitely.
    const fail = (reason: unknown): void => {
      worker.terminate();
      reject(reason);
    };
    worker.onmessage = ({data: {error, distanceMatrixData}}): void => {
      worker.terminate();
      if (error)
        reject(error);
      else
        resolve(distanceMatrixData);
    };
    // Without these handlers a worker that fails to load or throws outside its own
    // try/catch would leave the promise pending forever.
    worker.onerror = (event: ErrorEvent): void => fail(new Error(event.message || 'Distance matrix worker failed'));
    worker.onmessageerror = (): void => fail(new Error('Distance matrix worker sent an unreadable message'));
    // Post only after the handlers are attached; postMessage throws on non-cloneable data.
    try {
      worker.postMessage({values, fnName});
    } catch (e) {
      fail(e);
    }
  });
}
|
|
21
|
+
|
|
22
|
+
/**
 * Creates a lookup that maps a pair of item indexes to its position in a compressed
 * distance matrix of `size` items (upper triangle without the diagonal, stored row by row).
 *
 * The lookup is symmetric: `(i, j)` and `(j, i)` yield the same position.
 * The diagonal is not stored, so the result for `i === j` is not a valid position
 * (for example `(0, 0)` yields `-1`); callers must handle identical indexes themselves.
 *
 * @param size Number of items the distance matrix was built for.
 * @returns Function converting 2d coordinates into an index of the compressed matrix.
 */
export function dmLinearIndex(size: number): (i: number, j: number) => number {
  // Created once per `size` instead of on every lookup, since the lookup runs once per pair.
  const getLinearIndex = (row: number, col: number): number =>
    size * row + col - Math.floor(((row + 2) * (row + 1)) / 2);
  // The smaller index always selects the row, which makes the lookup symmetric.
  return (i: number, j: number): number => (i <= j ? getLinearIndex(i, j) : getLinearIndex(j, i));
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import {DistanceMatrix} from '@datagrok-libraries/bio/src/trees/distance-matrix';
|
|
2
|
+
import {mmDistanceFunctions, MmDistanceFunctionsNames}
|
|
3
|
+
from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
4
|
+
|
|
5
|
+
/**
 * Worker entry point: builds a distance matrix for the posted values and posts it back.
 *
 * Expects `event.data` of the form `{values, fnName}`, where `fnName` is a key of
 * `mmDistanceFunctions`. The matrix is passed through `DistanceMatrix.normalize()` before
 * being returned. Exactly one message is posted per request: `{distanceMatrixData}` on
 * success or `{error}` on failure.
 */
onmessage = (event) => {
  const {values, fnName} = event.data;
  const data: { error?: any; distanceMatrixData?: Float32Array } = {};
  try {
    const distanceFnFactory = mmDistanceFunctions[fnName as MmDistanceFunctionsNames];
    // Report an unknown name explicitly instead of an opaque "... is not a function" TypeError.
    if (typeof distanceFnFactory !== 'function')
      throw new Error(`Unknown macromolecule distance function: '${fnName}'`);
    const distanceMatrix = DistanceMatrix.calc(values, distanceFnFactory());
    distanceMatrix.normalize();
    data.distanceMatrixData = distanceMatrix.data;
  } catch (e) {
    // Errors are sent to the caller rather than thrown, so the request always gets an answer.
    data.error = e;
  }
  postMessage(data);
};
|
|
@@ -12,6 +12,7 @@ import {getTreeHelper, ITreeHelper} from '@datagrok-libraries/bio/src/trees/tree
|
|
|
12
12
|
import {getDendrogramService, IDendrogramService} from '@datagrok-libraries/bio/src/trees/dendrogram';
|
|
13
13
|
import {handleError} from './utils';
|
|
14
14
|
import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
15
|
+
import { DimReductionMethods } from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
15
16
|
|
|
16
17
|
const dataFn: string = 'data/sample_FASTA_PT_activity.csv';
|
|
17
18
|
|
|
@@ -23,7 +24,7 @@ export async function demoBio01bUI() {
|
|
|
23
24
|
let view: DG.TableView;
|
|
24
25
|
let activityCliffsViewer: DG.ScatterPlotViewer;
|
|
25
26
|
|
|
26
|
-
const dimRedMethod:
|
|
27
|
+
const dimRedMethod: DimReductionMethods = DimReductionMethods.UMAP;
|
|
27
28
|
const idRows: { [id: number]: number } = {};
|
|
28
29
|
|
|
29
30
|
try {
|
|
@@ -7,8 +7,9 @@ import {handleError} from './utils';
|
|
|
7
7
|
|
|
8
8
|
import {IWebLogoViewer} from '@datagrok-libraries/bio/src/viewers/web-logo';
|
|
9
9
|
import {pepseaMethods, runPepsea} from '../utils/pepsea';
|
|
10
|
-
import {StringMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
11
10
|
import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
11
|
+
import { DimReductionMethods } from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
12
|
+
import { MmDistanceFunctionsNames } from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
12
13
|
|
|
13
14
|
const helmFn: string = 'samples/sample_HELM.csv';
|
|
14
15
|
|
|
@@ -22,7 +23,7 @@ export async function demoBio05UI(): Promise<void> {
|
|
|
22
23
|
|
|
23
24
|
const helmColName: string = 'HELM';
|
|
24
25
|
const msaHelmColName: string = 'msa(HELM)';
|
|
25
|
-
const dimRedMethod:
|
|
26
|
+
const dimRedMethod: DimReductionMethods = DimReductionMethods.UMAP;
|
|
26
27
|
|
|
27
28
|
try {
|
|
28
29
|
const demoScript = new DemoScript(
|
|
@@ -52,7 +53,7 @@ export async function demoBio05UI(): Promise<void> {
|
|
|
52
53
|
})
|
|
53
54
|
.step('Build sequence space', async () => {
|
|
54
55
|
ssViewer = (await sequenceSpaceTopMenu(df, msaHelmCol,
|
|
55
|
-
dimRedMethod,
|
|
56
|
+
dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN, true)) as DG.ScatterPlotViewer;
|
|
56
57
|
view.dockManager.dock(ssViewer, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
|
|
57
58
|
}, {
|
|
58
59
|
description: 'Reduce sequence space dimensionality to display on 2D representation.',
|
package/src/demo/utils.ts
CHANGED
|
@@ -6,6 +6,8 @@ import {_package, sequenceSpaceTopMenu} from '../package';
|
|
|
6
6
|
import {reduceDimensinalityWithNormalization} from '@datagrok-libraries/ml/src/sequence-space';
|
|
7
7
|
import {StringMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
8
8
|
import {delay} from '@datagrok-libraries/utils/src/test';
|
|
9
|
+
import { DimReductionMethods } from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
10
|
+
import { MmDistanceFunctionsNames } from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
9
11
|
|
|
10
12
|
enum EMBED_COL_NAMES {
|
|
11
13
|
X = 'Embed_X',
|
|
@@ -63,7 +65,7 @@ export async function demoSequenceSpace(
|
|
|
63
65
|
})) as DG.ScatterPlotViewer;
|
|
64
66
|
} else {
|
|
65
67
|
resSpaceViewer = (await sequenceSpaceTopMenu(df, df.getCol(colName),
|
|
66
|
-
|
|
68
|
+
DimReductionMethods.UMAP, MmDistanceFunctionsNames.LEVENSHTEIN, true)) as DG.ScatterPlotViewer;
|
|
67
69
|
}
|
|
68
70
|
view.dockManager.dock(resSpaceViewer!, DG.DOCK_TYPE.RIGHT, null, 'Sequence Space', 0.35);
|
|
69
71
|
return resSpaceViewer;
|
package/src/package.ts
CHANGED
|
@@ -10,7 +10,7 @@ import {
|
|
|
10
10
|
import {VdRegionsViewer} from './viewers/vd-regions-viewer';
|
|
11
11
|
import {SequenceAlignment} from './seq_align';
|
|
12
12
|
import {getEmbeddingColsNames, sequenceSpaceByFingerprints, getSequenceSpace} from './analysis/sequence-space';
|
|
13
|
-
import {getActivityCliffs} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
|
|
13
|
+
import {ISequenceSpaceParams, getActivityCliffs} from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
|
|
14
14
|
import {
|
|
15
15
|
createLinesGrid,
|
|
16
16
|
createPropPanelElement,
|
|
@@ -43,7 +43,7 @@ import {
|
|
|
43
43
|
LIB_STORAGE_NAME, LibSettings, getUserLibSettings, setUserLibSetting, getLibFileNameList
|
|
44
44
|
} from './utils/monomer-lib';
|
|
45
45
|
import {getMacromoleculeColumn} from './utils/ui-utils';
|
|
46
|
-
import {ITSNEOptions, IUMAPOptions} from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
46
|
+
import {DimReductionMethods, ITSNEOptions, IUMAPOptions} from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
47
47
|
import {SequenceSpaceFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/seq-space-editor';
|
|
48
48
|
import {ActivityCliffsFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/activity-cliffs-editor';
|
|
49
49
|
import {demoBio01UI} from './demo/bio01-similarity-diversity';
|
|
@@ -53,6 +53,8 @@ import {demoBio03UI} from './demo/bio03-atomic-level';
|
|
|
53
53
|
import {demoBio05UI} from './demo/bio05-helm-msa-sequence-space';
|
|
54
54
|
import {checkInputColumnUI} from './utils/check-input-column';
|
|
55
55
|
import {multipleSequenceAlignmentUI} from './utils/multiple-sequence-alignment-ui';
|
|
56
|
+
import { MmDistanceFunctionsNames } from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
57
|
+
import { BitArrayMetrics, BitArrayMetricsNames, StringMetricsNames } from '@datagrok-libraries/ml/src/typed-metrics';
|
|
56
58
|
import { NotationConverter } from '@datagrok-libraries/bio/src/utils/notation-converter';
|
|
57
59
|
|
|
58
60
|
export const _package = new DG.Package();
|
|
@@ -280,7 +282,7 @@ export function SeqActivityCliffsEditor(call: DG.FuncCall) {
|
|
|
280
282
|
//output: viewer result
|
|
281
283
|
//editor: Bio:SeqActivityCliffsEditor
|
|
282
284
|
export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column, activities: DG.Column,
|
|
283
|
-
similarity: number, methodName:
|
|
285
|
+
similarity: number, methodName: DimReductionMethods, options?: IUMAPOptions | ITSNEOptions
|
|
284
286
|
): Promise<DG.Viewer | undefined> {
|
|
285
287
|
if (!checkInputColumnUI(macroMolecule, 'Activity Cliffs'))
|
|
286
288
|
return;
|
|
@@ -292,7 +294,7 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column,
|
|
|
292
294
|
'alphabet': macroMolecule.getTag(bioTAGS.alphabet),
|
|
293
295
|
};
|
|
294
296
|
const nc = new NotationConverter(macroMolecule);
|
|
295
|
-
let columnDistanceMetric =
|
|
297
|
+
let columnDistanceMetric: BitArrayMetricsNames | MmDistanceFunctionsNames = BitArrayMetricsNames.Tanimoto;
|
|
296
298
|
let seqCol = macroMolecule;
|
|
297
299
|
if (nc.isFasta() || (nc.isSeparator() && nc.alphabet && nc.alphabet !== ALPHABET.UN)){
|
|
298
300
|
if (nc.isFasta()){
|
|
@@ -347,8 +349,8 @@ export function SequenceSpaceEditor(call: DG.FuncCall) {
|
|
|
347
349
|
//input: bool plotEmbeddings = true
|
|
348
350
|
//input: object options {optional: true}
|
|
349
351
|
//editor: Bio:SequenceSpaceEditor
|
|
350
|
-
export async function sequenceSpaceTopMenu(table: DG.DataFrame, macroMolecule: DG.Column, methodName:
|
|
351
|
-
similarityMetric:
|
|
352
|
+
export async function sequenceSpaceTopMenu(table: DG.DataFrame, macroMolecule: DG.Column, methodName: DimReductionMethods,
|
|
353
|
+
similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames = BitArrayMetricsNames.Tanimoto, plotEmbeddings: boolean, options?: IUMAPOptions | ITSNEOptions
|
|
352
354
|
): Promise<DG.Viewer | undefined> {
|
|
353
355
|
// Delay is required for initial function dialog to close before starting invalidating of molfiles.
|
|
354
356
|
// Otherwise, dialog is freezing
|
|
@@ -360,7 +362,7 @@ export async function sequenceSpaceTopMenu(table: DG.DataFrame, macroMolecule: D
|
|
|
360
362
|
const withoutEmptyValues = DG.DataFrame.fromColumns([macroMolecule]).clone();
|
|
361
363
|
const emptyValsIdxs = removeEmptyStringRows(withoutEmptyValues, macroMolecule);
|
|
362
364
|
|
|
363
|
-
const chemSpaceParams = {
|
|
365
|
+
const chemSpaceParams: ISequenceSpaceParams = {
|
|
364
366
|
seqCol: withoutEmptyValues.col(macroMolecule.name)!,
|
|
365
367
|
methodName: methodName,
|
|
366
368
|
similarityMetric: similarityMetric,
|
|
@@ -6,6 +6,7 @@ import {after, before, category, test} from '@datagrok-libraries/utils/src/test'
|
|
|
6
6
|
|
|
7
7
|
import {readDataframe} from './utils';
|
|
8
8
|
import {_testActivityCliffsOpen} from './activity-cliffs-utils';
|
|
9
|
+
import { DimReductionMethods } from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
category('activityCliffs', async () => {
|
|
@@ -33,7 +34,7 @@ category('activityCliffs', async () => {
|
|
|
33
34
|
actCliffsTableView = grok.shell.addTableView(actCliffsDf);
|
|
34
35
|
viewList.push(actCliffsTableView);
|
|
35
36
|
|
|
36
|
-
await _testActivityCliffsOpen(actCliffsDf, 57,
|
|
37
|
+
await _testActivityCliffsOpen(actCliffsDf, 57, DimReductionMethods.UMAP, 'MSA');
|
|
37
38
|
}, {skipReason: 'GROK-12774'});
|
|
38
39
|
|
|
39
40
|
test('activityCliffsWithEmptyRows', async () => {
|
|
@@ -42,6 +43,6 @@ category('activityCliffs', async () => {
|
|
|
42
43
|
actCliffsTableViewWithEmptyRows = grok.shell.addTableView(actCliffsDfWithEmptyRows);
|
|
43
44
|
viewList.push(actCliffsTableViewWithEmptyRows);
|
|
44
45
|
|
|
45
|
-
await _testActivityCliffsOpen(actCliffsDfWithEmptyRows, 57,
|
|
46
|
+
await _testActivityCliffsOpen(actCliffsDfWithEmptyRows, 57, DimReductionMethods.UMAP, 'MSA');
|
|
46
47
|
}, {skipReason: 'GROK-12774'});
|
|
47
48
|
});
|
|
@@ -3,8 +3,9 @@ import * as grok from 'datagrok-api/grok';
|
|
|
3
3
|
|
|
4
4
|
import {delay, expect} from '@datagrok-libraries/utils/src/test';
|
|
5
5
|
import {activityCliffs} from '../package';
|
|
6
|
+
import { DimReductionMethods } from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
6
7
|
|
|
7
|
-
export async function _testActivityCliffsOpen(df: DG.DataFrame, numberCliffs: number, method:
|
|
8
|
+
export async function _testActivityCliffsOpen(df: DG.DataFrame, numberCliffs: number, method: DimReductionMethods, colName: string) {
|
|
8
9
|
await grok.data.detectSemanticTypes(df);
|
|
9
10
|
const scatterPlot = await activityCliffs(
|
|
10
11
|
df, df.getCol(colName), df.getCol('Activity'),
|
|
@@ -5,6 +5,7 @@ import * as DG from 'datagrok-api/dg';
|
|
|
5
5
|
import {after, before, category, test, expect, delay} from '@datagrok-libraries/utils/src/test';
|
|
6
6
|
import {readDataframe} from './utils';
|
|
7
7
|
import {_testSequenceSpaceReturnsResult} from './sequence-space-utils';
|
|
8
|
+
import { DimReductionMethods } from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
8
9
|
|
|
9
10
|
category('sequenceSpace', async () => {
|
|
10
11
|
let testFastaDf: DG.DataFrame;
|
|
@@ -15,7 +16,7 @@ category('sequenceSpace', async () => {
|
|
|
15
16
|
test('sequenceSpaceOpens', async () => {
|
|
16
17
|
testFastaDf = await readDataframe('tests/sample_MSA_data.csv');
|
|
17
18
|
testFastaTableView = grok.shell.addTableView(testFastaDf);
|
|
18
|
-
await _testSequenceSpaceReturnsResult(testFastaDf,
|
|
19
|
+
await _testSequenceSpaceReturnsResult(testFastaDf, DimReductionMethods.UMAP, 'MSA');
|
|
19
20
|
grok.shell.closeTable(testFastaDf);
|
|
20
21
|
testFastaTableView.close();
|
|
21
22
|
}, {skipReason: 'GROK-12775'});
|
|
@@ -23,7 +24,7 @@ category('sequenceSpace', async () => {
|
|
|
23
24
|
test('sequenceSpaceWithEmptyRows', async () => {
|
|
24
25
|
testHelmWithEmptyRows = await readDataframe('tests/sample_MSA_data_empty_vals.csv');
|
|
25
26
|
testHelmWithEmptyRowsTableView = grok.shell.addTableView(testHelmWithEmptyRows);
|
|
26
|
-
await _testSequenceSpaceReturnsResult(testHelmWithEmptyRows,
|
|
27
|
+
await _testSequenceSpaceReturnsResult(testHelmWithEmptyRows, DimReductionMethods.UMAP, 'MSA');
|
|
27
28
|
grok.shell.closeTable(testHelmWithEmptyRows);
|
|
28
29
|
testHelmWithEmptyRowsTableView.close();
|
|
29
30
|
}, {skipReason: 'GROK-12775'});
|
|
@@ -2,14 +2,16 @@ import * as DG from 'datagrok-api/dg';
|
|
|
2
2
|
import * as grok from 'datagrok-api/grok';
|
|
3
3
|
import {expect} from '@datagrok-libraries/utils/src/test';
|
|
4
4
|
import {sequenceSpaceTopMenu} from '../package';
|
|
5
|
+
import { MmDistanceFunctionsNames } from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
6
|
+
import { DimReductionMethods } from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
5
7
|
|
|
6
|
-
export async function _testSequenceSpaceReturnsResult(df: DG.DataFrame, algorithm:
|
|
8
|
+
export async function _testSequenceSpaceReturnsResult(df: DG.DataFrame, algorithm: DimReductionMethods, colName: string) {
|
|
7
9
|
// await grok.data.detectSemanticTypes(df);
|
|
8
10
|
const col: DG.Column = df.getCol(colName);
|
|
9
11
|
const semType: string = await grok.functions.call('Bio:detectMacromolecule', {col: col});
|
|
10
12
|
if (semType)
|
|
11
13
|
col.semType = semType;
|
|
12
14
|
|
|
13
|
-
const sp = await sequenceSpaceTopMenu(df, df.col(colName)!, algorithm,
|
|
15
|
+
const sp = await sequenceSpaceTopMenu(df, df.col(colName)!, algorithm, MmDistanceFunctionsNames.LEVENSHTEIN, true);
|
|
14
16
|
expect(sp != null, true);
|
|
15
17
|
}
|
package/src/tests/viewers.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import * as DG from 'datagrok-api/dg';
|
|
2
|
-
|
|
2
|
+
import * as grok from 'datagrok-api/grok';
|
|
3
3
|
//import * as ui from 'datagrok-api/ui';
|
|
4
4
|
|
|
5
|
-
import {category, test, testViewer} from '@datagrok-libraries/utils/src/test';
|
|
5
|
+
import {category, delay, test, testViewer} from '@datagrok-libraries/utils/src/test';
|
|
6
6
|
import {readDataframe} from './utils';
|
|
7
7
|
|
|
8
8
|
|
|
@@ -10,7 +10,12 @@ category('viewers', () => {
|
|
|
10
10
|
const viewers = DG.Func.find({package: 'Bio', tags: ['viewer']}).map((f) => f.friendlyName);
|
|
11
11
|
for (const v of viewers) {
|
|
12
12
|
test(v, async () => {
|
|
13
|
-
|
|
13
|
+
const df = await readDataframe('data/sample_FASTA_DNA.csv');
|
|
14
|
+
const tv = grok.shell.addTableView(df);
|
|
15
|
+
await grok.data.detectSemanticTypes(df);
|
|
16
|
+
tv.addViewer(v);
|
|
17
|
+
await delay(2000);
|
|
18
|
+
// await testViewer(v, df, {detectSemanticTypes: true});
|
|
14
19
|
});
|
|
15
20
|
}
|
|
16
21
|
});
|