@datagrok/bio 2.26.8 → 2.27.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +4 -0
- package/CLAUDE.md +35 -5
- package/detectors.js +4 -2
- package/dist/287.js +1 -1
- package/dist/287.js.map +1 -1
- package/dist/422.js +1 -1
- package/dist/422.js.map +1 -1
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +3 -3
- package/dist/package.js.map +1 -1
- package/package.json +2 -2
- package/scripts/mol-to-helm.py +642 -170
- package/src/analysis/sequence-activity-cliffs.ts +8 -6
- package/src/package-api.ts +9 -2
- package/src/package.g.ts +12 -0
- package/src/package.ts +23 -9
- package/src/tests/msa-tests.ts +6 -2
- package/src/utils/annotations/annotation-manager-ui.ts +1 -1
- package/src/utils/constants.ts +3 -7
- package/src/utils/monomer-lib/library-file-manager/ui.ts +1 -1
- package/src/utils/monomer-lib/monomer-manager/monomer-manager.ts +1 -1
- package/src/utils/multiple-sequence-alignment-ui.ts +403 -194
- package/src/utils/pepsea.ts +138 -116
- package/src/utils/types.ts +7 -4
- package/test-console-output-1.log +584 -615
- package/test-record-1.mp4 +0 -0
package/src/utils/pepsea.ts
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
|
-
/* eslint-disable max-params */
|
|
2
1
|
/* Do not change these import lines to match external modules in webpack configuration */
|
|
3
2
|
import * as grok from 'datagrok-api/grok';
|
|
4
|
-
import * as ui from 'datagrok-api/ui';
|
|
5
3
|
import * as DG from 'datagrok-api/dg';
|
|
6
4
|
|
|
7
5
|
import {NOTATION, TAGS as bioTAGS, ALIGNMENT, ALPHABET} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
@@ -9,9 +7,22 @@ import {ILogger} from '@datagrok-libraries/bio/src/utils/logger';
|
|
|
9
7
|
|
|
10
8
|
import {checkForSingleSeqClusters} from './multiple-sequence-alignment';
|
|
11
9
|
import * as C from './constants';
|
|
12
|
-
|
|
13
10
|
import {_package} from '../package';
|
|
14
11
|
|
|
12
|
+
export const pepseaMethods = ['mafft --auto', 'mafft', 'linsi', 'ginsi', 'einsi', 'fftns', 'fftnsi', 'nwns', 'nwnsi'];
|
|
13
|
+
|
|
14
|
+
const ALIGNMENT_META_KEYS = ['AlignedSeq', 'AlignedSubpeptide', 'HELM', 'ID', 'PolymerID'];
|
|
15
|
+
|
|
16
|
+
type PepseaResponse = {
|
|
17
|
+
Alignment: {
|
|
18
|
+
PolymerID: string; AlignedSubpeptide: string; HELM: string;
|
|
19
|
+
ID: string; AlignedSeq: string; [key: string]: string;
|
|
20
|
+
}[];
|
|
21
|
+
AlignmentScore: {[key: string]: number | null};
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
type PepseaBodyUnit = {ID: string; HELM: string};
|
|
25
|
+
|
|
15
26
|
export const Pepsea = new class {
|
|
16
27
|
public readonly dcName: string = 'bio';
|
|
17
28
|
|
|
@@ -20,103 +31,117 @@ export const Pepsea = new class {
|
|
|
20
31
|
}
|
|
21
32
|
}();
|
|
22
33
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
34
|
+
|
|
35
|
+
/** Aligns all sequences in the column using PepSeA Docker container.
|
|
36
|
+
* Does not handle clustering - aligns all rows as a single group.
|
|
37
|
+
* Used by the registered sequenceMSA function. */
|
|
38
|
+
export async function alignWithPepsea(
|
|
39
|
+
srcCol: DG.Column<string>,
|
|
40
|
+
method: string = 'mafft --auto',
|
|
41
|
+
gapOpen: number = 1.53,
|
|
42
|
+
gapExtend: number = 0,
|
|
43
|
+
): Promise<DG.Column<string>> {
|
|
44
|
+
const container = await Pepsea.getDockerContainer();
|
|
45
|
+
const rowCount = srcCol.length;
|
|
46
|
+
|
|
47
|
+
const body: PepseaBodyUnit[] = [];
|
|
48
|
+
for (let i = 0; i < rowCount; i++) {
|
|
49
|
+
const seq = srcCol.get(i);
|
|
50
|
+
if (seq)
|
|
51
|
+
body.push({ID: i.toString(), HELM: seq});
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
const response = await requestAlignedObjects(container.id, body, method, gapOpen, gapExtend);
|
|
55
|
+
const aligned = parseAlignmentResponse(response, rowCount);
|
|
56
|
+
|
|
57
|
+
const colName = srcCol.dataFrame?.columns?.getUnusedName(`msa(${srcCol.name})`) ?? `msa(${srcCol.name})`;
|
|
58
|
+
return createPepseaResultColumn(colName, aligned);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
/** Aligns sequences with PepSeA, supporting per-cluster alignment.
|
|
63
|
+
* Used by tests and legacy code paths. */
|
|
64
|
+
export async function runPepsea(
|
|
65
|
+
table: DG.DataFrame, srcCol: DG.Column<string>, unUsedName: string,
|
|
45
66
|
method: typeof pepseaMethods[number] = 'ginsi', gapOpen: number = 1.53, gapExtend: number = 0.0,
|
|
46
|
-
clustersCol: DG.Column<string | number> | null = null, logger?: ILogger, onlySelected: boolean = false
|
|
67
|
+
clustersCol: DG.Column<string | number> | null = null, logger?: ILogger, onlySelected: boolean = false,
|
|
47
68
|
): Promise<DG.Column<string>> {
|
|
48
|
-
const
|
|
49
|
-
const
|
|
50
|
-
|
|
51
|
-
|
|
69
|
+
const container = await Pepsea.getDockerContainer();
|
|
70
|
+
const rowCount = srcCol.length;
|
|
71
|
+
|
|
72
|
+
clustersCol ??= DG.Column.int('Clusters', rowCount).init(0);
|
|
73
|
+
if (clustersCol.type !== DG.COLUMN_TYPE.STRING)
|
|
52
74
|
clustersCol = clustersCol.convertTo(DG.TYPE.STRING);
|
|
53
75
|
|
|
54
|
-
const
|
|
55
|
-
const
|
|
56
|
-
const bodies: PepseaBodyUnit[][] = new Array(
|
|
57
|
-
const clusterIndexes: number[][] = new Array(
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
const clusterId = clusterCategoryIdx;
|
|
68
|
-
const helmSeq = srcCol.get(rowIndex);
|
|
69
|
-
if (helmSeq) {
|
|
70
|
-
(bodies[clusterId] ??= []).push({ID: rowIndex.toString(), HELM: helmSeq});
|
|
71
|
-
(clusterIndexes[clusterCategoryIdx] ??= []).push(rowIndex);
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
} else {
|
|
75
|
-
const selection = table.selection;
|
|
76
|
-
for (let rowIndex = -1; (rowIndex = selection.findNext(rowIndex, true)) !== -1;) {
|
|
77
|
-
const clusterCategoryIdx = clustersColData[rowIndex];
|
|
78
|
-
const cluster = clustersColCategories[clusterCategoryIdx];
|
|
79
|
-
if (!cluster)
|
|
80
|
-
continue;
|
|
81
|
-
|
|
82
|
-
const clusterId = clusterCategoryIdx;
|
|
83
|
-
const helmSeq = srcCol.get(rowIndex);
|
|
84
|
-
if (helmSeq) {
|
|
85
|
-
(bodies[clusterId] ??= []).push({ID: rowIndex.toString(), HELM: helmSeq});
|
|
86
|
-
(clusterIndexes[clusterCategoryIdx] ??= []).push(rowIndex);
|
|
87
|
-
}
|
|
76
|
+
const categories = clustersCol.categories;
|
|
77
|
+
const data = clustersCol.getRawData();
|
|
78
|
+
const bodies: PepseaBodyUnit[][] = new Array(categories.length);
|
|
79
|
+
const clusterIndexes: number[][] = new Array(categories.length);
|
|
80
|
+
|
|
81
|
+
const rows = onlySelected ? selectedRows(table.selection) : allRows(rowCount);
|
|
82
|
+
for (const rowIndex of rows) {
|
|
83
|
+
const catIdx = data[rowIndex];
|
|
84
|
+
if (!categories[catIdx]) continue;
|
|
85
|
+
const helmSeq = srcCol.get(rowIndex);
|
|
86
|
+
if (helmSeq) {
|
|
87
|
+
(bodies[catIdx] ??= []).push({ID: rowIndex.toString(), HELM: helmSeq});
|
|
88
|
+
(clusterIndexes[catIdx] ??= []).push(rowIndex);
|
|
88
89
|
}
|
|
89
90
|
}
|
|
90
|
-
checkForSingleSeqClusters(clusterIndexes,
|
|
91
|
-
|
|
92
|
-
const alignedSequences: string[] = new Array(
|
|
93
|
-
for (const body of bodies) {
|
|
94
|
-
if (!body || body.length === 0)
|
|
95
|
-
|
|
96
|
-
const
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
for (const alignment of alignments) { // filling alignedSequencesCol
|
|
100
|
-
alignedSequences[parseInt(alignment.ID)] = Object.entries(alignment)
|
|
101
|
-
.filter((v) => !alignmentObjectMetaKeys.includes(v[0]))
|
|
102
|
-
.map((v) => v[1] !== '-' ? v[1] : '')
|
|
103
|
-
.join(C.PEPSEA.SEPARATOR);
|
|
104
|
-
}
|
|
91
|
+
checkForSingleSeqClusters(clusterIndexes, categories);
|
|
92
|
+
|
|
93
|
+
const alignedSequences: string[] = new Array(rowCount).fill(null);
|
|
94
|
+
for (const body of bodies) {
|
|
95
|
+
if (!body || body.length === 0) continue;
|
|
96
|
+
const response = await requestAlignedObjects(container.id, body, method, gapOpen, gapExtend, logger);
|
|
97
|
+
for (const alignment of response.Alignment)
|
|
98
|
+
alignedSequences[parseInt(alignment.ID)] = extractAlignedSequence(alignment);
|
|
105
99
|
}
|
|
106
100
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
101
|
+
return createPepseaResultColumn(unUsedName, alignedSequences);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
// --- Helpers ---
|
|
106
|
+
|
|
107
|
+
function extractAlignedSequence(alignment: PepseaResponse['Alignment'][0]): string {
|
|
108
|
+
return Object.entries(alignment)
|
|
109
|
+
.filter(([key]) => !ALIGNMENT_META_KEYS.includes(key))
|
|
110
|
+
.map(([, val]) => val !== '-' ? val : '')
|
|
111
|
+
.join(C.PEPSEA.SEPARATOR);
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
function parseAlignmentResponse(response: PepseaResponse, rowCount: number): string[] {
|
|
115
|
+
const aligned: string[] = new Array(rowCount).fill(null);
|
|
116
|
+
for (const alignment of response.Alignment)
|
|
117
|
+
aligned[parseInt(alignment.ID)] = extractAlignedSequence(alignment);
|
|
118
|
+
return aligned;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
function createPepseaResultColumn(name: string, sequences: string[]): DG.Column<string> {
|
|
122
|
+
const col = DG.Column.fromStrings(name, sequences);
|
|
123
|
+
col.meta.units = NOTATION.SEPARATOR;
|
|
124
|
+
col.setTag(bioTAGS.separator, C.PEPSEA.SEPARATOR);
|
|
125
|
+
col.setTag(bioTAGS.aligned, ALIGNMENT.SEQ_MSA);
|
|
126
|
+
col.setTag(bioTAGS.alphabet, ALPHABET.UN);
|
|
127
|
+
col.setTag(bioTAGS.alphabetIsMultichar, 'true');
|
|
128
|
+
col.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
129
|
+
return col;
|
|
130
|
+
}
|
|
114
131
|
|
|
115
|
-
|
|
132
|
+
function* selectedRows(selection: DG.BitSet): Generator<number> {
|
|
133
|
+
for (let i = -1; (i = selection.findNext(i, true)) !== -1;)
|
|
134
|
+
yield i;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
function* allRows(count: number): Generator<number> {
|
|
138
|
+
for (let i = 0; i < count; i++)
|
|
139
|
+
yield i;
|
|
116
140
|
}
|
|
117
141
|
|
|
118
142
|
async function requestAlignedObjects(
|
|
119
|
-
dockerfileId: string, body: PepseaBodyUnit[], method: string,
|
|
143
|
+
dockerfileId: string, body: PepseaBodyUnit[], method: string,
|
|
144
|
+
gapOpen: number, gapExtend: number, logger?: ILogger,
|
|
120
145
|
): Promise<PepseaResponse> {
|
|
121
146
|
const params = {
|
|
122
147
|
method: 'POST',
|
|
@@ -124,37 +149,34 @@ async function requestAlignedObjects(
|
|
|
124
149
|
body: JSON.stringify(body),
|
|
125
150
|
};
|
|
126
151
|
const path = `/align?method=${method}&gap_open=${gapOpen}&gap_extend=${gapExtend}`;
|
|
127
|
-
|
|
128
|
-
const t1
|
|
152
|
+
|
|
153
|
+
const t1 = window.performance.now();
|
|
129
154
|
// @ts-ignore
|
|
130
155
|
const response: Response = await grok.dapi.docker.dockerContainers.fetchProxy(dockerfileId, path, params);
|
|
131
|
-
const t2
|
|
132
|
-
_package.logger.debug(`Bio: requestAlignedObjects()
|
|
133
|
-
|
|
134
|
-
const
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
if (
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
throw new Error(
|
|
146
|
-
} else if (!response.ok && !isJson) {
|
|
147
|
-
const responseStr = await response.text();
|
|
148
|
-
throw new Error(`Error: ${responseStr}`);
|
|
149
|
-
} else if (!isJson) {
|
|
150
|
-
const responseStr = await response.text();
|
|
151
|
-
throw new Error(`Error: PepSeA expected JSON response, got '${responseStr}'.`);
|
|
156
|
+
const t2 = window.performance.now();
|
|
157
|
+
_package.logger.debug(`Bio: requestAlignedObjects() ET: ${(t2 - t1)} ms`);
|
|
158
|
+
|
|
159
|
+
const contentType = response.headers.get('content-type');
|
|
160
|
+
const isJson = contentType === 'application/json';
|
|
161
|
+
|
|
162
|
+
if (!response.ok) {
|
|
163
|
+
if (isJson) {
|
|
164
|
+
const json = await response.json();
|
|
165
|
+
if (json['pepsea-error']) throw new Error(`PepSeA error: ${json['pepsea-error']}`);
|
|
166
|
+
if (json['datagrok-error']) throw new Error(`Datagrok error: ${json['datagrok-error']}`);
|
|
167
|
+
throw new Error(response.statusText);
|
|
168
|
+
}
|
|
169
|
+
const text = await response.text();
|
|
170
|
+
throw new Error(`Error: ${text}`);
|
|
152
171
|
}
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
logger?.warning(pepseaStdErr);
|
|
172
|
+
|
|
173
|
+
if (!isJson) {
|
|
174
|
+
const text = await response.text();
|
|
175
|
+
throw new Error(`Error: PepSeA expected JSON response, got '${text}'.`);
|
|
158
176
|
}
|
|
177
|
+
|
|
178
|
+
const responseObj = await response.json();
|
|
179
|
+
if ('pepsea-stderr' in responseObj)
|
|
180
|
+
logger?.warning(responseObj['pepsea-stderr'] as string);
|
|
159
181
|
return responseObj as PepseaResponse;
|
|
160
182
|
}
|
package/src/utils/types.ts
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import * as DG from 'datagrok-api/dg';
|
|
2
|
-
import {pepseaMethods} from './pepsea';
|
|
3
2
|
|
|
4
3
|
export type DataFrameDict = { [key: string]: DG.DataFrame };
|
|
5
4
|
|
|
@@ -14,7 +13,11 @@ export type SubstitutionsInfo = Map<string, Map<string, Map<number, number[] | U
|
|
|
14
13
|
export type SelectionObject = { [position: string]: string[] };
|
|
15
14
|
|
|
16
15
|
export type MultipleSequenceAlignmentUIOptions = {
|
|
17
|
-
col?: DG.Column<string> | null
|
|
18
|
-
|
|
19
|
-
kalign?: { gapOpen?: number
|
|
16
|
+
col?: DG.Column<string> | null;
|
|
17
|
+
clustersCol?: DG.Column | null;
|
|
18
|
+
kalign?: { gapOpen?: number; gapExtend?: number; terminalGap?: number };
|
|
19
|
+
/** Name of the MSA engine function to use for non-canonical sequences */
|
|
20
|
+
engine?: string;
|
|
21
|
+
/** Engine-specific parameters (e.g. method, gapOpen, gapExtend) */
|
|
22
|
+
engineParams?: Record<string, any>;
|
|
20
23
|
};
|