@datagrok/bio 2.26.8 → 2.27.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,5 @@
1
- /* eslint-disable max-params */
2
1
  /* Do not change these import lines to match external modules in webpack configuration */
3
2
  import * as grok from 'datagrok-api/grok';
4
- import * as ui from 'datagrok-api/ui';
5
3
  import * as DG from 'datagrok-api/dg';
6
4
 
7
5
  import {NOTATION, TAGS as bioTAGS, ALIGNMENT, ALPHABET} from '@datagrok-libraries/bio/src/utils/macromolecule';
@@ -9,9 +7,22 @@ import {ILogger} from '@datagrok-libraries/bio/src/utils/logger';
9
7
 
10
8
  import {checkForSingleSeqClusters} from './multiple-sequence-alignment';
11
9
  import * as C from './constants';
12
-
13
10
  import {_package} from '../package';
14
11
 
12
+ export const pepseaMethods = ['mafft --auto', 'mafft', 'linsi', 'ginsi', 'einsi', 'fftns', 'fftnsi', 'nwns', 'nwnsi'];
13
+
14
+ const ALIGNMENT_META_KEYS = ['AlignedSeq', 'AlignedSubpeptide', 'HELM', 'ID', 'PolymerID'];
15
+
16
+ type PepseaResponse = {
17
+ Alignment: {
18
+ PolymerID: string; AlignedSubpeptide: string; HELM: string;
19
+ ID: string; AlignedSeq: string; [key: string]: string;
20
+ }[];
21
+ AlignmentScore: {[key: string]: number | null};
22
+ };
23
+
24
+ type PepseaBodyUnit = {ID: string; HELM: string};
25
+
15
26
  export const Pepsea = new class {
16
27
  public readonly dcName: string = 'bio';
17
28
 
@@ -20,103 +31,117 @@ export const Pepsea = new class {
20
31
  }
21
32
  }();
22
33
 
23
- export const pepseaMethods = ['mafft --auto', 'mafft', 'linsi', 'ginsi', 'einsi', 'fftns', 'fftnsi', 'nwns', 'nwnsi'];
24
- const alignmentObjectMetaKeys = ['AlignedSeq', 'AlignedSubpeptide', 'HELM', 'ID', 'PolymerID'];
25
- type PepseaResponse = {
26
- Alignment: {
27
- PolymerID: string, AlignedSubpeptide: string, HELM: string, ID: string, AlignedSeq: string, [key: string]: string,
28
- }[],
29
- AlignmentScore: { [key: string]: number | null },
30
- };
31
- type PepseaBodyUnit = { ID: string, HELM: string };
32
-
33
- /** Gets the column containing MSA sequences produced by the 'PepSeA' tool from the {@link srcCol} column.
34
- * Does not add the result column to the dataframe of {@link srcCol}.
35
- * @async
36
- * @param {DG.Column} srcCol - The column containing the sequences to be aligned.
37
- * @param {string} unUsedName - The name of the result column.
38
- * @param {string} method - The method used for alignment.
39
- * @param {number} gapOpen - The gap open penalty.
40
- * @param {number} gapExtend - The gap extension penalty.
41
- * @param {DG.Column} clustersCol - The column containing the clusters of the sequences.
42
- * @param logger {ILogger} Logger
43
- */
44
- export async function runPepsea(table: DG.DataFrame, srcCol: DG.Column<string>, unUsedName: string,
34
+
35
+ /** Aligns all sequences in the column using PepSeA Docker container.
36
+ * Does not handle clustering - aligns all rows as a single group.
37
+ * Used by the registered sequenceMSA function. */
38
+ export async function alignWithPepsea(
39
+ srcCol: DG.Column<string>,
40
+ method: string = 'mafft --auto',
41
+ gapOpen: number = 1.53,
42
+ gapExtend: number = 0,
43
+ ): Promise<DG.Column<string>> {
44
+ const container = await Pepsea.getDockerContainer();
45
+ const rowCount = srcCol.length;
46
+
47
+ const body: PepseaBodyUnit[] = [];
48
+ for (let i = 0; i < rowCount; i++) {
49
+ const seq = srcCol.get(i);
50
+ if (seq)
51
+ body.push({ID: i.toString(), HELM: seq});
52
+ }
53
+
54
+ const response = await requestAlignedObjects(container.id, body, method, gapOpen, gapExtend);
55
+ const aligned = parseAlignmentResponse(response, rowCount);
56
+
57
+ const colName = srcCol.dataFrame?.columns?.getUnusedName(`msa(${srcCol.name})`) ?? `msa(${srcCol.name})`;
58
+ return createPepseaResultColumn(colName, aligned);
59
+ }
60
+
61
+
62
+ /** Aligns sequences with PepSeA, supporting per-cluster alignment.
63
+ * Used by tests and legacy code paths. */
64
+ export async function runPepsea(
65
+ table: DG.DataFrame, srcCol: DG.Column<string>, unUsedName: string,
45
66
  method: typeof pepseaMethods[number] = 'ginsi', gapOpen: number = 1.53, gapExtend: number = 0.0,
46
- clustersCol: DG.Column<string | number> | null = null, logger?: ILogger, onlySelected: boolean = false
67
+ clustersCol: DG.Column<string | number> | null = null, logger?: ILogger, onlySelected: boolean = false,
47
68
  ): Promise<DG.Column<string>> {
48
- const pepseaContainer = await Pepsea.getDockerContainer();
49
- const peptideCount = srcCol.length;
50
- clustersCol ??= DG.Column.int('Clusters', peptideCount).init(0);
51
- if (clustersCol.type != DG.COLUMN_TYPE.STRING)
69
+ const container = await Pepsea.getDockerContainer();
70
+ const rowCount = srcCol.length;
71
+
72
+ clustersCol ??= DG.Column.int('Clusters', rowCount).init(0);
73
+ if (clustersCol.type !== DG.COLUMN_TYPE.STRING)
52
74
  clustersCol = clustersCol.convertTo(DG.TYPE.STRING);
53
75
 
54
- const clustersColCategories = clustersCol.categories;
55
- const clustersColData = clustersCol.getRawData();
56
- const bodies: PepseaBodyUnit[][] = new Array(clustersColCategories.length);
57
- const clusterIndexes: number[][] = new Array(clustersColCategories.length);
58
-
59
- // Grouping data by clusters
60
- if (!onlySelected) {
61
- for (let rowIndex = 0; rowIndex < peptideCount; ++rowIndex) {
62
- const clusterCategoryIdx = clustersColData[rowIndex];
63
- const cluster = clustersColCategories[clusterCategoryIdx];
64
- if (!cluster)
65
- continue;
66
-
67
- const clusterId = clusterCategoryIdx;
68
- const helmSeq = srcCol.get(rowIndex);
69
- if (helmSeq) {
70
- (bodies[clusterId] ??= []).push({ID: rowIndex.toString(), HELM: helmSeq});
71
- (clusterIndexes[clusterCategoryIdx] ??= []).push(rowIndex);
72
- }
73
- }
74
- } else {
75
- const selection = table.selection;
76
- for (let rowIndex = -1; (rowIndex = selection.findNext(rowIndex, true)) !== -1;) {
77
- const clusterCategoryIdx = clustersColData[rowIndex];
78
- const cluster = clustersColCategories[clusterCategoryIdx];
79
- if (!cluster)
80
- continue;
81
-
82
- const clusterId = clusterCategoryIdx;
83
- const helmSeq = srcCol.get(rowIndex);
84
- if (helmSeq) {
85
- (bodies[clusterId] ??= []).push({ID: rowIndex.toString(), HELM: helmSeq});
86
- (clusterIndexes[clusterCategoryIdx] ??= []).push(rowIndex);
87
- }
76
+ const categories = clustersCol.categories;
77
+ const data = clustersCol.getRawData();
78
+ const bodies: PepseaBodyUnit[][] = new Array(categories.length);
79
+ const clusterIndexes: number[][] = new Array(categories.length);
80
+
81
+ const rows = onlySelected ? selectedRows(table.selection) : allRows(rowCount);
82
+ for (const rowIndex of rows) {
83
+ const catIdx = data[rowIndex];
84
+ if (!categories[catIdx]) continue;
85
+ const helmSeq = srcCol.get(rowIndex);
86
+ if (helmSeq) {
87
+ (bodies[catIdx] ??= []).push({ID: rowIndex.toString(), HELM: helmSeq});
88
+ (clusterIndexes[catIdx] ??= []).push(rowIndex);
88
89
  }
89
90
  }
90
- checkForSingleSeqClusters(clusterIndexes, clustersColCategories);
91
-
92
- const alignedSequences: string[] = new Array(peptideCount).fill(null);
93
- for (const body of bodies) { // getting aligned sequences for each cluster
94
- if (!body || body.length === 0)
95
- continue;
96
- const alignedObject = await requestAlignedObjects(pepseaContainer.id, body, method, gapOpen, gapExtend, logger);
97
- const alignments = alignedObject.Alignment;
98
-
99
- for (const alignment of alignments) { // filling alignedSequencesCol
100
- alignedSequences[parseInt(alignment.ID)] = Object.entries(alignment)
101
- .filter((v) => !alignmentObjectMetaKeys.includes(v[0]))
102
- .map((v) => v[1] !== '-' ? v[1] : '')
103
- .join(C.PEPSEA.SEPARATOR);
104
- }
91
+ checkForSingleSeqClusters(clusterIndexes, categories);
92
+
93
+ const alignedSequences: string[] = new Array(rowCount).fill(null);
94
+ for (const body of bodies) {
95
+ if (!body || body.length === 0) continue;
96
+ const response = await requestAlignedObjects(container.id, body, method, gapOpen, gapExtend, logger);
97
+ for (const alignment of response.Alignment)
98
+ alignedSequences[parseInt(alignment.ID)] = extractAlignedSequence(alignment);
105
99
  }
106
100
 
107
- const alignedSequencesCol: DG.Column<string> = DG.Column.fromStrings(unUsedName, alignedSequences);
108
- alignedSequencesCol.meta.units = NOTATION.SEPARATOR;
109
- alignedSequencesCol.setTag(bioTAGS.separator, C.PEPSEA.SEPARATOR);
110
- alignedSequencesCol.setTag(bioTAGS.aligned, ALIGNMENT.SEQ_MSA);
111
- alignedSequencesCol.setTag(bioTAGS.alphabet, ALPHABET.UN);
112
- alignedSequencesCol.setTag(bioTAGS.alphabetIsMultichar, 'true');
113
- alignedSequencesCol.semType = DG.SEMTYPE.MACROMOLECULE;
101
+ return createPepseaResultColumn(unUsedName, alignedSequences);
102
+ }
103
+
104
+
105
+ // --- Helpers ---
106
+
107
+ function extractAlignedSequence(alignment: PepseaResponse['Alignment'][0]): string {
108
+ return Object.entries(alignment)
109
+ .filter(([key]) => !ALIGNMENT_META_KEYS.includes(key))
110
+ .map(([, val]) => val !== '-' ? val : '')
111
+ .join(C.PEPSEA.SEPARATOR);
112
+ }
113
+
114
+ function parseAlignmentResponse(response: PepseaResponse, rowCount: number): string[] {
115
+ const aligned: string[] = new Array(rowCount).fill(null);
116
+ for (const alignment of response.Alignment)
117
+ aligned[parseInt(alignment.ID)] = extractAlignedSequence(alignment);
118
+ return aligned;
119
+ }
120
+
121
+ function createPepseaResultColumn(name: string, sequences: string[]): DG.Column<string> {
122
+ const col = DG.Column.fromStrings(name, sequences);
123
+ col.meta.units = NOTATION.SEPARATOR;
124
+ col.setTag(bioTAGS.separator, C.PEPSEA.SEPARATOR);
125
+ col.setTag(bioTAGS.aligned, ALIGNMENT.SEQ_MSA);
126
+ col.setTag(bioTAGS.alphabet, ALPHABET.UN);
127
+ col.setTag(bioTAGS.alphabetIsMultichar, 'true');
128
+ col.semType = DG.SEMTYPE.MACROMOLECULE;
129
+ return col;
130
+ }
114
131
 
115
- return alignedSequencesCol;
132
+ function* selectedRows(selection: DG.BitSet): Generator<number> {
133
+ for (let i = -1; (i = selection.findNext(i, true)) !== -1;)
134
+ yield i;
135
+ }
136
+
137
+ function* allRows(count: number): Generator<number> {
138
+ for (let i = 0; i < count; i++)
139
+ yield i;
116
140
  }
117
141
 
118
142
  async function requestAlignedObjects(
119
- dockerfileId: string, body: PepseaBodyUnit[], method: string, gapOpen: number, gapExtend: number, logger?: ILogger
143
+ dockerfileId: string, body: PepseaBodyUnit[], method: string,
144
+ gapOpen: number, gapExtend: number, logger?: ILogger,
120
145
  ): Promise<PepseaResponse> {
121
146
  const params = {
122
147
  method: 'POST',
@@ -124,37 +149,34 @@ async function requestAlignedObjects(
124
149
  body: JSON.stringify(body),
125
150
  };
126
151
  const path = `/align?method=${method}&gap_open=${gapOpen}&gap_extend=${gapExtend}`;
127
- // new dockerContainers API
128
- const t1: number = window.performance.now();
152
+
153
+ const t1 = window.performance.now();
129
154
  // @ts-ignore
130
155
  const response: Response = await grok.dapi.docker.dockerContainers.fetchProxy(dockerfileId, path, params);
131
- const t2: number = window.performance.now();
132
- _package.logger.debug(`Bio: requestAlignedObjects() dockerContainers.fetchProxy(), ET: ${(t2 - t1)} ms`);
133
- const responseContentType = response.headers.get('content-type');
134
- const isJson: boolean = responseContentType === 'application/json';
135
- if (!response.ok && isJson) {
136
- const responseJson = await response.json();
137
- const pepseaErrorMsg = responseJson['pepsea-error'];
138
- if (!!pepseaErrorMsg)
139
- throw new Error(`PepSeA error: ${pepseaErrorMsg}`);
140
-
141
- const datagrokErrorMsg = responseJson['datagrok-error'];
142
- if (!!datagrokErrorMsg)
143
- throw new Error(`Datagrok error: ${datagrokErrorMsg}`);
144
-
145
- throw new Error(response.statusText);
146
- } else if (!response.ok && !isJson) {
147
- const responseStr = await response.text();
148
- throw new Error(`Error: ${responseStr}`);
149
- } else if (!isJson) {
150
- const responseStr = await response.text();
151
- throw new Error(`Error: PepSeA expected JSON response, got '${responseStr}'.`);
156
+ const t2 = window.performance.now();
157
+ _package.logger.debug(`Bio: requestAlignedObjects() ET: ${(t2 - t1)} ms`);
158
+
159
+ const contentType = response.headers.get('content-type');
160
+ const isJson = contentType === 'application/json';
161
+
162
+ if (!response.ok) {
163
+ if (isJson) {
164
+ const json = await response.json();
165
+ if (json['pepsea-error']) throw new Error(`PepSeA error: ${json['pepsea-error']}`);
166
+ if (json['datagrok-error']) throw new Error(`Datagrok error: ${json['datagrok-error']}`);
167
+ throw new Error(response.statusText);
168
+ }
169
+ const text = await response.text();
170
+ throw new Error(`Error: ${text}`);
152
171
  }
153
- const responseObj = await response.json();
154
- // Check for pepsea stderr output
155
- if ('pepsea-stderr' in responseObj) {
156
- const pepseaStdErr: string = responseObj['pepsea-stderr'] as string;
157
- logger?.warning(pepseaStdErr);
172
+
173
+ if (!isJson) {
174
+ const text = await response.text();
175
+ throw new Error(`Error: PepSeA expected JSON response, got '${text}'.`);
158
176
  }
177
+
178
+ const responseObj = await response.json();
179
+ if ('pepsea-stderr' in responseObj)
180
+ logger?.warning(responseObj['pepsea-stderr'] as string);
159
181
  return responseObj as PepseaResponse;
160
182
  }
@@ -1,5 +1,4 @@
1
1
  import * as DG from 'datagrok-api/dg';
2
- import {pepseaMethods} from './pepsea';
3
2
 
4
3
  export type DataFrameDict = { [key: string]: DG.DataFrame };
5
4
 
@@ -14,7 +13,11 @@ export type SubstitutionsInfo = Map<string, Map<string, Map<number, number[] | U
14
13
  export type SelectionObject = { [position: string]: string[] };
15
14
 
16
15
  export type MultipleSequenceAlignmentUIOptions = {
17
- col?: DG.Column<string> | null, clustersCol?: DG.Column | null,
18
- pepsea?: { method?: typeof pepseaMethods[number], gapOpen?: number, gapExtend?: number },
19
- kalign?: { gapOpen?: number, gapExtend?: number, terminalGap?: number }
16
+ col?: DG.Column<string> | null;
17
+ clustersCol?: DG.Column | null;
18
+ kalign?: { gapOpen?: number; gapExtend?: number; terminalGap?: number };
19
+ /** Name of the MSA engine function to use for non-canonical sequences */
20
+ engine?: string;
21
+ /** Engine-specific parameters (e.g. method, gapOpen, gapExtend) */
22
+ engineParams?: Record<string, any>;
20
23
  };