@datagrok-libraries/bio 5.39.29 → 5.40.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -0
- package/package.json +2 -2
- package/src/monomer-works/monomer-utils.d.ts.map +1 -1
- package/src/monomer-works/monomer-utils.js +34 -31
- package/src/monomer-works/monomer-utils.js.map +1 -1
- package/src/monomer-works/to-atomic-level.d.ts +4 -4
- package/src/monomer-works/to-atomic-level.d.ts.map +1 -1
- package/src/monomer-works/to-atomic-level.js +37 -38
- package/src/monomer-works/to-atomic-level.js.map +1 -1
- package/src/utils/cell-renderer-monomer-placer.d.ts +2 -4
- package/src/utils/cell-renderer-monomer-placer.d.ts.map +1 -1
- package/src/utils/cell-renderer-monomer-placer.js +13 -16
- package/src/utils/cell-renderer-monomer-placer.js.map +1 -1
- package/src/utils/cell-renderer.d.ts +2 -3
- package/src/utils/cell-renderer.d.ts.map +1 -1
- package/src/utils/cell-renderer.js +9 -8
- package/src/utils/cell-renderer.js.map +1 -1
- package/src/utils/fasta-handler.js +2 -2
- package/src/utils/fasta-handler.js.map +1 -1
- package/src/utils/macromolecule/alignment.d.ts +4 -3
- package/src/utils/macromolecule/alignment.d.ts.map +1 -1
- package/src/utils/macromolecule/alignment.js +25 -18
- package/src/utils/macromolecule/alignment.js.map +1 -1
- package/src/utils/macromolecule/consts.d.ts +2 -0
- package/src/utils/macromolecule/consts.d.ts.map +1 -1
- package/src/utils/macromolecule/consts.js +2 -0
- package/src/utils/macromolecule/consts.js.map +1 -1
- package/src/utils/macromolecule/index.d.ts +1 -1
- package/src/utils/macromolecule/index.d.ts.map +1 -1
- package/src/utils/macromolecule/index.js +1 -1
- package/src/utils/macromolecule/index.js.map +1 -1
- package/src/utils/macromolecule/scoring.d.ts +1 -1
- package/src/utils/macromolecule/scoring.d.ts.map +1 -1
- package/src/utils/macromolecule/scoring.js +7 -5
- package/src/utils/macromolecule/scoring.js.map +1 -1
- package/src/utils/macromolecule/types.d.ts +14 -2
- package/src/utils/macromolecule/types.d.ts.map +1 -1
- package/src/utils/macromolecule/types.js +2 -0
- package/src/utils/macromolecule/types.js.map +1 -1
- package/src/utils/macromolecule/utils.d.ts +30 -12
- package/src/utils/macromolecule/utils.d.ts.map +1 -1
- package/src/utils/macromolecule/utils.js +81 -40
- package/src/utils/macromolecule/utils.js.map +1 -1
- package/src/utils/{units-handler.d.ts → seq-handler.d.ts} +45 -24
- package/src/utils/seq-handler.d.ts.map +1 -0
- package/src/utils/{units-handler.js → seq-handler.js} +293 -211
- package/src/utils/seq-handler.js.map +1 -0
- package/src/utils/splitter.d.ts.map +1 -1
- package/src/utils/splitter.js +8 -11
- package/src/utils/splitter.js.map +1 -1
- package/src/utils/units-handler.d.ts.map +0 -1
- package/src/utils/units-handler.js.map +0 -1
|
@@ -1,17 +1,19 @@
|
|
|
1
1
|
import * as DG from 'datagrok-api/dg';
|
|
2
2
|
import wu from 'wu';
|
|
3
|
-
import { NOTATION, candidateAlphabets, positionSeparator } from './macromolecule';
|
|
4
|
-
import {
|
|
3
|
+
import { NOTATION, candidateAlphabets, positionSeparator, splitterAsFasta, getSplitterWithSeparator, splitterAsHelm, } from './macromolecule';
|
|
4
|
+
import { GAP_SYMBOL, } from './macromolecule/types';
|
|
5
|
+
import { detectAlphabet, splitterAsFastaSimple, StringListSeqSplitted } from './macromolecule/utils';
|
|
5
6
|
import { mmDistanceFunctions, MmDistanceFunctionsNames } from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
6
7
|
import { getMonomerLibHelper } from '../monomer-works/monomer-utils';
|
|
7
8
|
import { HELM_WRAPPERS_REGEXP, PHOSPHATE_SYMBOL } from './const';
|
|
8
|
-
export const
|
|
9
|
+
export const SeqTemps = new class {
|
|
9
10
|
constructor() {
|
|
10
|
-
/** Column's temp slot name for a
|
|
11
|
-
this.
|
|
11
|
+
/** Column's temp slot name for a SeqHandler object */
|
|
12
|
+
this.seqHandler = `seq-handler`;
|
|
13
|
+
this.notationProvider = `seq-handler.notation-provider`;
|
|
12
14
|
}
|
|
13
15
|
}();
|
|
14
|
-
export const
|
|
16
|
+
export const GapOriginals = {
|
|
15
17
|
[NOTATION.FASTA]: '-',
|
|
16
18
|
[NOTATION.SEPARATOR]: '',
|
|
17
19
|
[NOTATION.HELM]: '*',
|
|
@@ -19,12 +21,67 @@ export const GapSymbols = {
|
|
|
19
21
|
/** Class for handling notation units in Macromolecule columns and
|
|
20
22
|
* conversion of notation systems in Macromolecule columns
|
|
21
23
|
*/
|
|
22
|
-
export class
|
|
24
|
+
export class SeqHandler {
|
|
25
|
+
constructor(col) {
|
|
26
|
+
this._splitter = null;
|
|
27
|
+
this.cached = true;
|
|
28
|
+
this._splitted = null;
|
|
29
|
+
this.columnVersion = null;
|
|
30
|
+
this._stats = null;
|
|
31
|
+
this._maxLength = null;
|
|
32
|
+
this._posList = null;
|
|
33
|
+
this._joiner = undefined;
|
|
34
|
+
if (col.type !== DG.TYPE.STRING)
|
|
35
|
+
throw new Error(`Unexpected column type '${col.type}', must be '${DG.TYPE.STRING}'.`);
|
|
36
|
+
this._column = col;
|
|
37
|
+
this._columnVersion = col.version;
|
|
38
|
+
const units = this._column.getTag(DG.TAGS.UNITS);
|
|
39
|
+
if (units !== null && units !== undefined)
|
|
40
|
+
this._units = units;
|
|
41
|
+
else
|
|
42
|
+
throw new Error('Units are not specified in column');
|
|
43
|
+
this._notation = this.getNotation();
|
|
44
|
+
this._defaultGapOriginal = (this.isFasta()) ? GapOriginals[NOTATION.FASTA] :
|
|
45
|
+
(this.isHelm()) ? GapOriginals[NOTATION.HELM] :
|
|
46
|
+
GapOriginals[NOTATION.SEPARATOR];
|
|
47
|
+
if (!this.column.tags.has("aligned" /* TAGS.aligned */) || !this.column.tags.has("alphabet" /* TAGS.alphabet */) ||
|
|
48
|
+
(!this.column.tags.has(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */) && !this.isHelm() && this.alphabet === "UN" /* ALPHABET.UN */)) {
|
|
49
|
+
// The following detectors and setters are to be called because the column is likely
|
|
50
|
+
// as the UnitsHandler constructor was called on the column.
|
|
51
|
+
if (this.isFasta())
|
|
52
|
+
SeqHandler.setUnitsToFastaColumn(this);
|
|
53
|
+
else if (this.isSeparator()) {
|
|
54
|
+
const separator = col.getTag("separator" /* TAGS.separator */);
|
|
55
|
+
SeqHandler.setUnitsToSeparatorColumn(this, separator);
|
|
56
|
+
}
|
|
57
|
+
else if (this.isHelm())
|
|
58
|
+
SeqHandler.setUnitsToHelmColumn(this);
|
|
59
|
+
else
|
|
60
|
+
throw new Error(`Unexpected units '${this.column.getTag(DG.TAGS.UNITS)}'.`);
|
|
61
|
+
}
|
|
62
|
+
// if (!this.column.tags.has(TAGS.alphabetSize)) {
|
|
63
|
+
// if (this.isHelm())
|
|
64
|
+
// throw new Error(`For column '${this.column.name}' of notation '${this.notation}' ` +
|
|
65
|
+
// `tag '${TAGS.alphabetSize}' is mandatory.`);
|
|
66
|
+
// else if (['UN'].includes(this.alphabet))
|
|
67
|
+
// throw new Error(`For column '${this.column.name}' of alphabet '${this.alphabet}' ` +
|
|
68
|
+
// `tag '${TAGS.alphabetSize}' is mandatory.`);
|
|
69
|
+
// }
|
|
70
|
+
if (!this.column.tags.has(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */)) {
|
|
71
|
+
if (this.isHelm())
|
|
72
|
+
this.column.setTag(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */, 'true');
|
|
73
|
+
else if (['UN'].includes(this.alphabet)) {
|
|
74
|
+
throw new Error(`For column '${this.column.name}' of alphabet '${this.alphabet}' ` +
|
|
75
|
+
`tag '${".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */}' is mandatory.`);
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
this.notationProvider = this.column.temp[SeqTemps.notationProvider] ?? null;
|
|
79
|
+
}
|
|
23
80
|
static setUnitsToFastaColumn(uh) {
|
|
24
81
|
if (uh.column.semType !== DG.SEMTYPE.MACROMOLECULE || uh.column.getTag(DG.TAGS.UNITS) !== NOTATION.FASTA)
|
|
25
82
|
throw new Error(`The column of notation '${NOTATION.FASTA}' must be '${DG.SEMTYPE.MACROMOLECULE}'.`);
|
|
26
83
|
uh.column.setTag(DG.TAGS.UNITS, NOTATION.FASTA);
|
|
27
|
-
|
|
84
|
+
SeqHandler.setTags(uh);
|
|
28
85
|
}
|
|
29
86
|
static setUnitsToSeparatorColumn(uh, separator) {
|
|
30
87
|
if (uh.column.semType !== DG.SEMTYPE.MACROMOLECULE || uh.column.getTag(DG.TAGS.UNITS) !== NOTATION.SEPARATOR)
|
|
@@ -33,13 +90,13 @@ export class UnitsHandler {
|
|
|
33
90
|
throw new Error(`The column of notation '${NOTATION.SEPARATOR}' must have the separator tag.`);
|
|
34
91
|
uh.column.setTag(DG.TAGS.UNITS, NOTATION.SEPARATOR);
|
|
35
92
|
uh.column.setTag("separator" /* TAGS.separator */, separator);
|
|
36
|
-
|
|
93
|
+
SeqHandler.setTags(uh);
|
|
37
94
|
}
|
|
38
95
|
static setUnitsToHelmColumn(uh) {
|
|
39
96
|
if (uh.column.semType !== DG.SEMTYPE.MACROMOLECULE)
|
|
40
97
|
throw new Error(`The column of notation '${NOTATION.HELM}' must be '${DG.SEMTYPE.MACROMOLECULE}'`);
|
|
41
98
|
uh.column.setTag(DG.TAGS.UNITS, NOTATION.HELM);
|
|
42
|
-
|
|
99
|
+
SeqHandler.setTags(uh);
|
|
43
100
|
}
|
|
44
101
|
/** From detectMacromolecule */
|
|
45
102
|
static setTags(uh) {
|
|
@@ -69,9 +126,10 @@ export class UnitsHandler {
|
|
|
69
126
|
}
|
|
70
127
|
}
|
|
71
128
|
get column() { return this._column; }
|
|
129
|
+
get length() { return this._column.length; }
|
|
72
130
|
get units() { return this._units; }
|
|
73
131
|
get notation() { return this._notation; }
|
|
74
|
-
get
|
|
132
|
+
get defaultGapOriginal() { return this._defaultGapOriginal; }
|
|
75
133
|
get separator() {
|
|
76
134
|
const separator = this.column.getTag("separator" /* TAGS.separator */) ?? undefined;
|
|
77
135
|
if (this.notation === NOTATION.SEPARATOR && separator === undefined)
|
|
@@ -132,35 +190,57 @@ export class UnitsHandler {
|
|
|
132
190
|
else
|
|
133
191
|
return this.column.getTag(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */) === 'true';
|
|
134
192
|
}
|
|
135
|
-
/** */
|
|
136
|
-
get splitted() {
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
193
|
+
// /** */
|
|
194
|
+
// public get splitted(): ISeqSplitted[] {
|
|
195
|
+
// // TODO: Disable cache or invalidate on changing data
|
|
196
|
+
// if (this._splitted === null) {
|
|
197
|
+
// const splitter = this.splitter;
|
|
198
|
+
// const colLength: number = this._column.length;
|
|
199
|
+
// this._splitted = new Array(colLength);
|
|
200
|
+
// const catIdxList = this._column.getRawData();
|
|
201
|
+
// const catList: string[] = this._column.categories;
|
|
202
|
+
// for (let rowIdx: number = 0; rowIdx < colLength; rowIdx++) {
|
|
203
|
+
// const seq: string = catList[catIdxList[rowIdx]];
|
|
204
|
+
// this._splitted[rowIdx] = splitter(seq);
|
|
205
|
+
// }
|
|
206
|
+
// }
|
|
207
|
+
// return this._splitted;
|
|
208
|
+
// }
|
|
209
|
+
getSplitted(rowIdx) {
|
|
210
|
+
if (!this.cached) {
|
|
211
|
+
const seq = this.column.get(rowIdx);
|
|
212
|
+
return this.splitter(seq);
|
|
213
|
+
}
|
|
214
|
+
else {
|
|
215
|
+
if (this.column.version !== this.columnVersion || this._splitted === null) {
|
|
216
|
+
this.columnVersion = this.column.version;
|
|
217
|
+
this._splitted = new Array(this.column.length);
|
|
218
|
+
}
|
|
219
|
+
let resSS = this._splitted[rowIdx] ? this._splitted[rowIdx].deref() : undefined;
|
|
220
|
+
if (!resSS) {
|
|
221
|
+
const seq = this.column.get(rowIdx);
|
|
222
|
+
resSS = this.splitter(seq);
|
|
223
|
+
this._splitted[rowIdx] = new WeakRef(resSS);
|
|
146
224
|
}
|
|
225
|
+
return resSS;
|
|
147
226
|
}
|
|
148
|
-
return this._splitted;
|
|
149
227
|
}
|
|
150
228
|
get stats() {
|
|
151
229
|
if (this._stats === null) {
|
|
152
230
|
const freq = {};
|
|
153
231
|
let sameLength = true;
|
|
154
232
|
let firstLength = null;
|
|
155
|
-
|
|
233
|
+
const colLen = this.column.length;
|
|
234
|
+
for (let rowIdx = 0; rowIdx < colLen; ++rowIdx) {
|
|
235
|
+
const mSeq = this.getSplitted(rowIdx);
|
|
156
236
|
if (firstLength == null)
|
|
157
237
|
firstLength = mSeq.length;
|
|
158
238
|
else if (mSeq.length !== firstLength)
|
|
159
239
|
sameLength = false;
|
|
160
|
-
for (const
|
|
161
|
-
if (!(
|
|
162
|
-
freq[
|
|
163
|
-
freq[
|
|
240
|
+
for (const cm of mSeq.canonicals) {
|
|
241
|
+
if (!(cm in freq))
|
|
242
|
+
freq[cm] = 0;
|
|
243
|
+
freq[cm] += 1;
|
|
164
244
|
}
|
|
165
245
|
}
|
|
166
246
|
this._stats = { freq: freq, sameLength: sameLength };
|
|
@@ -169,8 +249,8 @@ export class UnitsHandler {
|
|
|
169
249
|
}
|
|
170
250
|
get maxLength() {
|
|
171
251
|
if (this._maxLength === null) {
|
|
172
|
-
this._maxLength = this.
|
|
173
|
-
Math.max(...this.
|
|
252
|
+
this._maxLength = this.column.length === 0 ? 0 :
|
|
253
|
+
Math.max(...wu.count(0).take(this.column.length).map((rowIdx) => this.getSplitted(rowIdx).length));
|
|
174
254
|
}
|
|
175
255
|
return this._maxLength;
|
|
176
256
|
}
|
|
@@ -190,9 +270,12 @@ export class UnitsHandler {
|
|
|
190
270
|
isPeptide() { return this.alphabet === "PT" /* ALPHABET.PT */; }
|
|
191
271
|
isMsa() { return this.aligned ? this.aligned.toUpperCase().includes('MSA') : false; }
|
|
192
272
|
isHelmCompatible() { return this.helmCompatible === 'true'; }
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
273
|
+
/** Checks {@link om} for being a gap
|
|
274
|
+
* @param {string} om Original monomer of sequence symbol
|
|
275
|
+
* @return {boolean}
|
|
276
|
+
*/
|
|
277
|
+
isGap(om) {
|
|
278
|
+
return !om || om === this._defaultGapOriginal;
|
|
196
279
|
}
|
|
197
280
|
/** Associate notation types with the corresponding units */
|
|
198
281
|
/**
|
|
@@ -246,7 +329,9 @@ export class UnitsHandler {
|
|
|
246
329
|
const srcAligned = col.getTag("aligned" /* TAGS.aligned */);
|
|
247
330
|
if (srcAligned)
|
|
248
331
|
newColumn.setTag("aligned" /* TAGS.aligned */, srcAligned);
|
|
249
|
-
|
|
332
|
+
let srcAlphabet = col.getTag("alphabet" /* TAGS.alphabet */);
|
|
333
|
+
if (!srcAlphabet && this.notation === NOTATION.HELM && tgtNotation !== NOTATION.HELM)
|
|
334
|
+
srcAlphabet = "UN" /* ALPHABET.UN */;
|
|
250
335
|
if (srcAlphabet != null)
|
|
251
336
|
newColumn.setTag("alphabet" /* TAGS.alphabet */, srcAlphabet);
|
|
252
337
|
let srcAlphabetSize = col.getTag(".alphabetSize" /* TAGS.alphabetSize */);
|
|
@@ -261,8 +346,9 @@ export class UnitsHandler {
|
|
|
261
346
|
}
|
|
262
347
|
return newColumn;
|
|
263
348
|
}
|
|
264
|
-
|
|
265
|
-
|
|
349
|
+
/** Creates a new column on data of {@link seqList} with the same tags */
|
|
350
|
+
getNewColumnFromList(name, seqList) {
|
|
351
|
+
return this.getNewColumn(this.notation, this.separator, name, seqList);
|
|
266
352
|
}
|
|
267
353
|
/**
|
|
268
354
|
* Create a new empty column using templateCol as a template
|
|
@@ -272,7 +358,7 @@ export class UnitsHandler {
|
|
|
272
358
|
* @return {DG.Column}
|
|
273
359
|
*/
|
|
274
360
|
static getNewColumn(templateCol) {
|
|
275
|
-
const col =
|
|
361
|
+
const col = SeqHandler.forColumn(templateCol);
|
|
276
362
|
const targetNotation = col.notation;
|
|
277
363
|
return col.getNewColumn(targetNotation);
|
|
278
364
|
}
|
|
@@ -302,7 +388,7 @@ export class UnitsHandler {
|
|
|
302
388
|
// WARNING: in this implementation is is impossible to verify the uniqueness
|
|
303
389
|
// of the new column's name
|
|
304
390
|
// TODO: verify the validity of units parameter
|
|
305
|
-
if (!
|
|
391
|
+
if (!SeqHandler.unitsStringIsValid(units))
|
|
306
392
|
throw new Error('Invalid format of \'units\' parameter');
|
|
307
393
|
const newColumn = DG.Column.fromList('string', name, new Array(len).fill(''));
|
|
308
394
|
newColumn.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
@@ -311,6 +397,10 @@ export class UnitsHandler {
|
|
|
311
397
|
}
|
|
312
398
|
/** Gets function to split seq value to monomers */
|
|
313
399
|
getSplitter(limit) {
|
|
400
|
+
let splitter = null;
|
|
401
|
+
splitter = this.notationProvider ? this.notationProvider.splitter : null;
|
|
402
|
+
if (splitter)
|
|
403
|
+
return splitter;
|
|
314
404
|
if (this.units.toLowerCase().startsWith(NOTATION.FASTA)) {
|
|
315
405
|
const alphabet = this.column.getTag("alphabet" /* TAGS.alphabet */);
|
|
316
406
|
if (alphabet !== null && !this.getAlphabetIsMultichar())
|
|
@@ -326,6 +416,9 @@ export class UnitsHandler {
|
|
|
326
416
|
throw new Error(`Unexpected units ${this.units} .`);
|
|
327
417
|
// TODO: Splitter for HELM
|
|
328
418
|
}
|
|
419
|
+
split(seq) {
|
|
420
|
+
return this.splitter(seq);
|
|
421
|
+
}
|
|
329
422
|
getDistanceFunctionName() {
|
|
330
423
|
// TODO add support for helm and separator notation
|
|
331
424
|
if (!this.isFasta())
|
|
@@ -333,10 +426,10 @@ export class UnitsHandler {
|
|
|
333
426
|
if (this.isMsa())
|
|
334
427
|
return MmDistanceFunctionsNames.HAMMING;
|
|
335
428
|
switch (this.alphabet) {
|
|
336
|
-
// As DNA and RNA scoring matrices are same as identity matrices(mostly),
|
|
337
|
-
// we can use very fast and optimized Levenshtein distance library
|
|
338
429
|
case "DNA" /* ALPHABET.DNA */:
|
|
339
430
|
case "RNA" /* ALPHABET.RNA */:
|
|
431
|
+
// As DNA and RNA scoring matrices are same as identity matrices(mostly),
|
|
432
|
+
// we can use very fast and optimized Levenshtein distance library
|
|
340
433
|
return MmDistanceFunctionsNames.LEVENSHTEIN;
|
|
341
434
|
case "PT" /* ALPHABET.PT */:
|
|
342
435
|
return MmDistanceFunctionsNames.LEVENSHTEIN;
|
|
@@ -353,7 +446,7 @@ export class UnitsHandler {
|
|
|
353
446
|
// check first for the column tag to avoid extra processing
|
|
354
447
|
if (this.column.tags.has(".isHelmCompatible" /* TAGS.isHelmCompatible */))
|
|
355
448
|
return this.column.getTag(".isHelmCompatible" /* TAGS.isHelmCompatible */) === 'true';
|
|
356
|
-
// get the
|
|
449
|
+
// get the monomer lib and check against the column
|
|
357
450
|
const monomerLibHelper = await getMonomerLibHelper();
|
|
358
451
|
const bioLib = monomerLibHelper.getBioLib();
|
|
359
452
|
// retrieve peptides
|
|
@@ -363,14 +456,21 @@ export class UnitsHandler {
|
|
|
363
456
|
// get splitter for given separator and check if all monomers are in the lib
|
|
364
457
|
const splitterFunc = getSplitterWithSeparator(this.separator);
|
|
365
458
|
// iterate over the columns, split them and check if all monomers are in the lib
|
|
366
|
-
//TODO maybe add missing
|
|
459
|
+
//TODO maybe add missing threshold so that if there are not too many missing monomers
|
|
367
460
|
// the column is still considered helm compatible
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
461
|
+
const catIdxSet = new Set();
|
|
462
|
+
const rowCount = this.column.length;
|
|
463
|
+
const colRawData = this.column.getRawData();
|
|
464
|
+
for (let rowIdx = 0; rowIdx < rowCount; ++rowIdx) {
|
|
465
|
+
const catI = colRawData[rowIdx];
|
|
466
|
+
if (!(catI in catIdxSet)) {
|
|
467
|
+
catIdxSet.add(catI);
|
|
468
|
+
const monomers = this.getSplitted(rowIdx);
|
|
469
|
+
for (const cm of monomers.canonicals) {
|
|
470
|
+
if (!peptidesSet.has(cm)) {
|
|
471
|
+
this.column.setTag(".isHelmCompatible" /* TAGS.isHelmCompatible */, 'false');
|
|
472
|
+
return false;
|
|
473
|
+
}
|
|
374
474
|
}
|
|
375
475
|
}
|
|
376
476
|
}
|
|
@@ -380,7 +480,7 @@ export class UnitsHandler {
|
|
|
380
480
|
// -- Notation Converter --
|
|
381
481
|
get splitter() {
|
|
382
482
|
if (this._splitter === null)
|
|
383
|
-
this._splitter =
|
|
483
|
+
this._splitter = this.getSplitter();
|
|
384
484
|
return this._splitter;
|
|
385
485
|
}
|
|
386
486
|
toFasta(targetNotation) { return targetNotation === NOTATION.FASTA; }
|
|
@@ -389,38 +489,38 @@ export class UnitsHandler {
|
|
|
389
489
|
/**
|
|
390
490
|
* Convert HELM string to FASTA/SEPARATOR
|
|
391
491
|
*
|
|
392
|
-
* @param {string}
|
|
492
|
+
* @param {string} srcSeq A string to be converted
|
|
393
493
|
* @param {string} tgtNotation Target notation: FASTA or SEPARATOR
|
|
394
494
|
* @param {string} tgtSeparator Optional target separator (for HELM ->
|
|
395
|
-
* @param {string | null}
|
|
495
|
+
* @param {string | null} tgtGapOriginal Optional target gap symbol
|
|
396
496
|
* SEPARATOR)
|
|
397
497
|
* @return {string} Converted string
|
|
398
498
|
*/
|
|
399
|
-
convertHelmToFastaSeparator(
|
|
400
|
-
if (!
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
499
|
+
convertHelmToFastaSeparator(srcSeq, tgtNotation, tgtSeparator, tgtGapOriginal) {
|
|
500
|
+
if (!tgtGapOriginal) {
|
|
501
|
+
tgtGapOriginal = (this.toFasta(tgtNotation)) ?
|
|
502
|
+
GapOriginals[NOTATION.FASTA] :
|
|
503
|
+
GapOriginals[NOTATION.SEPARATOR];
|
|
404
504
|
}
|
|
405
505
|
if (!tgtSeparator)
|
|
406
506
|
tgtSeparator = (this.toFasta(tgtNotation)) ? '' : this.separator;
|
|
407
|
-
const isNucleotide =
|
|
507
|
+
const isNucleotide = srcSeq.startsWith('RNA');
|
|
408
508
|
// items can be monomers or helms
|
|
409
|
-
const helmItemsArray = this.splitter(
|
|
509
|
+
const helmItemsArray = this.splitter(srcSeq);
|
|
410
510
|
const tgtMonomersArray = [];
|
|
411
|
-
for (let
|
|
412
|
-
let
|
|
511
|
+
for (let posIdx = 0; posIdx < helmItemsArray.length; ++posIdx) {
|
|
512
|
+
let om = helmItemsArray.getOriginal(posIdx);
|
|
413
513
|
if (isNucleotide)
|
|
414
|
-
|
|
415
|
-
if (
|
|
416
|
-
tgtMonomersArray.push(
|
|
417
|
-
else if (this.toFasta(tgtNotation) &&
|
|
514
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '');
|
|
515
|
+
if (om === GapOriginals[NOTATION.HELM])
|
|
516
|
+
tgtMonomersArray.push(tgtGapOriginal);
|
|
517
|
+
else if (this.toFasta(tgtNotation) && om.length > 1) {
|
|
418
518
|
// the case of a multi-character monomer converted to FASTA
|
|
419
|
-
const monomer = '[' +
|
|
519
|
+
const monomer = '[' + om + ']';
|
|
420
520
|
tgtMonomersArray.push(monomer);
|
|
421
521
|
}
|
|
422
522
|
else
|
|
423
|
-
tgtMonomersArray.push(
|
|
523
|
+
tgtMonomersArray.push(om);
|
|
424
524
|
}
|
|
425
525
|
return tgtMonomersArray.join(tgtSeparator);
|
|
426
526
|
}
|
|
@@ -431,14 +531,15 @@ export class UnitsHandler {
|
|
|
431
531
|
* @return {DG.Column} Converted column
|
|
432
532
|
*/
|
|
433
533
|
convert(tgtNotation, tgtSeparator) {
|
|
434
|
-
|
|
534
|
+
// Get joiner from the source column units handler (this) knowing about the source sequence.
|
|
535
|
+
// For example, converting DNA Helm to fasta requires removing the r(X)p decoration.
|
|
536
|
+
const joiner = this.getJoiner({ notation: tgtNotation, separator: tgtSeparator });
|
|
435
537
|
const newColumn = this.getNewColumn(tgtNotation, tgtSeparator);
|
|
436
538
|
// assign the values to the newly created empty column
|
|
437
|
-
newColumn.init((
|
|
438
|
-
const
|
|
439
|
-
return
|
|
539
|
+
newColumn.init((rowIdx) => {
|
|
540
|
+
const srcSS = this.getSplitted(rowIdx);
|
|
541
|
+
return joiner(srcSS);
|
|
440
542
|
});
|
|
441
|
-
// newColumn.setTag(DG.TAGS.UNITS, NOTATION.SEPARATOR);
|
|
442
543
|
return newColumn;
|
|
443
544
|
}
|
|
444
545
|
/**
|
|
@@ -449,20 +550,20 @@ export class UnitsHandler {
|
|
|
449
550
|
getRegion(startIdx, endIdx, name) {
|
|
450
551
|
const regCol = this.getNewColumn(this.notation, this.separator);
|
|
451
552
|
regCol.name = name;
|
|
452
|
-
const maxLength = Math.max(...this.splitted.map((seqS) => seqS.length));
|
|
453
553
|
const startIdxVal = startIdx ?? 0;
|
|
454
554
|
const endIdxVal = endIdx ?? this.maxLength - 1;
|
|
455
|
-
const
|
|
555
|
+
const joiner = this.getJoiner();
|
|
456
556
|
const regLength = endIdxVal - startIdxVal + 1;
|
|
557
|
+
const gapOM = GapOriginals[this.notation];
|
|
457
558
|
regCol.init((rowI) => {
|
|
458
|
-
const seqS = this.
|
|
559
|
+
const seqS = this.getSplitted(rowI);
|
|
459
560
|
// Custom slicing instead of array method to maintain gaps
|
|
460
|
-
const
|
|
561
|
+
const regOMList = new Array(regLength);
|
|
461
562
|
for (let regJPos = 0; regJPos < regLength; ++regJPos) {
|
|
462
563
|
const seqJPos = startIdxVal + regJPos;
|
|
463
|
-
|
|
564
|
+
regOMList[regJPos] = seqJPos < seqS.length ? seqS.getOriginal(seqJPos) : gapOM;
|
|
464
565
|
}
|
|
465
|
-
return
|
|
566
|
+
return joiner(new StringListSeqSplitted(regOMList, gapOM));
|
|
466
567
|
});
|
|
467
568
|
const getRegionOfPositionNames = (str) => {
|
|
468
569
|
const srcPosList = str.split(',').map((p) => p.trim());
|
|
@@ -481,159 +582,140 @@ export class UnitsHandler {
|
|
|
481
582
|
regCol.setTag(".positionLabels" /* TAGS.positionLabels */, getRegionOfPositionNames(srcPositionLabelsStr));
|
|
482
583
|
return regCol;
|
|
483
584
|
}
|
|
484
|
-
|
|
485
|
-
if (this._joiner
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
585
|
+
get joiner() {
|
|
586
|
+
if (!this._joiner)
|
|
587
|
+
this._joiner = this.getJoiner();
|
|
588
|
+
return this._joiner;
|
|
589
|
+
}
|
|
590
|
+
getJoiner(opts) {
|
|
591
|
+
const notation = opts ? opts.notation : this.notation;
|
|
592
|
+
const separator = opts ? opts.separator : this.separator;
|
|
593
|
+
let res;
|
|
594
|
+
const srcSh = this;
|
|
595
|
+
switch (notation) {
|
|
596
|
+
case NOTATION.FASTA: {
|
|
597
|
+
res = function (srcSS) { return srcSh.joinToFasta(srcSS, srcSh.isHelm()); };
|
|
598
|
+
break;
|
|
494
599
|
}
|
|
495
|
-
|
|
496
|
-
|
|
600
|
+
case NOTATION.SEPARATOR: {
|
|
601
|
+
if (!separator)
|
|
602
|
+
throw new Error(`Separator is mandatory for notation '${notation}'.`);
|
|
603
|
+
res = function (srcSS) { return joinToSeparator(srcSS, separator, srcSh.isHelm()); };
|
|
604
|
+
break;
|
|
605
|
+
}
|
|
606
|
+
case NOTATION.HELM: {
|
|
607
|
+
const isDnaOrRna = srcSh.alphabet === "DNA" /* ALPHABET.DNA */ || srcSh.alphabet === "RNA" /* ALPHABET.RNA */;
|
|
608
|
+
const wrappers = srcSh.getHelmWrappers();
|
|
609
|
+
res = function (srcSS) { return joinToHelm(srcSS, wrappers, isDnaOrRna); };
|
|
610
|
+
break;
|
|
611
|
+
}
|
|
612
|
+
default:
|
|
613
|
+
throw new Error(`Unexpected notation '${notation}'.`);
|
|
497
614
|
}
|
|
498
|
-
return
|
|
615
|
+
return res;
|
|
499
616
|
}
|
|
500
617
|
getConverter(tgtUnits, tgtSeparator = undefined) {
|
|
501
618
|
if (tgtUnits === NOTATION.SEPARATOR && !tgtSeparator)
|
|
502
619
|
throw new Error(`Target separator is not specified for target units '${NOTATION.SEPARATOR}'.`);
|
|
503
|
-
const
|
|
620
|
+
const srcSh = this;
|
|
504
621
|
if (tgtUnits === NOTATION.FASTA)
|
|
505
|
-
return function (
|
|
622
|
+
return function (srcSeq) { return srcSh.convertToFasta(srcSeq); };
|
|
506
623
|
if (tgtUnits === NOTATION.HELM)
|
|
507
|
-
return function (
|
|
624
|
+
return function (srcSeq) { return srcSh.convertToHelm(srcSeq); };
|
|
508
625
|
else if (tgtUnits === NOTATION.SEPARATOR)
|
|
509
|
-
return function (
|
|
626
|
+
return function (srcSeq) { return srcSh.convertToSeparator(srcSeq, tgtSeparator); };
|
|
510
627
|
else
|
|
511
628
|
throw new Error();
|
|
512
629
|
}
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
const
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
if (this.isFasta())
|
|
537
|
-
UnitsHandler.setUnitsToFastaColumn(this);
|
|
538
|
-
else if (this.isSeparator()) {
|
|
539
|
-
const separator = col.getTag("separator" /* TAGS.separator */);
|
|
540
|
-
UnitsHandler.setUnitsToSeparatorColumn(this, separator);
|
|
541
|
-
}
|
|
542
|
-
else if (this.isHelm())
|
|
543
|
-
UnitsHandler.setUnitsToHelmColumn(this);
|
|
544
|
-
else
|
|
545
|
-
throw new Error(`Unexpected units '${this.column.getTag(DG.TAGS.UNITS)}'.`);
|
|
630
|
+
/** Gets a column's UnitsHandler object from temp slot or creates a new and stores it to the temp slot. */
|
|
631
|
+
static forColumn(col) {
|
|
632
|
+
// TODO: Invalidate col.temp[Temps.uh] checking column's metadata
|
|
633
|
+
let res = col.temp[SeqTemps.seqHandler];
|
|
634
|
+
if (!res || res.columnVersion !== col.version)
|
|
635
|
+
res = col.temp[SeqTemps.seqHandler] = new SeqHandler(col);
|
|
636
|
+
return res;
|
|
637
|
+
}
|
|
638
|
+
// -- joiners & converters --
|
|
639
|
+
joinToFasta(seqS, isHelm) {
|
|
640
|
+
const resMList = new Array(seqS.length);
|
|
641
|
+
for (let posIdx = 0; posIdx < seqS.length; ++posIdx) {
|
|
642
|
+
const cm = seqS.getOriginal(posIdx);
|
|
643
|
+
let om = seqS.getOriginal(posIdx);
|
|
644
|
+
if (isHelm)
|
|
645
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
646
|
+
if (cm === GAP_SYMBOL)
|
|
647
|
+
om = GapOriginals[NOTATION.FASTA];
|
|
648
|
+
else if (cm === PHOSPHATE_SYMBOL)
|
|
649
|
+
om = '';
|
|
650
|
+
else if (om.length > 1)
|
|
651
|
+
om = '[' + om + ']';
|
|
652
|
+
resMList[posIdx] = om;
|
|
546
653
|
}
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
654
|
+
return resMList.join('');
|
|
655
|
+
}
|
|
656
|
+
convertToFasta(src) {
|
|
657
|
+
const srcUhSplitter = this.splitter;
|
|
658
|
+
const srcSS = this.isHelm() ? this.splitterAsHelmNucl(src) : srcUhSplitter(src);
|
|
659
|
+
return this.joinToFasta(srcSS, this.isHelm());
|
|
660
|
+
}
|
|
661
|
+
convertToSeparator(src, tgtSeparator) {
|
|
662
|
+
const srcSS = this.isHelm() ? this.splitterAsHelmNucl(src) : this.splitter(src);
|
|
663
|
+
return joinToSeparator(srcSS, tgtSeparator, this.isHelm());
|
|
664
|
+
}
|
|
665
|
+
convertToHelm(src) {
|
|
666
|
+
const wrappers = this.getHelmWrappers();
|
|
667
|
+
const isDnaOrRna = src.startsWith('DNA') || src.startsWith('RNA');
|
|
668
|
+
const srcSS = this.splitter(src);
|
|
669
|
+
return joinToHelm(srcSS, wrappers, isDnaOrRna);
|
|
670
|
+
}
|
|
671
|
+
/** Splits Helm sequence adjusting nucleotides to single char symbols. (!) Removes lone phosphorus. */
|
|
672
|
+
splitterAsHelmNucl(src) {
|
|
673
|
+
const srcMList = this.splitter(src);
|
|
674
|
+
const tgtMList = new Array(srcMList.length);
|
|
675
|
+
const isDna = src.startsWith('DNA');
|
|
676
|
+
const isRna = src.startsWith('RNA');
|
|
677
|
+
for (let posIdx = 0; posIdx < srcMList.length; ++posIdx) {
|
|
678
|
+
let om = srcMList.getOriginal(posIdx);
|
|
679
|
+
if (isDna || isRna) {
|
|
680
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
681
|
+
om = om === PHOSPHATE_SYMBOL ? null : om;
|
|
561
682
|
}
|
|
683
|
+
tgtMList[posIdx] = om ? om : null;
|
|
562
684
|
}
|
|
685
|
+
return new StringListSeqSplitted(tgtMList.filter((om) => !!om), GapOriginals[NOTATION.HELM]);
|
|
563
686
|
}
|
|
564
|
-
/** Gets a column's UnitsHandler object from temp slot or creates a new and stores it to the temp slot. */
|
|
565
|
-
static getOrCreate(col) {
|
|
566
|
-
let res = col.temp[Temps.uh];
|
|
567
|
-
if (!res)
|
|
568
|
-
res = col.temp[Temps.uh] = new UnitsHandler(col);
|
|
569
|
-
return res;
|
|
570
|
-
}
|
|
571
|
-
}
|
|
572
|
-
function joinToFasta(srcUh, seqS) {
|
|
573
|
-
const resMList = new Array(seqS.length);
|
|
574
|
-
for (const [srcM, mI] of wu.enumerate(seqS)) {
|
|
575
|
-
let m = srcM;
|
|
576
|
-
if (srcUh.isHelm())
|
|
577
|
-
m = srcM.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
578
|
-
if (srcUh.isGap(m))
|
|
579
|
-
m = GapSymbols[NOTATION.FASTA];
|
|
580
|
-
else if (m.length > 1)
|
|
581
|
-
m = '[' + seqS[mI] + ']';
|
|
582
|
-
resMList[mI] = m;
|
|
583
|
-
}
|
|
584
|
-
return resMList.join('');
|
|
585
687
|
}
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
return joinToFasta(srcUh, srcMList);
|
|
589
|
-
}
|
|
590
|
-
function joinToSeparator(srcUh, seqS, tgtSeparator) {
|
|
688
|
+
// -- joiners --
|
|
689
|
+
function joinToSeparator(seqS, tgtSeparator, isHelm) {
|
|
591
690
|
const resMList = new Array(seqS.length);
|
|
592
|
-
for (
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
691
|
+
for (let posIdx = 0; posIdx < seqS.length; ++posIdx) {
|
|
692
|
+
const cm = seqS.getCanonical(posIdx);
|
|
693
|
+
let om = seqS.getOriginal(posIdx);
|
|
694
|
+
if (isHelm)
|
|
695
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
696
|
+
if (cm === GAP_SYMBOL)
|
|
697
|
+
om = GapOriginals[NOTATION.SEPARATOR];
|
|
698
|
+
else if (cm === PHOSPHATE_SYMBOL)
|
|
699
|
+
om = '';
|
|
700
|
+
resMList[posIdx] = om;
|
|
701
|
+
}
|
|
702
|
+
return resMList.join(tgtSeparator);
|
|
599
703
|
}
|
|
600
|
-
function
|
|
601
|
-
const
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
else
|
|
613
|
-
m = srcM.length == 1 ? `${leftWrapper}${srcM}${rightWrapper}` : `${leftWrapper}[${srcM}]${rightWrapper}`;
|
|
614
|
-
return m;
|
|
615
|
-
}).toArray();
|
|
616
|
-
return `${prefix}${resMList.join('.')}${postfix}`;
|
|
617
|
-
}
|
|
618
|
-
function convertToHelm(srcUh, src) {
|
|
619
|
-
const isDnaOrRna = src.startsWith('DNA') || src.startsWith('RNA');
|
|
620
|
-
const srcS = srcUh.getSplitter()(src);
|
|
621
|
-
return joinToHelm(srcUh, srcS, isDnaOrRna);
|
|
622
|
-
}
|
|
623
|
-
/** Splits Helm sequence adjusting nucleotides to single char symbols. (!) Removes lone phosphorus. */
|
|
624
|
-
function splitterAsHelmNucl(srcUh, src) {
|
|
625
|
-
const srcMList = srcUh.getSplitter()(src);
|
|
626
|
-
const tgtMList = new Array(srcMList.length);
|
|
627
|
-
const isDna = src.startsWith('DNA');
|
|
628
|
-
const isRna = src.startsWith('RNA');
|
|
629
|
-
for (const [srcM, mI] of wu.enumerate(srcMList)) {
|
|
630
|
-
let m = srcM;
|
|
631
|
-
if (isDna || isRna) {
|
|
632
|
-
m = m.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
633
|
-
m = m === PHOSPHATE_SYMBOL ? null : m;
|
|
704
|
+
function joinToHelm(srcSS, wrappers, isDnaOrRna) {
|
|
705
|
+
const [prefix, leftWrapper, rightWrapper, postfix] = wrappers;
|
|
706
|
+
const resOMList = new Array(srcSS.length);
|
|
707
|
+
for (let posIdx = 0; posIdx < srcSS.length; ++posIdx) {
|
|
708
|
+
const cm = srcSS.getCanonical(posIdx);
|
|
709
|
+
let om = srcSS.getOriginal(posIdx);
|
|
710
|
+
if (cm === GAP_SYMBOL)
|
|
711
|
+
om = GapOriginals[NOTATION.HELM];
|
|
712
|
+
else {
|
|
713
|
+
if (isDnaOrRna)
|
|
714
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
715
|
+
om = om.length === 1 ? `${leftWrapper}${om}${rightWrapper}` : `${leftWrapper}[${om}]${rightWrapper}`;
|
|
634
716
|
}
|
|
635
|
-
|
|
717
|
+
resOMList[posIdx] = om;
|
|
636
718
|
}
|
|
637
|
-
return
|
|
719
|
+
return `${prefix}${resOMList.join('.')}${postfix}`;
|
|
638
720
|
}
|
|
639
|
-
//# sourceMappingURL=
|
|
721
|
+
//# sourceMappingURL=seq-handler.js.map
|